In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import re
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [3]:
#Emoji Removal

def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)

In [4]:
#Cleaning basic Characters

def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

In [5]:
def remove_num(texts):
    output = re.sub(r'\d+', '', texts )
    return output

In [6]:
def unify_whitespaces(text):
    cleaned_string = re.sub(' +', ' ', text )
    return cleaned_string

In [7]:
def remove_punctuation(text):
    result = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"',',') )
    return result

In [43]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
nltk.download('stopwords')
from nltk.stem import PorterStemmer

stop = set(stopwords.words('english')).union({'unsubscribe', 'min', 'answer', 'question', 'please', 'email', 'web', 'see', 'link', 'click'})
#add unsubscribe to stop
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop ]
    return ' '.join(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
def preprocess(line):
    line=re.sub(r'http\S+', '', line.strip())
    line = re.sub(r'[!"#$%&\'()*+,-./:;?@[\\\]^_{|}~`]', '', line)
    line=deEmojify(line)
    line=clean(line)
    line=remove_num(line)
    line=unify_whitespaces(line)
    line=remove_punctuation(line)
    line=remove_stopwords(line)
    return line


In [45]:
file_path = './emails1.csv'
lines = []
with open(file_path, 'r', encoding = 'utf-8') as file:
    for line in file:
        if line.strip()!='""':
            line = preprocess(line)
            lines.append(line)
lines=lines[1:]
print(len(lines))
print(lines[1])

3342


In [46]:
corpus_embeddings = embedder.encode(lines)

In [47]:
corpus_embeddings[1]

array([-2.26535425e-02,  9.38225746e-01,  4.56789315e-01,  8.51018429e-02,
       -5.83606958e-02, -5.79792202e-01, -5.01058519e-01, -2.00674653e-01,
        5.04988909e-01, -7.44988441e-01, -7.70888627e-01,  9.59893107e-01,
       -2.03960165e-01, -2.09418014e-02,  1.08120605e-01,  4.50182140e-01,
        1.57556564e-01, -1.18169628e-01, -8.91938269e-01,  3.85979116e-01,
        7.82972276e-01,  3.79904211e-01, -2.92634219e-03,  4.15963009e-02,
        4.01973277e-02,  5.07716358e-01, -3.03154141e-02,  4.39522713e-02,
        5.94450355e-01,  5.27402043e-01,  1.27560997e+00, -9.97093678e-01,
       -3.34229439e-01, -1.83697104e-01,  2.66562343e-01,  9.32723880e-02,
       -9.38415706e-01,  3.61560225e-01,  7.80448854e-01, -7.45326459e-01,
       -4.01485741e-01,  6.71505928e-02,  7.50741720e-01, -2.24958047e-01,
        2.04657733e-01,  1.62237227e-01,  2.12466344e-02, -1.14057248e-03,
       -1.03113997e+00,  9.79202986e-01,  1.90775067e-01, -9.04165626e-01,
       -5.24218678e-01, -

In [67]:
# Then, we perform k-means clustering using sklearn:
from sklearn.cluster import KMeans

num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [68]:
comment = 'internship offered by company'
preprocessed = preprocess(comment)
test=embedder.encode([preprocessed])
clustering_model.predict(test)[0]

1

In [69]:
cluster_assignment

array([1, 1, 0, ..., 1, 2, 1])

In [70]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(lines[sentence_id])



In [71]:
print(clustered_sentences[0][0])

weeks theme skunk words words avoid mosey mohzee verb intr move leisurely manner leave quickly uncertain origin earliest documented use see usage examples vocabularycoms dictionary illustration anu garg ai everyone moseys hurry get anywhere sandra block happened night sourcebooks took money moseyed one stopped phil brody holden age hollywood medallion sponsored played orijinz players aged much fun im hooked orijinz fun fascinating word phrase origins game sky everything else weather pema chodron buddhist nun author b jul looking wordquotation archives change address etc pronunciation permalink


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the clustered sentences into separate documents for TF-IDF analysis
clustered_documents = [' '.join(cluster) for cluster in clustered_sentences]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the clustered documents
tfidf_matrix = tfidf_vectorizer.fit_transform(clustered_documents)

# Get the feature names (words) from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# For each cluster, find the top N keywords
num_keywords = 5  # You can adjust this value as needed
cluster_labels = []

for i, cluster_indices in enumerate(tfidf_matrix):
    # Get the indices of the top N keywords
    feature_index = cluster_indices.toarray().argsort()[0, -num_keywords:]
    cluster_keywords = [feature_names[idx] for idx in feature_index]
    cluster_labels.append(cluster_keywords)

# Print the cluster labels
for i, labels in enumerate(cluster_labels):
    print(f"Cluster {i + 1} - Labels: {', '.join(labels)}")


Cluster 1 - Labels: visit, new, one, like, read
Cluster 2 - Labels: learn, ai, data, ieee, read
Cluster 3 - Labels: aayush, pesu, pes, university, linkedin
Cluster 4 - Labels: account, आपक, और, broker, कर
Cluster 5 - Labels: version, link, take, click, message
