In [87]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import re

In [88]:
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [89]:
#Emoji Removal

def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)

In [90]:
#Cleaning basic Characters

def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

In [91]:
def remove_num(texts):
    output = re.sub(r'\d+', '', texts )
    return output

In [92]:
def unify_whitespaces(text):
    cleaned_string = re.sub(' +', ' ', text )
    return cleaned_string

In [93]:
def remove_punctuation(text):
    result = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"',',') )
    return result

In [94]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
nltk.download('stopwords')
from nltk.stem import PorterStemmer

stop = set(stopwords.words('english'))
#add unsubscribe to stop
stop.add('unsubscribe')
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop ]
    return ' '.join(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [95]:
file_path = './emails.csv'
lines = []
with open(file_path, 'r') as file:
    for line in file:
        if line.strip()!='""':
            line=re.sub(r'http\S+', '', line.strip())
            line=deEmojify(line)
            line=clean(line)
            line=remove_num(line)
            line=unify_whitespaces(line)
            line=remove_punctuation(line)
            line=remove_stopwords(line)
            lines.append(line)
lines=lines[1:]
print(len(lines))
print(lines[0])

780
hello innovator told could work dream company straight college yes heard right participate accenture innovation challenge win big may also get opportunity fast-track recruitment process journey accenture application link roles offered associate software engineer system application services associate rewards ❖ prizes worth inr per team member❖ fast-track interview opportunity eligibility undergraduate postgraduate students participate sought-after challenge let ideas make waves note fast-track recruitment process offered eligible (as per recruitment process criteria) participants would submit innovative ideas eligible participants receive communications recruitment process accenture best regardsteam unstop


In [96]:
corpus_embeddings = embedder.encode(lines)

In [97]:
corpus_embeddings[1]

array([ 7.87753463e-02,  8.96571755e-01,  6.46611452e-01, -6.15763426e-01,
        2.12748684e-02, -1.56943709e-01, -5.28815687e-01, -2.97446668e-01,
        1.78253531e-01, -5.10646224e-01, -4.08641338e-01,  7.22545445e-01,
        6.81334585e-02,  3.41246009e-01,  3.61969829e-01, -3.41322213e-01,
        6.17094100e-01, -9.93694961e-02, -5.43711662e-01,  1.64684743e-01,
        3.63745153e-01,  2.89227307e-01, -2.72168189e-01, -2.25215003e-01,
        3.35118651e-01,  1.30568713e-01, -1.05810925e-01,  3.51834863e-01,
        5.69331288e-01,  3.40426832e-01,  4.46420670e-01, -1.23607270e-01,
       -1.61109328e-01, -2.50078052e-01,  2.50546575e-01,  4.17265922e-01,
       -3.49227518e-01, -2.12323666e-02, -2.44773567e-01, -1.36963904e-01,
        9.68858898e-02,  7.06743449e-04,  8.79492044e-01,  2.03769952e-01,
        8.93196315e-02,  4.35422182e-01, -7.30022013e-01,  2.81150937e-01,
       -6.57905757e-01,  1.89066112e-01, -4.77365851e-02, -1.21177423e+00,
       -6.29529536e-01, -

In [98]:
# Then, we perform k-means clustering using sklearn:
from sklearn.cluster import KMeans

num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [99]:
cluster_assignment

array([3, 3, 3, 1, 0, 0, 3, 3, 1, 1, 3, 3, 1, 0, 1, 0, 0, 3, 2, 4, 3, 3,
       3, 0, 1, 0, 0, 3, 1, 2, 1, 4, 0, 3, 2, 2, 0, 3, 0, 1, 2, 2, 1, 3,
       3, 3, 1, 1, 2, 4, 3, 2, 2, 2, 1, 0, 0, 3, 4, 2, 3, 3, 1, 2, 2, 0,
       3, 3, 0, 3, 1, 3, 0, 0, 0, 0, 3, 2, 2, 2, 1, 1, 2, 1, 0, 2, 3, 1,
       1, 2, 1, 0, 3, 3, 3, 0, 3, 4, 1, 1, 3, 0, 3, 0, 3, 2, 3, 1, 2, 1,
       0, 0, 3, 3, 1, 3, 3, 0, 0, 4, 3, 4, 4, 4, 1, 2, 0, 3, 3, 0, 2, 4,
       1, 0, 1, 3, 2, 1, 1, 0, 2, 3, 3, 1, 2, 3, 3, 4, 0, 1, 3, 1, 4, 1,
       3, 1, 2, 1, 3, 3, 0, 0, 0, 3, 0, 3, 2, 1, 3, 3, 1, 3, 0, 1, 2, 3,
       2, 2, 2, 2, 4, 3, 4, 2, 1, 3, 3, 3, 3, 1, 2, 0, 2, 3, 3, 1, 0, 1,
       0, 3, 3, 2, 2, 4, 3, 4, 4, 1, 1, 1, 2, 0, 3, 1, 0, 1, 3, 1, 1, 1,
       0, 3, 3, 3, 3, 3, 0, 2, 1, 2, 3, 3, 0, 1, 4, 3, 3, 3, 3, 3, 2, 1,
       1, 1, 2, 2, 3, 3, 2, 0, 1, 4, 3, 3, 3, 0, 1, 2, 2, 3, 1, 3, 1, 1,
       3, 0, 1, 4, 4, 0, 1, 0, 1, 2, 3, 0, 3, 0, 0, 4, 1, 0, 2, 3, 2, 2,
       1, 3, 1, 2, 2, 2, 3, 2, 1, 1, 2, 1, 3, 0, 0,

In [100]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(lines[sentence_id])



In [101]:
print(clustered_sentences[0][0])

hey aayush august visit splitwise see current debts ious settle friends comments thoughts suggestions hit reply let us know great day -the splitwise team splitwise inc union st suite providence ri usa


In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the clustered sentences into separate documents for TF-IDF analysis
clustered_documents = [' '.join(cluster) for cluster in clustered_sentences]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the clustered documents
tfidf_matrix = tfidf_vectorizer.fit_transform(clustered_documents)

# Get the feature names (words) from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# For each cluster, find the top N keywords
num_keywords = 5  # You can adjust this value as needed
cluster_labels = []

for i, cluster_matrix in enumerate(tfidf_matrix):
    feature_index = cluster_matrix.toarray().argsort()[:, -num_keywords:][0]
    cluster_keywords = [feature_names[idx] for idx in feature_index]
    cluster_labels.append(cluster_keywords)

# Print the cluster labels
for i, labels in enumerate(cluster_labels):
    print(f"Cluster {i + 1} - Labels: {', '.join(labels)}")


Cluster 1 - Labels: exchange, और, आपक, broker, कर
Cluster 2 - Labels: new, answer, min, question, read
Cluster 3 - Labels: chairperson, university, campus, linkedin, pesu
Cluster 4 - Labels: learn, stipend, inr, read, ai
Cluster 5 - Labels: version, see, click, recent, message
