In [29]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import re
import nltk

In [30]:
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [31]:
#Emoji Removal

def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)

In [32]:
#Cleaning basic Characters

def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

In [33]:
def remove_num(texts):
    output = re.sub(r'\d+', '', texts )
    return output

In [34]:
def unify_whitespaces(text):
    cleaned_string = re.sub(' +', ' ', text )
    return cleaned_string

In [35]:
def remove_punctuation(text):
    result = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"',',') )
    return result

In [40]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
nltk.download('stopwords')
from nltk.stem import PorterStemmer

stop = set(stopwords.words('english'))
#add unsubscribe to stop
stop.add('unsubscribe')
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop ]
    return ' '.join(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
def preprocess(line):
    line=re.sub(r'http\S+', '', line.strip())
    line = re.sub(r'[!"#$%&\'()*+,-./:;?@[\\\]^_{|}~`]', '', line)
    line=deEmojify(line)
    line=clean(line)
    line=remove_num(line)
    line=unify_whitespaces(line)
    line=remove_punctuation(line)
    line=remove_stopwords(line)
    return line


In [43]:
file_path = './emails.csv'
lines = []
with open(file_path, 'r', encoding = 'utf-8') as file:
    for line in file:
        if line.strip()!='""':
            line = preprocess(line)
            lines.append(line)
lines=lines[1:]
print(len(lines))
print(lines[1])

780
dear innovator congratulations delighted inform cohort idea successfully reserved place prototyping stage click see result achievement fills us great satisfaction wholeheartedly celebrate progress firmly believe innovative idea potential bring meaningful change society committed supporting journey next step exciting process curated series captivating opportunities designed ensure smooth productive start prototyping phase cordially invite orientation session provide comprehensive insights prototyping stage well overview forthcoming events mentoring programs resources intended guide towards structuring advancing initiative session details date september time pm pm platform webex confirm attendance orientation session kindly click button register registration link following orientation session keep informed additional activities aimed enhancing experience prototyping stage including oneonone mentorship access resources prototype development engaging events encourage bring questions id

In [44]:
corpus_embeddings = embedder.encode(lines)

In [45]:
corpus_embeddings[1]

array([ 2.66916424e-01,  9.02335405e-01,  5.80179989e-01, -8.75559330e-01,
        1.10951059e-01, -3.26939858e-02, -3.92728060e-01, -1.14493772e-01,
        1.80045143e-01, -7.06154287e-01, -2.27512747e-01,  6.00761592e-01,
        2.77830064e-01,  3.91607672e-01,  4.30928916e-01, -5.60366929e-01,
        7.00924754e-01, -1.11849427e-01, -4.74754661e-01, -1.38677470e-03,
        5.54225504e-01,  2.42525131e-01, -2.79286474e-01, -1.36284739e-01,
        2.58695215e-01, -1.28557712e-01, -5.45142591e-03,  4.45697188e-01,
        7.07761884e-01,  5.33519626e-01,  3.38084251e-01, -2.18392953e-01,
       -1.51028946e-01, -5.36711775e-02,  3.19273062e-02,  6.05338395e-01,
       -6.91891253e-01, -1.74998432e-01, -6.46431983e-01, -1.17689759e-01,
       -1.24816768e-01, -2.24450231e-01,  8.42071354e-01, -1.33400828e-01,
       -1.63862914e-01,  4.60637540e-01, -6.32844090e-01,  2.65991718e-01,
       -7.38031566e-01,  3.89720082e-01, -2.80431807e-01, -1.04986835e+00,
       -8.12514961e-01, -

In [46]:
# Then, we perform k-means clustering using sklearn:
from sklearn.cluster import KMeans

num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [67]:
comment = 'Invest in your stocks today'
preprocessed = preprocess(comment)
test=embedder.encode([preprocessed])
clustering_model.predict(test)[0]

3

In [68]:
cluster_assignment

array([0, 0, 4, 1, 4, 4, 0, 0, 1, 0, 0, 0, 1, 3, 1, 3, 3, 0, 4, 2, 0, 0,
       4, 3, 1, 4, 3, 0, 1, 4, 1, 2, 3, 0, 4, 4, 1, 0, 3, 1, 4, 4, 1, 0,
       0, 0, 1, 1, 0, 2, 0, 4, 1, 4, 1, 4, 3, 4, 2, 4, 4, 0, 1, 4, 4, 1,
       0, 0, 4, 0, 1, 0, 1, 4, 3, 3, 0, 4, 0, 4, 1, 1, 4, 1, 3, 1, 0, 1,
       1, 4, 1, 4, 0, 0, 0, 1, 0, 2, 1, 1, 0, 4, 1, 3, 0, 0, 0, 1, 4, 1,
       4, 4, 0, 0, 1, 0, 1, 4, 1, 2, 1, 2, 2, 2, 1, 3, 4, 0, 0, 3, 4, 2,
       1, 3, 1, 0, 4, 1, 1, 4, 4, 0, 0, 1, 4, 0, 1, 2, 3, 1, 0, 1, 2, 1,
       1, 1, 3, 1, 0, 0, 4, 4, 4, 0, 4, 0, 4, 1, 0, 0, 1, 4, 3, 1, 4, 0,
       4, 4, 1, 4, 2, 0, 2, 4, 1, 1, 0, 0, 0, 1, 0, 4, 4, 0, 0, 1, 1, 3,
       4, 1, 0, 4, 4, 2, 0, 2, 2, 1, 1, 1, 4, 3, 0, 1, 1, 1, 4, 0, 1, 1,
       1, 0, 3, 0, 0, 4, 4, 1, 1, 4, 0, 0, 3, 1, 2, 0, 0, 0, 1, 0, 4, 1,
       1, 1, 3, 0, 0, 0, 4, 4, 1, 2, 0, 0, 1, 1, 1, 4, 4, 0, 1, 0, 1, 1,
       0, 3, 1, 2, 2, 1, 1, 3, 1, 4, 0, 3, 1, 3, 3, 2, 1, 3, 4, 0, 4, 4,
       1, 0, 1, 4, 1, 4, 0, 4, 1, 1, 4, 1, 0, 3, 4,

In [69]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(lines[sentence_id])



In [70]:
print(clustered_sentences[0][0])

hello innovator told could work dream company straight college yes heard right participate accenture innovation challenge win big may also get opportunity fasttrack recruitment process journey accenture application link roles offered associate software engineer system application services associate rewards ❖ prizes worth inr per team member❖ fasttrack interview opportunity eligibility undergraduate postgraduate students participate soughtafter challenge let ideas make waves note fasttrack recruitment process offered eligible per recruitment process criteria participants would submit innovative ideas eligible participants receive communications recruitment process accenture best regardsteam unstop


In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the clustered sentences into separate documents for TF-IDF analysis
clustered_documents = [' '.join(cluster) for cluster in clustered_sentences]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the clustered documents
tfidf_matrix = tfidf_vectorizer.fit_transform(clustered_documents)

# Get the feature names (words) from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# For each cluster, find the top N keywords
num_keywords = 5  # You can adjust this value as needed
cluster_labels = []

for i, cluster_indices in enumerate(tfidf_matrix):
    # Get the indices of the top N keywords
    feature_index = cluster_indices.toarray().argsort()[0, -num_keywords:]
    cluster_keywords = [feature_names[idx] for idx in feature_index]
    cluster_labels.append(cluster_keywords)

# Print the cluster labels
for i, labels in enumerate(cluster_labels):
    print(f"Cluster {i + 1} - Labels: {', '.join(labels)}")


Cluster 1 - Labels: stipend, students, read, internship, inr
Cluster 2 - Labels: new, answer, min, question, read
Cluster 3 - Labels: web, link, take, click, message
Cluster 4 - Labels: exchange, और, आपक, broker, कर
Cluster 5 - Labels: email, pes, university, linkedin, pesu


In [78]:
for i in cluster_labels:
    print(i)

['stipend', 'students', 'read', 'internship', 'inr']
['new', 'answer', 'min', 'question', 'read']
['web', 'link', 'take', 'click', 'message']
['exchange', 'और', 'आपक', 'broker', 'कर']
['email', 'pes', 'university', 'linkedin', 'pesu']
