In [17]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import re
import nltk

In [18]:
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [19]:
#Emoji Removal

def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)

In [20]:
#Cleaning basic Characters

def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

In [21]:
def remove_num(texts):
    output = re.sub(r'\d+', '', texts )
    return output

In [22]:
def unify_whitespaces(text):
    cleaned_string = re.sub(' +', ' ', text )
    return cleaned_string

In [23]:
def remove_punctuation(text):
    result = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"',',') )
    return result

In [24]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
nltk.download('stopwords')
from nltk.stem import PorterStemmer

stop = set(stopwords.words('english'))
#add unsubscribe to stop
stop.add('unsubscribe')
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop ]
    return ' '.join(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
file_path = './emails.csv'
lines = []
with open(file_path, 'r') as file:
    for line in file:
        if line.strip()!='""':
            line=re.sub(r'http\S+', '', line.strip())
            line = re.sub(r'[!"#$%&\'()*+,-./:;?@[\\\]^_{|}~`]', '', line)
            line=deEmojify(line)
            line=clean(line)
            line=remove_num(line)
            line=unify_whitespaces(line)
            line=remove_punctuation(line)
            line=remove_stopwords(line)
            lines.append(line)
lines=lines[1:]
print(len(lines))
print(lines[1])

780
dear innovator congratulations delighted inform cohort idea successfully reserved place prototyping stage click see result achievement fills us great satisfaction wholeheartedly celebrate progress firmly believe innovative idea potential bring meaningful change society committed supporting journey next step exciting process curated series captivating opportunities designed ensure smooth productive start prototyping phase cordially invite orientation session provide comprehensive insights prototyping stage well overview forthcoming events mentoring programs resources intended guide towards structuring advancing initiative session details date september time pm pm platform webex confirm attendance orientation session kindly click button register registration link following orientation session keep informed additional activities aimed enhancing experience prototyping stage including oneonone mentorship access resources prototype development engaging events encourage bring questions id

In [26]:
corpus_embeddings = embedder.encode(lines)

In [27]:
corpus_embeddings[1]

array([ 2.66916364e-01,  9.02335823e-01,  5.80179811e-01, -8.75559211e-01,
        1.10951275e-01, -3.26939300e-02, -3.92728150e-01, -1.14493787e-01,
        1.80045590e-01, -7.06154466e-01, -2.27512732e-01,  6.00761354e-01,
        2.77830124e-01,  3.91607314e-01,  4.30928946e-01, -5.60367107e-01,
        7.00924814e-01, -1.11849681e-01, -4.74754483e-01, -1.38676725e-03,
        5.54225087e-01,  2.42525309e-01, -2.79286414e-01, -1.36285037e-01,
        2.58695066e-01, -1.28557354e-01, -5.45186363e-03,  4.45697248e-01,
        7.07761884e-01,  5.33519745e-01,  3.38084638e-01, -2.18392774e-01,
       -1.51028648e-01, -5.36715649e-02,  3.19276601e-02,  6.05338395e-01,
       -6.91890955e-01, -1.74998119e-01, -6.46432400e-01, -1.17689796e-01,
       -1.24816500e-01, -2.24449947e-01,  8.42071414e-01, -1.33400798e-01,
       -1.63862631e-01,  4.60637897e-01, -6.32843852e-01,  2.65991747e-01,
       -7.38031507e-01,  3.89719754e-01, -2.80431747e-01, -1.04986858e+00,
       -8.12514663e-01, -

In [28]:
# Then, we perform k-means clustering using sklearn:
from sklearn.cluster import KMeans

num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [41]:
test=embedder.encode(['just to check if it works'])
clustering_model.predict(test)[0]

4

In [30]:
cluster_assignment

array([1, 1, 3, 0, 0, 3, 1, 1, 0, 1, 1, 1, 0, 4, 0, 4, 4, 1, 3, 2, 1, 1,
       1, 4, 0, 3, 4, 1, 0, 3, 0, 2, 4, 1, 3, 3, 0, 1, 4, 0, 3, 3, 0, 1,
       1, 1, 0, 0, 1, 2, 1, 3, 0, 3, 0, 0, 4, 1, 2, 3, 3, 1, 0, 3, 3, 0,
       1, 1, 3, 1, 0, 1, 0, 3, 4, 4, 1, 3, 1, 3, 0, 0, 3, 0, 4, 0, 1, 0,
       0, 3, 0, 3, 1, 1, 1, 0, 1, 2, 0, 0, 1, 3, 1, 4, 1, 1, 1, 0, 3, 0,
       3, 3, 1, 1, 0, 1, 0, 3, 0, 2, 0, 2, 2, 2, 0, 4, 3, 0, 1, 4, 3, 2,
       0, 4, 0, 1, 3, 0, 0, 3, 3, 1, 1, 0, 3, 1, 1, 2, 4, 0, 1, 0, 2, 0,
       0, 0, 4, 0, 1, 1, 3, 3, 3, 1, 0, 1, 3, 0, 1, 1, 0, 3, 4, 0, 3, 1,
       3, 3, 0, 3, 2, 1, 2, 3, 0, 1, 1, 1, 1, 0, 3, 3, 3, 1, 1, 0, 0, 4,
       3, 0, 1, 3, 3, 2, 1, 2, 2, 0, 0, 0, 3, 4, 1, 0, 0, 0, 3, 1, 0, 0,
       0, 1, 4, 1, 1, 3, 3, 0, 0, 3, 1, 1, 4, 0, 2, 1, 1, 1, 0, 1, 3, 0,
       0, 0, 4, 1, 1, 1, 3, 3, 0, 2, 1, 1, 0, 0, 0, 3, 3, 1, 0, 1, 0, 0,
       1, 4, 0, 2, 2, 0, 0, 4, 0, 3, 1, 4, 0, 4, 4, 2, 0, 4, 3, 1, 3, 3,
       0, 1, 0, 3, 0, 3, 1, 3, 0, 0, 3, 0, 1, 4, 3,

In [31]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(lines[sentence_id])



In [32]:
print(clustered_sentences[0][0])

weeks theme words ai usage examples secundan sekuhnduhn adjective occurring every day latin secundus second earliest documented use c water conservation ad brisbane australia photo tamara forsooth sir henry hath adopted curious habit taking long walks secundan morrows sun offends next shines please william shakespeare tales times oddities stratford sentinel england jun aigenerated usage example todays sponsor orijinz fabulous game laughed laughed hours play family friends smash much fun wealth passed merit bad luck seen bad character ideologues justify punishing sick poor poverty neither crime character flaw stigmatize let people die struggle live sarah kendzior journalist author b sep looking wordquotation archives change address etc pronunciation permalink


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the clustered sentences into separate documents for TF-IDF analysis
clustered_documents = [' '.join(cluster) for cluster in clustered_sentences]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the clustered documents
tfidf_matrix = tfidf_vectorizer.fit_transform(clustered_documents)

# Get the feature names (words) from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# For each cluster, find the top N keywords
num_keywords = 5  # You can adjust this value as needed
cluster_labels = []

for i, cluster_matrix in enumerate(tfidf_matrix):
    feature_index = cluster_matrix.toarray().argsort()[:, -num_keywords:][0]
    cluster_keywords = [feature_names[idx] for idx in feature_index]
    cluster_labels.append(cluster_keywords)

# Print the cluster labels
for i, labels in enumerate(cluster_labels):
    print(f"Cluster {i + 1} - Labels: {', '.join(labels)}")


Cluster 1 - Labels: new, answer, min, question, read
Cluster 2 - Labels: stipend, students, read, internship, inr
Cluster 3 - Labels: web, link, take, click, message
Cluster 4 - Labels: email, pes, university, linkedin, pesu
Cluster 5 - Labels: securities, और, आपक, broker, कर
