# Retrieving the Mail

In [None]:
import os.path
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [None]:
# Delete token.json whenever this scope list is modified.
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
# HTTP session shared by the paginated message-list requests.
session = requests.Session()

def save_html(response, path):
    """Dump the raw body of *response* to *path* and report the location.

    Args:
        response: object exposing the payload bytes as ``.content``.
        path: destination file; it is opened in binary write mode.
    """
    payload = response.content
    with open(path, 'wb') as destination:
        destination.write(payload)
    print(f"HTML content saved to '{path}'")

def authenticate():
    """Return Gmail credentials, reusing the cached token when it is usable.

    Credentials are loaded from token.json if that file exists. When they are
    missing or not valid they are either refreshed (expired and a refresh
    token is present) or obtained through the local-server OAuth flow driven
    by client_secret.json; in both cases the result is written back to
    token.json.

    Returns:
        The credentials object.
    """
    token_path = "token.json"
    creds = None
    if os.path.exists(token_path):
        print("User token found")
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    # Fast path: the cached credentials can be used as they are.
    if creds and creds.valid:
        print("Authenticated")
        return creds

    can_refresh = creds and creds.expired and creds.refresh_token
    if can_refresh:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file("client_secret.json", SCOPES)
        creds = flow.run_local_server(port=0)

    # Persist the new or refreshed credentials for the next run.
    with open(token_path, "w") as token:
        token.write(creds.to_json())
    print("Authenticated")
    return creds

def get_message_ids():
    """Collect the id of every message listed by the Gmail list endpoint.

    Requests ``message_url`` page by page through the shared ``session``,
    following ``nextPageToken`` until no further page is announced. Each raw
    page is saved to json.html (overwritten per page), each message entry is
    appended to message_json.txt and, at the end, every id is appended to
    message_ids.txt.

    Returns:
        list: the message ids in the order the pages returned them.
    """
    message_ids = []
    # Page on a private copy: writing the token into the shared ``parameters``
    # dict would make a later call start from a stale page instead of page one.
    page_params = dict(parameters)
    while True:
        # Move across pages
        response_message_getter = session.get(
            message_url,
            params=page_params,
            headers={'Authorization': f'Bearer {creds.token}'},
        )
        save_html(response_message_getter, 'json.html')

        # Decode the body once and read both fields from the same object.
        page = response_message_getter.json()
        messages = page.get('messages', [])
        print(len(messages))

        next_page_token = page.get('nextPageToken')

        print('messages\n', messages, '\nnext page token\n', next_page_token)
        with open('message_json.txt', 'a') as f:
            for message in messages:
                message_ids.append(message['id'])
                f.write(f'{message}\n')

        if not next_page_token:
            break
        page_params['pageToken'] = next_page_token
        print(page_params)

    print('writing to ids file')
    with open('message_ids.txt', 'a') as f:
        f.writelines(f'{message_id}\n' for message_id in message_ids)
    return message_ids

# Serialises updates of the shared ``message_count`` counter: get_a_message()
# is submitted to a thread pool, and an unguarded ``+=`` can lose increments.
_message_count_lock = threading.Lock()


def get_a_message(message_id):
    """Download one message in ``full`` format and save it as JSON.

    The response body is written to root/messages/message<message_id>.json.
    The global ``message_count`` is incremented and the new value printed
    before the request is sent.

    Args:
        message_id: Gmail message id to fetch.
    """
    global message_count
    with _message_count_lock:
        message_count += 1
        current_count = message_count
    print(f"Retrieved {current_count} messages.")
    url = f'https://gmail.googleapis.com/gmail/v1/users/{userId}/messages/{message_id}'
    params = {'format': 'full'}
    response = requests.get(url, params=params, headers={'Authorization': f'Bearer {creds.token}'})
    # save_html() opens the target directly, so the directory must exist.
    os.makedirs('root/messages', exist_ok=True)
    save_html(response, f'root/messages/message{message_id}.json')



In [None]:

# main loop
creds = authenticate()
# Truncate the files get_message_ids() appends to, so ids from a previous run
# are not mixed in. Opening in 'w' mode is enough; the with-block closes them.
for stale_output in ('message_ids.txt', 'message_json.txt'):
    with open(stale_output, 'w'):
        pass
# get_a_message() increments this once per call and prints it as the number
# of messages retrieved, so it has to start at zero for that figure to be right.
message_count = 0
try:
    # NOTE(review): `service` is not used below; every request is sent with
    # `requests` and the bearer token. Kept in case other cells rely on it.
    service = build('gmail', 'v1', credentials=creds)
    userId = 'arehman.bee22seecs@seecs.edu.pk'
    message_url = f'https://gmail.googleapis.com/gmail/v1/users/{userId}/messages'
    parameters = {'maxResults': 500}
    message_ids = get_message_ids()
    print(len(message_ids))
    print("done")

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(get_a_message, message_id) for message_id in message_ids]
        for future in as_completed(futures):
            # get_a_message() returns nothing; result() is still called so an
            # exception raised inside a worker surfaces here.
            future.result()

except HttpError as error:
    print(f"An error occurred: {error}")

# **Pre-processing Raw Mail response**

## MIME to text level processing

In [None]:
import os
import json
import base64
from bs4 import BeautifulSoup

# Counters summarising the MIME structure of the processed messages
single_part_count = multi_part_count = 0
plain_text_count = text_html_count = others_count = 0
total_messages_viewed = 0
# MIME types of text parts that are neither html nor plain
others = []

# Encoded bodies queued for decoding while a message is being processed
to_decode_plain_text = []
to_decode_html = []
#payload->headers->subject
os.makedirs("root/messages/Distilled", exist_ok=True)
def print_text_body(path):
    """Distil one saved Gmail message into root/messages/Distilled/<file name>.

    The output file receives, in this order: every Subject header value, the
    visible text of each html part, a separator banner, and the decoded plain
    text parts. MIME statistics are updated through the module-level counters.

    Args:
        path: path of the message JSON file, e.g. 'root/messages/<name>.json'.
    """
    global single_part_count, multi_part_count, plain_text_count, text_html_count, others_count, total_messages_viewed, to_decode_plain_text, to_decode_html

    file_name = os.path.basename(path)
    total_messages_viewed += 1
    # Start both queues empty BEFORE the parts are walked. Clearing the plain
    # text queue after the walk (as was done before) threw the bodies away.
    to_decode_html = []
    to_decode_plain_text = []
    print("PATHHHHHHHHHHHHHHH", path)
    with open(path, 'r', encoding='utf-8') as msg:
        message = json.load(msg)

    sections = []

    # adding the subject
    subject_found = False
    for header in message['payload']['headers']:
        if header['name'] == 'Subject':
            text = header['value']
            print("Subject:", text)
            # Trailing newline keeps the subject's last word from fusing with
            # the first word of the body.
            sections.append(text + "\n")
            subject_found = True
    if not subject_found:
        print("No subject for this one:", file_name)

    # multipart case
    if 'parts' in message['payload']:
        multi_part_count += 1
        for part in message['payload']['parts']:
            process_a_single_part(part)
    # single part case
    else:
        single_part_count += 1
        process_a_single_part(message['payload'])

    # decode the html parts and keep only their visible text
    for encoded_html in to_decode_html:
        html = base64.urlsafe_b64decode(encoded_html).decode('utf-8')
        soup_object = BeautifulSoup(html, 'html.parser')
        sections.append(soup_object.get_text(separator='\n', strip=True) + "\n")

    # decode the plain text parts
    plain_text = "".join(
        base64.urlsafe_b64decode(encoded).decode('utf-8')
        for encoded in to_decode_plain_text
    )
    sections.append("\n*************************************************************************\n\n\n\n*************************************************************************")
    sections.append(plain_text)

    # Single write; 'w' also discards output left over from a previous run.
    with open(f"root/messages/Distilled/{file_name}", 'w', encoding='utf-8') as f:
        f.write("".join(sections))
def process_a_single_part(part):
    """Queue the text body of one MIME part for decoding and count its type.

    html bodies are appended to ``to_decode_html`` and plain bodies to
    ``to_decode_plain_text``; other ``text/*`` types are recorded in
    ``others``. Parts whose type does not start with ``text/`` contribute
    nothing themselves, but any nested ``parts`` they carry are processed
    recursively.

    Args:
        part: message-part dict with a 'mimeType' key and, optionally,
            'body' and 'parts'.
    """
    global plain_text_count, text_html_count, others_count
    # Container parts hold their content in child parts; without this walk
    # the text of nested multipart messages was silently skipped.
    for child in part.get('parts', []):
        process_a_single_part(child)

    mime_type = part['mimeType']
    if not mime_type.startswith('text/'):
        return
    print(mime_type)
    body = part.get('body', {})
    if mime_type.endswith('html'):  # include both amp and regular html
        if 'data' in body:
            to_decode_html.append(body['data'])
        text_html_count += 1
    elif mime_type.endswith('plain'):
        if 'data' in body:
            to_decode_plain_text.append(body['data'])
        plain_text_count += 1
    else:
        others_count += 1
        others.append(mime_type)

files_stack = []
# Distil every saved message; directory entries not ending in 'json' are skipped.
json_names = [entry for entry in os.listdir('root/messages') if entry.endswith('json')]
for file in json_names:
    print(file, ":******************************************************************************")
    print_text_body('root/messages/' + file)

# Summary of the MIME statistics gathered while distilling
summary_lines = (
    ("Total number of messages viewed: ", total_messages_viewed),
    ("Single part messages count: ", single_part_count),
    ("Multi part messages count: ", multi_part_count),
    ("Messages with plain text: ", plain_text_count),
    ("Messages with html text: ", text_html_count),
    ("Messages with text but not html or plain: ", others_count),
    ("their list: ", others),
)
for summary_label, summary_value in summary_lines:
    print(summary_label, summary_value)

## Text Pre-processing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import os
import re
from gensim.models import KeyedVectors
import numpy as np
import gensim.downloader as api
# Pretrained vectors, used by preprocess_text() as a vocabulary filter
model2 = api.load('word2vec-google-news-300')

# Download NLTK resources
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Initialize NLTK objects
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Input directory (distilled text) and output directory (preprocessed text)
distilled_dir = "root/messages/Distilled"
output_dir = "Corpus"
os.makedirs(output_dir, exist_ok=True)

# Text clean-up applied to every distilled mail
def preprocess_text(text):
    """Normalise *text* into a space-separated string of lemmatised tokens.

    Steps: lowercase, keep only whitespace-separated words present in
    ``model2``, replace non-letters with spaces, collapse whitespace,
    tokenise, drop English stop words and lemmatise what is left. The result
    is printed and returned.
    """
    in_vocabulary = " ".join(word for word in text.lower().split() if word in model2)
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', in_vocabulary)
    single_spaced = re.sub(r'\s+', ' ', letters_only)

    kept_tokens = []
    for token in word_tokenize(single_spaced):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    preprocessed_text = ' '.join(kept_tokens)
    print(preprocessed_text)
    return preprocessed_text

In [None]:
# Preprocess every distilled mail, write it to the output directory and build
# the concatenated corpus file.
# Filenames whose preprocessed content turned out empty
empty_files = []
# Preprocessed documents collected for the combined corpus file
corpus_parts = []

for file in os.listdir(distilled_dir):
    print("Processing file:", file)
    with open(os.path.join(distilled_dir, file), encoding="utf-8") as f:
        contents = f.read()
    preprocessed_contents = preprocess_text(contents)
    if not preprocessed_contents.strip():
        # Nothing survived preprocessing: note the file and do not save it.
        empty_files.append(file)
        print("Empty content for file:", file)
        continue
    output_file_path = os.path.join(output_dir, file)
    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        output_file.write(preprocessed_contents)
    corpus_parts.append(preprocessed_contents)
    print("Preprocessed contents saved to:", output_file_path)

# One document per line. Plain concatenation glued the last word of a mail to
# the first word of the next one.
whole_corpus = "\n".join(corpus_parts)
with open(os.path.join(output_dir, 'The_whole_corpus.txt'), 'w', encoding='utf-8') as f:
    f.write(whole_corpus)

# Print count of files with empty preprocessed contents and their names
print("Number of files with empty preprocessed contents:", len(empty_files))
print("File names with empty preprocessed contents:", empty_files)

In [None]:
from gensim.models import KeyedVectors
import numpy as np
import gensim.downloader as api
# Loads the 'word2vec-google-news-300' vectors into ``model2`` through gensim's downloader.
# NOTE(review): the text pre-processing cell performs this identical load; presumably it
# is repeated so the notebook can be resumed from this point — confirm before removing.
model2 = api.load('word2vec-google-news-300')

### Document made

In [None]:
import os
# Read every preprocessed mail; the combined corpus file is not a mail.
mails = []
for file in os.listdir('Corpus'):
    if file == 'The_whole_corpus.txt':
        continue
    with open(f"Corpus/{file}", 'r', encoding='utf-8') as f:
        mails.append(f.read())
    print("added: ", file)

## BoW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words representation of the mails: one row per entry of ``mails``,
# one column per vocabulary term found by the vectorizer.
count_vectorizer = CountVectorizer()

# Learn the vocabulary from ``mails`` and count term occurrences per mail
bow_matrix = count_vectorizer.fit_transform(mails)

# Vocabulary terms, in column order of the matrix
feature_names = count_vectorizer.get_feature_names_out()

# Print the BoW matrix as a dense array
# NOTE(review): toarray() materialises the full matrix — may be large for a big corpus.
print(bow_matrix.toarray())

# Print feature names
print(feature_names)

# **Aggregation for K-means**

## Using word to vec summation and average

In [None]:
def sum_and_average_embeddings_of_a_sentence(mail):
    """Return the mean and the sum of the word vectors of *mail*'s distinct words.

    Words are the whitespace-separated tokens of *mail*, each counted once.
    Tokens missing from ``model2`` are skipped and reported as a percentage.

    Args:
        mail: preprocessed text of one mail.

    Returns:
        tuple: ``(average, embedding_sum)``. The average is taken over the
        words found in ``model2``; it is a zero vector when none was found
        (including empty input) instead of dividing by zero.
    """
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the floating-point summation order is the same on every run.
    words = list(dict.fromkeys(mail.split()))
    embedding_sum = np.zeros(model2.vector_size)
    hits = 0
    for word in words:
        if word in model2:
            embedding_sum += model2[word]
            hits += 1
    misses = len(words) - hits
    print("words missed %:", misses * 100 / len(words) if words else 0.0)
    if hits == 0:
        return np.zeros(model2.vector_size), embedding_sum
    return embedding_sum / hits, embedding_sum

In [None]:
# Per-mail summed and averaged word vectors, in directory-listing order.
# NOTE: ``sum`` shadows the builtin; the name is kept because the "Sum"
# clustering cell reads it.
sum = []
average = []

directory = 'Corpus'
mail_names = [name for name in os.listdir(directory) if name != 'The_whole_corpus.txt']
for file in mail_names:
    with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
        a, s = sum_and_average_embeddings_of_a_sentence(f.read())
    sum.append(s)
    average.append(a)

## Using tf-idf weighted average

In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors

# Read documents from files in the 'Corpus' directory
docs = []
directory = 'Corpus'
for file in os.listdir(directory):
    if file != 'The_whole_corpus.txt':
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            docs.append(f.read())
print(len(docs))

# Fit TF-IDF on the documents; only the learned IDF weights are used below
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

# Retrieve vocabulary and IDF scores
vocab = tfidf_vectorizer.get_feature_names_out()
idf_scores = tfidf_vectorizer.idf_

# Create a dictionary to map words to their indices in vocab
word_to_index = {word: index for index, word in enumerate(vocab)}

# One vector per document: the IDF-weighted sum of its word vectors (each
# occurrence counts, which supplies the term-frequency factor), scaled to
# unit length.
document_vectors = []
zero_count = 0
for doc in docs:
    # Documents are already preprocessed, so whitespace splitting is enough
    words = doc.split()
    # Take the dimension from the model instead of assuming 300
    doc_vector = np.zeros(model2.vector_size)
    for word in words:
        if word in model2:
            word_vector = model2[word]
            # Words the vectorizer did not keep in its vocabulary get weight 0
            index = word_to_index.get(word)
            idf_weight = idf_scores[index] if index is not None else 0.0
            doc_vector += word_vector * idf_weight
    if np.count_nonzero(doc_vector) > 0:
        # Normalize document vector
        doc_vector /= np.linalg.norm(doc_vector)
    else:
        # A zero vector cannot be normalised; keep it as is and count it
        print("Warning: Zero vector encountered for document.")
        zero_count += 1
    document_vectors.append(doc_vector)


# Print document vectors
for i, vec in enumerate(document_vectors):
    print(f"Document {i+1} vector: {vec}")
print("Zero_count=", zero_count)

## Using Doc2vec

In [None]:
import os
from gensim.models.doc2vec import TaggedDocument

email_dir = "Corpus"

# List to store TaggedDocuments
tagged_data = []

# Iterate over each email file in the directory
for filename in os.listdir(email_dir):
    if filename != "The_whole_corpus.txt":
        filepath = os.path.join(email_dir, filename)

        # Read the contents of the email file
        with open(filepath, "r", encoding="utf-8") as file:
            preprocessed_text = file.read()

        print("Filename:", filename)
        print("Preprocessed Text:", preprocessed_text)
        print("------------------------------------------")

        # ``words`` must be a list of tokens. Passing the raw string makes
        # Doc2Vec iterate over it character by character and learn letters
        # instead of words.
        tagged_doc = TaggedDocument(words=preprocessed_text.split(), tags=[filename])

        # Append the TaggedDocument to the list
        tagged_data.append(tagged_doc)

# Now you have a list of TaggedDocuments, each with the filename as its tag
# You can use this tagged_data to train your Doc2Vec model


In [None]:
from gensim.models.doc2vec import Doc2Vec
# Hyper-parameters passed to Doc2Vec below
vector_size = 300  # Dimensionality of document vectors
window = 100  # Maximum distance between the current and predicted word
min_count = 1  # Ignore words with total frequency lower than this
epochs = 20 # Number of iterations over the corpus
# Build and train the model on tagged_data; each document is tagged with its filename
model = Doc2Vec(tagged_data, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)


# Save the trained model to the file "doc2vec_model" in the working directory
model.save("doc2vec_model")

# Alternatively, if you want to load a pre-trained model
# model = Doc2Vec.load("doc2vec_model")


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Nearest-neighbour lookup among the trained document vectors
def retrieve_nearest_neighbors(doc_id, model, top_n=3):
    """Return the tags of the *top_n* documents most similar to *doc_id*.

    Similarity is the cosine between the stored vector of *doc_id* and every
    vector in ``model.dv``. The single best match is dropped on the
    assumption that it is the query document itself.

    Args:
        doc_id: key accepted by ``model.dv[...]``.
        model: trained Doc2Vec model.
        top_n: number of neighbours to return.

    Returns:
        list: document tags, most similar first.
    """
    query_vector = model.dv[doc_id]
    scores = cosine_similarity([query_vector], model.dv.vectors)[0]

    # Ascending order: cut off the last (best) entry, keep the top_n before
    # it, then flip so the best neighbour comes first.
    ascending = scores.argsort()
    neighbour_indices = ascending[-top_n-1:-1][::-1]

    top_similar_docs = []
    for idx in neighbour_indices:
        top_similar_docs.append(model.dv.index_to_key[idx])
    return top_similar_docs

# Example of how to retrieve nearest neighbors for a document
# Choose a document index from the corpus
query_doc_index = 23 # Adjust as needed

# Retrieve the nearest neighbors for the chosen document
nearest_neighbors = retrieve_nearest_neighbors(query_doc_index, model)

# Print the query document, then each neighbour with its text.
# The corpus files are written as UTF-8, so they are read back as UTF-8
# instead of with the platform default encoding.
query_filename = model.dv.index_to_key[query_doc_index]
print("Nearest neighbors for document:", query_filename)
with open(f"Corpus/{query_filename}", 'r', encoding='utf-8') as f:
    print(f.read())
for idx, neighbor in enumerate(nearest_neighbors, 1):
    print(f"{idx}. {neighbor}")
    with open(f"Corpus/{neighbor}", 'r', encoding='utf-8') as f:
        print(f.read())


## BoW

## TF-IDF

# Applying Kmeans

## Doc2vec

In [None]:
# One trained vector per document, in the model's index order
document_embeddings = [model.dv[position] for position in range(len(model.dv))]
#print(document_embeddings)

### n_init=10, max_iter=50

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Cluster counts to evaluate (inclusive on both ends)
min_clusters = 6
max_clusters = 50
range_of_clusters = range(min_clusters, max_clusters + 1)

# One entry per evaluated cluster count, in the order of range_of_clusters
wcss_values = []
silhouette_scores = []

for num_clusters in range_of_clusters:
    print(num_clusters)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42,n_init=10,max_iter=50)

    # Fit on the document embeddings and get each document's cluster
    cluster_labels = kmeans.fit_predict(document_embeddings)

    # Within-Cluster Sum of Squares (WCSS) of this fit
    wcss = kmeans.inertia_
    # Mean silhouette of this labelling
    silhouette_avg = silhouette_score(document_embeddings, cluster_labels)

    wcss_values.append(wcss)
    silhouette_scores.append(silhouette_avg)

### Plotting

In [None]:
# One figure per metric: silhouette score first, WCSS second
tick_positions = range(min_clusters, max_clusters + 1, 2)
for metric_values, metric_title, metric_axis_label in (
    (silhouette_scores, 'Silhouette Score vs. Number of Clusters', 'Silhouette Score'),
    (wcss_values, 'Within-Cluster Sum of Squares (WCSS) vs. Number of Clusters', 'WCSS'),
):
    plt.figure(figsize=(10, 5))
    plt.plot(range_of_clusters, metric_values, marker='o')
    plt.title(metric_title)
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel(metric_axis_label)
    plt.xticks(tick_positions)
    plt.grid(True)
    plt.show()

In [None]:
# Same two figures as the previous cell: silhouette scores, then WCSS values
figure_specs = (
    (silhouette_scores, 'Silhouette Score vs. Number of Clusters', 'Silhouette Score'),
    (wcss_values, 'Within-Cluster Sum of Squares (WCSS) vs. Number of Clusters', 'WCSS'),
)
for series, figure_title, y_axis_label in figure_specs:
    plt.figure(figsize=(10, 5))
    plt.plot(range_of_clusters, series, marker='o')
    plt.title(figure_title)
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel(y_axis_label)
    plt.xticks(range(min_clusters, max_clusters + 1, 2))
    plt.grid(True)
    plt.show()

### Cluster using specific K-value

In [None]:
# Final clustering of the Doc2Vec embeddings with a fixed number of clusters
num_clusters = 50
kmeans = KMeans(n_clusters=num_clusters, random_state=42,n_init=50,max_iter=1000)

# fit() returns the fitted estimator, so the labels can be read straight off it
cluster_labels = kmeans.fit(document_embeddings).labels_

### Plotting in 2D space

In [None]:
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components
pca = PCA(n_components=2)
document_vectors_2d = pca.fit_transform(document_embeddings)

# One scatter series per cluster so each gets its own legend entry
plt.figure(figsize=(8, 6))
for cluster in range(num_clusters):
    in_cluster = cluster_labels == cluster
    plt.scatter(
        document_vectors_2d[in_cluster, 0],
        document_vectors_2d[in_cluster, 1],
        label=f'Cluster {cluster+1}',
    )
plt.title('K-means Clustering Results')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

### Reading the clusters and corresponding documents

In [None]:
# Print the documents of every cluster, cluster by cluster.
# The cluster ids come from the labels themselves instead of a hard-coded
# count, so this keeps working when the number of clusters changes.
# NOTE(review): assumes ``docs`` and ``cluster_labels`` list the documents in
# the same order — confirm.
for cluster_to_inspect in sorted(set(cluster_labels)):
    documents_in_cluster = [doc for doc, label in zip(docs, cluster_labels) if label == cluster_to_inspect]

    for position, doc in enumerate(documents_in_cluster, start=1):
        print(f"Document {position} in Cluster {cluster_to_inspect+1}:")
        print(doc)
        print("-" * 50)

## Word2vec

### Tf-IDF weighted average

#### Without special parameters

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import concurrent.futures

# Range of clusters to try
min_clusters = 2
max_clusters = 1800
range_clusters = range(min_clusters, max_clusters + 1)
# NOTE(review): confirm max_clusters does not exceed the number of documents.


def compute_metrics(num_clusters):
    """Fit KMeans with *num_clusters* clusters on ``document_vectors``.

    Returns:
        tuple: ``(num_clusters, wcss, silhouette_avg)`` where ``wcss`` is the
        fitted model's inertia and ``silhouette_avg`` the silhouette score of
        its labels.
    """
    print("Clustering: ", num_clusters, '/', max_clusters)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(document_vectors)
    wcss = kmeans.inertia_
    silhouette_avg = silhouette_score(document_vectors, kmeans.labels_)
    return num_clusters, wcss, silhouette_avg


# Multithreaded computation
max_workers = 4  # Set the maximum number of workers
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # map() returns results in submission order, so entry i belongs to
    # range_clusters[i]. Collecting with as_completed() stored them in
    # completion order and misaligned the curves with the x axis.
    results = list(executor.map(compute_metrics, range_clusters))

# Evaluation metrics, aligned with range_clusters
wcss_values = [wcss for _, wcss, _ in results]
silhouette_scores = [silhouette for _, _, silhouette in results]

# Plot the Elbow Method and Silhouette Scores
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range_clusters, wcss_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method')

plt.subplot(1, 2, 2)
plt.plot(range_clusters, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')

plt.tight_layout()
plt.show()


##### Cluster using specific K-value

In [None]:

# Cluster the TF-IDF weighted document vectors with a fixed number of clusters
num_clusters = 250
kmeans = KMeans(n_clusters=num_clusters)

# fit() returns the fitted estimator, so the labels can be read straight off it
cluster_labels = kmeans.fit(document_vectors).labels_

##### Plotting in 2D space

In [None]:

from sklearn.decomposition import PCA
# Project the document vectors onto their first two principal components
pca = PCA(n_components=2)
document_vectors_2d = pca.fit_transform(document_vectors)

# One scatter series per cluster so each gets its own legend entry
plt.figure(figsize=(8, 6))
for cluster in range(num_clusters):
    in_cluster = cluster_labels == cluster
    plt.scatter(
        document_vectors_2d[in_cluster, 0],
        document_vectors_2d[in_cluster, 1],
        label=f'Cluster {cluster+1}',
    )
plt.title('K-means Clustering Results')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

##### Read the clusters and corresponding documents

In [None]:
# Print the documents of every cluster, cluster by cluster.
# The cluster ids come from the labels themselves instead of a hard-coded
# count, so this keeps working when the number of clusters changes.
for cluster_to_inspect in sorted(set(cluster_labels)):
    documents_in_cluster = [doc for doc, label in zip(docs, cluster_labels) if label == cluster_to_inspect]

    # Print or display the documents
    for position, doc in enumerate(documents_in_cluster, start=1):
        print(f"Document {position} in Cluster {cluster_to_inspect+1}:")
        print(doc)
        print("-" * 50)


#### KMeans parameters: n_init=10, max_iter=20

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np
import concurrent.futures

# Range of clusters to try
min_clusters = 2
max_clusters = 250
range_clusters = range(min_clusters, max_clusters + 1)


def compute_metrics(num_clusters):
    """Fit KMeans (n_init=10, max_iter=20, seed 42) on ``document_vectors``.

    Returns:
        tuple: ``(num_clusters, wcss, silhouette_avg)`` where ``wcss`` is the
        fitted model's inertia and ``silhouette_avg`` the silhouette score of
        its labels.
    """
    print("Clustering: ", num_clusters, '/', max_clusters)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=20,random_state=42)
    kmeans.fit(document_vectors)
    wcss = kmeans.inertia_
    silhouette_avg = silhouette_score(document_vectors, kmeans.labels_)
    print("clustered: ",num_clusters)
    return num_clusters, wcss, silhouette_avg


# Multithreaded computation
max_workers = 8  # Set the maximum number of workers
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # map() returns results in submission order, so entry i belongs to
    # range_clusters[i]. Collecting with as_completed() stored them in
    # completion order and misaligned the curves with the x axis.
    results = list(executor.map(compute_metrics, range_clusters))

# Evaluation metrics, aligned with range_clusters
wcss_values = [wcss for _, wcss, _ in results]
silhouette_scores = [silhouette for _, _, silhouette in results]


# Plot the Elbow Method and Silhouette Scores. Only these two metrics are
# computed, so the figure has two panels (no empty third slot).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range_clusters, wcss_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method')

plt.subplot(1, 2, 2)
plt.plot(range_clusters, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')


plt.tight_layout()
plt.show()

##### Cluster using specific K-value

In [None]:
# Cluster the TF-IDF weighted document vectors with a fixed number of clusters
num_clusters = 250
kmeans = KMeans(n_clusters=num_clusters)

# fit() returns the fitted estimator, so the labels can be read straight off it
cluster_labels = kmeans.fit(document_vectors).labels_

##### Plotting in 2D space

In [None]:
from sklearn.decomposition import PCA

# Project the document vectors onto their first two principal components
pca = PCA(n_components=2)
document_vectors_2d = pca.fit_transform(document_vectors)

# One scatter series per cluster so each gets its own legend entry
plt.figure(figsize=(8, 6))
for cluster in range(num_clusters):
    members = cluster_labels == cluster
    plt.scatter(
        document_vectors_2d[members, 0],
        document_vectors_2d[members, 1],
        label=f'Cluster {cluster+1}',
    )
plt.title('K-means Clustering Results')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


##### Read the clusters and corresponding documents

In [None]:
# Print the documents of every cluster, cluster by cluster.
# The cluster ids come from the labels themselves instead of a hard-coded
# count, so this keeps working when the number of clusters changes.
for cluster_to_inspect in sorted(set(cluster_labels)):
    documents_in_cluster = [doc for doc, label in zip(docs, cluster_labels) if label == cluster_to_inspect]

    # Print or display the documents
    for position, doc in enumerate(documents_in_cluster, start=1):
        print(f"Document {position} in Cluster {cluster_to_inspect+1}:")
        print(doc)
        print("-" * 50)

## Sum

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import concurrent.futures

# Range of clusters to try
min_clusters = 2
max_clusters = 1800
range_clusters = range(min_clusters, max_clusters + 1)
# NOTE(review): confirm max_clusters does not exceed the number of documents.


def compute_metrics(num_clusters):
    """Fit KMeans with *num_clusters* clusters on the summed vectors ``sum``.

    Returns:
        tuple: ``(num_clusters, wcss, silhouette_avg)`` where ``wcss`` is the
        fitted model's inertia and ``silhouette_avg`` the silhouette score of
        its labels.
    """
    print("Clustering: ", num_clusters, '/', max_clusters)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(sum)
    wcss = kmeans.inertia_
    # Score the labels against the data they were fitted on; scoring against
    # ``document_vectors`` measured a different embedding.
    silhouette_avg = silhouette_score(sum, kmeans.labels_)
    return num_clusters, wcss, silhouette_avg


# Multithreaded computation
max_workers = 4  # Set the maximum number of workers
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # map() returns results in submission order, so entry i belongs to
    # range_clusters[i]. Collecting with as_completed() stored them in
    # completion order and misaligned the curves with the x axis.
    results = list(executor.map(compute_metrics, range_clusters))

# Evaluation metrics, aligned with range_clusters
wcss_values = [wcss for _, wcss, _ in results]
silhouette_scores = [silhouette for _, _, silhouette in results]

# Plot the Elbow Method and Silhouette Scores
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range_clusters, wcss_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method')

plt.subplot(1, 2, 2)
plt.plot(range_clusters, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')

plt.tight_layout()
plt.show()

## Average

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import concurrent.futures

# Range of clusters to try
min_clusters = 2
max_clusters = 1800
range_clusters = range(min_clusters, max_clusters + 1)
# NOTE(review): confirm max_clusters does not exceed the number of documents.


def compute_metrics(num_clusters):
    """Fit KMeans with *num_clusters* clusters on the averaged vectors ``average``.

    Returns:
        tuple: ``(num_clusters, wcss, silhouette_avg)`` where ``wcss`` is the
        fitted model's inertia and ``silhouette_avg`` the silhouette score of
        its labels.
    """
    print("Clustering: ", num_clusters, '/', max_clusters)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(average)
    wcss = kmeans.inertia_
    # Score the labels against the data they were fitted on; scoring against
    # ``document_vectors`` measured a different embedding.
    silhouette_avg = silhouette_score(average, kmeans.labels_)
    return num_clusters, wcss, silhouette_avg


# Multithreaded computation
max_workers = 8  # Set the maximum number of workers
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # map() returns results in submission order, so entry i belongs to
    # range_clusters[i]. Collecting with as_completed() stored them in
    # completion order and misaligned the curves with the x axis.
    results = list(executor.map(compute_metrics, range_clusters))

# Evaluation metrics, aligned with range_clusters
wcss_values = [wcss for _, wcss, _ in results]
silhouette_scores = [silhouette for _, _, silhouette in results]

# Plot the Elbow Method and Silhouette Scores
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range_clusters, wcss_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method')

plt.subplot(1, 2, 2)
plt.plot(range_clusters, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np
import concurrent.futures

# Range of clusters to try
min_clusters = 50
max_clusters = 100
range_clusters = range(min_clusters, max_clusters + 1)


def compute_metrics(num_clusters):
    """Fit KMeans (n_init=10, max_iter=20, seed 42) on ``document_vectors``.

    Returns:
        tuple: ``(num_clusters, wcss, silhouette_avg)`` where ``wcss`` is the
        fitted model's inertia and ``silhouette_avg`` the silhouette score of
        its labels.
    """
    print("Clustering: ", num_clusters, '/', max_clusters)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=20,random_state=42)
    kmeans.fit(document_vectors)
    wcss = kmeans.inertia_
    silhouette_avg = silhouette_score(document_vectors, kmeans.labels_)
    print("clustered: ",num_clusters)
    return num_clusters, wcss, silhouette_avg


# Multithreaded computation
max_workers = 8  # Set the maximum number of workers
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # map() returns results in submission order, so entry i belongs to
    # range_clusters[i]. Collecting with as_completed() stored them in
    # completion order and misaligned the curves with the x axis.
    results = list(executor.map(compute_metrics, range_clusters))

# Evaluation metrics, aligned with range_clusters
wcss_values = [wcss for _, wcss, _ in results]
silhouette_scores = [silhouette for _, _, silhouette in results]


# Plot the Elbow Method and Silhouette Scores. Only these two metrics are
# computed, so the figure has two panels (no empty third slot).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range_clusters, wcss_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method')

plt.subplot(1, 2, 2)
plt.plot(range_clusters, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')


plt.tight_layout()
plt.show()


In [None]:
from sklearn.manifold import TSNE
# Cluster count for the final model.
# NOTE(review): 250 lies outside the 50-100 range scanned above - confirm it is intended.
best_k = 250
# random_state is pinned, as in the cluster-count scan, so that re-running the
# cell reproduces the same clusters.
best_kmeans = KMeans(n_clusters=best_k, n_init=50, max_iter=300, random_state=42)
best_kmeans.fit(document_vectors)
print("done")


In [None]:
# Reduce dimensionality with t-SNE
tsne = TSNE(n_components=3, random_state=42)
X_tsne = tsne.fit_transform(document_vectors)

# Plot the result in 3D
# (importing Axes3D registers the '3d' projection on older Matplotlib releases)
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Draw every point in a single call, coloured by cluster label. With best_k in
# the hundreds, one scatter call and legend entry per cluster recycles the
# default colour cycle many times over and the legend covers the figure, so a
# colour bar is used instead.
points = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], X_tsne[:, 2],
                    c=best_kmeans.labels_, cmap='nipy_spectral')
fig.colorbar(points, ax=ax, label='Cluster')
ax.set_title(f'KMeans Clustering Result with {best_k} Clusters (t-SNE)')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
plt.show()

## TF-IDF

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import concurrent.futures

# Candidate cluster counts: every integer from min_clusters to max_clusters, inclusive
min_clusters = 2
max_clusters = 1800
range_clusters = range(min_clusters, 1 + max_clusters)


def compute_metrics(num_clusters):
    """Fit KMeans on the global ``tfidf_matrix`` and score the clustering.

    Args:
        num_clusters: number of clusters to fit.

    Returns:
        Tuple ``(num_clusters, wcss, silhouette_avg)``: the requested cluster
        count, the fitted model's ``inertia_`` and the silhouette score of its
        labels over ``tfidf_matrix``.
    """
    # NOTE(review): silhouette_score needs fewer clusters than samples -
    # confirm the corpus holds more than max_clusters documents.
    print("Clustering: ", num_clusters, '/', max_clusters)
    model = KMeans(n_clusters=num_clusters).fit(tfidf_matrix)
    return num_clusters, model.inertia_, silhouette_score(tfidf_matrix, model.labels_)

# Multithreaded computation
max_workers = 2  # Set the maximum number of workers
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit one task per candidate cluster count
    futures = [executor.submit(compute_metrics, num_clusters) for num_clusters in range_clusters]
    # Collect results in submission order. as_completed() yields futures in
    # whatever order they finish, which would leave the metric lists out of
    # step with range_clusters (they are plotted against it afterwards).
    results = [future.result() for future in futures]

# Evaluation metrics, aligned index-for-index with range_clusters
wcss_values = [wcss for _, wcss, _ in results]
silhouette_scores = [silhouette_avg for _, _, silhouette_avg in results]

In [None]:

# Elbow (WCSS) and silhouette curves, one panel each, sharing the same x-axis values
plt.figure(figsize=(12, 5))
panels = (
    (wcss_values, 'Within-Cluster Sum of Squares (WCSS)', 'Elbow Method'),
    (silhouette_scores, 'Silhouette Score', 'Silhouette Analysis'),
)
for position, (series, y_label, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(range_clusters, series, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel(y_label)
    plt.title(heading)

plt.tight_layout()
plt.show()

# Using LDA

## TF-IDF

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract
num_topics = 10

# Build the LDA model and fit it to the TF-IDF matrix in one expression
# (fit() returns the fitted estimator itself).
lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=num_topics,
).fit(tfidf_matrix)

# Print topics
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Args:
        model: object exposing ``components_``, iterated one row per topic.
        feature_names: sequence mapping a column index to its word.
        n_top_words: how many words to print per topic, heaviest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        print(f"Topic {topic_idx}:")
        print(" ".join(feature_names[i] for i in ranked))
        print()

# Get feature names from TfidfVectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Number of words reported per topic, shared by the console and file output
n_top_words = 10

# Print topics
print_top_words(lda_model, feature_names, n_top_words=n_top_words)

# Save topics to a text file. The encoding is explicit because the vocabulary
# may contain non-ASCII words, which the platform default encoding can fail
# to write.
with open('topics.txt', 'w', encoding='utf-8') as f:
    for topic_idx, topic in enumerate(lda_model.components_):
        # argsort is ascending: reverse it, then keep the heaviest words
        top_indices = topic.argsort()[::-1][:n_top_words]
        f.write(f"Topic {topic_idx}:\n")
        f.write(" ".join(feature_names[i] for i in top_indices) + '\n')


In [None]:
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Report how many mails are loaded
print("Length of mails:", len(mails))

# Number of latent topics to extract
num_topics = 10

# Create the LDA model and fit it to the TF-IDF matrix
# (fit() returns the fitted estimator itself)
lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=num_topics,
).fit(tfidf_matrix)

# Report the number of rows in the fitted components_ matrix
print("Length of lda_model.components_:", len(lda_model.components_))

# Per-document topic distribution produced by the fitted model
doc_topic_distribution = lda_model.transform(tfidf_matrix)

# Report the shape of the document-topic matrix
print("Shape of doc_topic_distribution:", doc_topic_distribution.shape)

In [None]:
# Write every document's topic probabilities (most probable first) to a text
# file and, under its dominant topic, list the documents most associated with
# that topic.
top_n_docs = 5  # number of associated documents listed per dominant topic

# Rank documents per topic from the document-topic matrix (one column per
# topic). lda_model.components_ is the topic-word matrix, so sorting one of its
# rows yields vocabulary indices, not document indices.
# Result shape: (n_topics, top_n_docs), best document first.
top_docs_per_topic = np.argsort(doc_topic_distribution, axis=0)[::-1][:top_n_docs].T

# Explicit UTF-8 because the mail bodies are written verbatim to the file.
with open('document_topic_assignment.txt', 'w', encoding='utf-8') as f:
    for i, doc_topics in enumerate(doc_topic_distribution):
        print("Document index:", i)
        sorted_topics = np.argsort(doc_topics)[::-1]  # Sort topics by probability in descending order
        for rank, topic_idx in enumerate(sorted_topics):
            f.write(f"Topic {topic_idx}: Probability={doc_topics[topic_idx]}\n")
            if rank != 0:
                continue
            # Only the document's dominant topic gets its associated documents listed
            f.write("Associated Documents:\n")
            for doc_idx in top_docs_per_topic[topic_idx]:
                # Guard in case `mails` is shorter than the matrix the model was fitted on
                if doc_idx < len(mails):
                    f.write(f"Document {doc_idx}: {mails[doc_idx]}\n")
                else:
                    print(f"Ignore document index {doc_idx} as it is out of range")
        f.write('\n')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
import numpy as np


# TF-IDF -> LDA -> normalisation -> K-means, chained in a single estimator
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('lda', LatentDirichletAllocation(n_components=10, random_state=42)),
    ('norm', Normalizer()),  # Normalize LDA topic distributions
    ('kmeans', KMeans(n_clusters=5, random_state=42)),
])

# Fit all steps on the raw mails
pipeline.fit(mails)

# Document-topic distributions: push the mails through the fitted TF-IDF step,
# then through the fitted LDA step
tfidf_features = pipeline.named_steps['tfidf'].transform(mails)
lda_topics = pipeline.named_steps['lda'].transform(tfidf_features)

# Cluster label assigned to each mail by the fitted K-means step
kmeans_clusters = pipeline.named_steps['kmeans'].labels_

# Report the topic mix and cluster of every mail
for mail, topic_mix, cluster_id in zip(mails, lda_topics, kmeans_clusters):
    print(f"Document {mail}:")
    print(f"  LDA Topics: {topic_mix}")
    print(f"  K-means Cluster: {cluster_id}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Define range of cluster values to evaluate
cluster_values = range(2, 50)
wcss_scores = []
silhouette_scores = []

# TF-IDF, LDA and normalisation do not depend on n_clusters, so they are
# fitted once and their output reused. pipeline[:-1] is a sub-pipeline that
# shares its step objects with `pipeline`.
kmeans_features = pipeline[:-1].fit_transform(mails)
kmeans_step = pipeline.named_steps['kmeans']

for n_clusters in cluster_values:
    # Update K-means cluster value and refit only the clustering step
    kmeans_step.set_params(n_clusters=n_clusters)
    kmeans_clusters = kmeans_step.fit_predict(kmeans_features)

    # Calculate Within-Cluster Sum of Squares (WCSS)
    wcss = kmeans_step.inertia_
    wcss_scores.append(wcss)

    # Calculate the silhouette score in the same normalised space K-means
    # clustered, rather than on the un-normalised `lda_topics` left over from
    # an earlier fit.
    silhouette_avg = silhouette_score(kmeans_features, kmeans_clusters)
    silhouette_scores.append(silhouette_avg)

# Plot WCSS scores
plt.figure(figsize=(10, 5))
plt.plot(cluster_values, wcss_scores, marker='o', linestyle='-')
plt.title('Within-Cluster Sum of Squares (WCSS) vs. Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.xticks(cluster_values)
plt.grid(True)
plt.show()

# Plot silhouette scores
plt.figure(figsize=(10, 5))
plt.plot(cluster_values, silhouette_scores, marker='o', linestyle='-')
plt.title('Silhouette Score vs. Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(cluster_values)
plt.grid(True)
plt.show()


In [None]:
# Define range of topic values and cluster values to evaluate
topic_values = range(2, 25)
cluster_values = range(2, 35)

for n_topics in topic_values:
    print(f"Analyzing topics: {n_topics}")
    wcss_scores = []
    silhouette_scores = []

    # TF-IDF, LDA and normalisation depend only on n_topics, so they are
    # fitted once per topic count instead of once per (topics, clusters) pair.
    feature_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('lda', LatentDirichletAllocation(n_components=n_topics, random_state=42)),
        ('norm', Normalizer()),  # Normalize LDA topic distributions
    ])
    kmeans_features = feature_pipeline.fit_transform(mails)

    for n_clusters in cluster_values:
        print(f"  Analyzing clusters: {n_clusters}")
        kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)

        # Get cluster assignments from K-means
        kmeans_clusters = kmeans_model.fit_predict(kmeans_features)

        # Calculate Within-Cluster Sum of Squares (WCSS)
        wcss_scores.append(kmeans_model.inertia_)

        # Calculate the silhouette score on the normalised features that
        # K-means actually clustered, so both metrics describe the same space.
        silhouette_scores.append(silhouette_score(kmeans_features, kmeans_clusters))

    # Plot WCSS scores for each cluster value
    plt.figure(figsize=(10, 5))
    plt.plot(cluster_values, wcss_scores, marker='o', linestyle='-')
    plt.title(f'WCSS vs. Number of Clusters (Topics={n_topics})')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.xticks(cluster_values)
    plt.grid(True)
    plt.show()

    # Plot silhouette scores for each cluster value
    plt.figure(figsize=(10, 5))
    plt.plot(cluster_values, silhouette_scores, marker='o', linestyle='-')
    plt.title(f'Silhouette Score vs. Number of Clusters (Topics={n_topics})')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.xticks(cluster_values)
    plt.grid(True)
    plt.show()


In [None]:
# Define range of topic values and cluster values to evaluate
topic_values = range(2, 30)
cluster_values = range(2, 40)

for n_topics in topic_values:
    print(f"Analyzing topics: {n_topics}")
    wcss_scores = []
    silhouette_scores = []

    # TF-IDF, LDA and normalisation depend only on n_topics, so they are
    # fitted once per topic count instead of once per (topics, clusters) pair.
    feature_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('lda', LatentDirichletAllocation(n_components=n_topics, random_state=42)),
        ('norm', Normalizer()),  # Normalize LDA topic distributions
    ])
    kmeans_features = feature_pipeline.fit_transform(mails)

    for n_clusters in cluster_values:
        print(f"  Analyzing clusters: {n_clusters}")
        kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=30)

        # Get cluster assignments from K-means
        kmeans_clusters = kmeans_model.fit_predict(kmeans_features)

        # Calculate Within-Cluster Sum of Squares (WCSS)
        wcss_scores.append(kmeans_model.inertia_)

        # Calculate the silhouette score on the normalised features that
        # K-means actually clustered, so both metrics describe the same space.
        silhouette_scores.append(silhouette_score(kmeans_features, kmeans_clusters))

    # Plot WCSS scores for each cluster value
    plt.figure(figsize=(10, 5))
    plt.plot(cluster_values, wcss_scores, marker='o', linestyle='-')
    plt.title(f'WCSS vs. Number of Clusters (Topics={n_topics})')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.xticks(cluster_values)
    plt.grid(True)
    plt.show()

    # Plot silhouette scores for each cluster value
    plt.figure(figsize=(10, 5))
    plt.plot(cluster_values, silhouette_scores, marker='o', linestyle='-')
    plt.title(f'Silhouette Score vs. Number of Clusters (Topics={n_topics})')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.xticks(cluster_values)
    plt.grid(True)
    plt.show()


## BoW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Define range of topic values and cluster values to evaluate
topic_values = range(2, 25)
cluster_values = range(2, 20)

for n_topics in topic_values:
    print(f"Analyzing topics: {n_topics}")
    wcss_scores = []
    silhouette_scores = []

    # BoW, LDA and normalisation depend only on n_topics, so they are fitted
    # once per topic count instead of once per (topics, clusters) pair.
    feature_pipeline = Pipeline([
        ('bow', CountVectorizer()),
        ('lda', LatentDirichletAllocation(n_components=n_topics, random_state=42)),
        ('norm', Normalizer()),  # Normalize LDA topic distributions
    ])
    kmeans_features = feature_pipeline.fit_transform(mails)

    for n_clusters in cluster_values:
        print(f"  Analyzing clusters: {n_clusters}")
        kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)

        # Get cluster assignments from K-means
        kmeans_clusters = kmeans_model.fit_predict(kmeans_features)

        # Calculate Within-Cluster Sum of Squares (WCSS)
        wcss_scores.append(kmeans_model.inertia_)

        # Calculate the silhouette score on the normalised features that
        # K-means actually clustered, so both metrics describe the same space.
        silhouette_scores.append(silhouette_score(kmeans_features, kmeans_clusters))

    # Plot WCSS scores for each cluster value
    plt.figure(figsize=(10, 5))
    plt.plot(cluster_values, wcss_scores, marker='o', linestyle='-')
    plt.title(f'WCSS vs. Number of Clusters (Topics={n_topics})')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.xticks(cluster_values)
    plt.grid(True)
    plt.show()

    # Plot silhouette scores for each cluster value
    plt.figure(figsize=(10, 5))
    plt.plot(cluster_values, silhouette_scores, marker='o', linestyle='-')
    plt.title(f'Silhouette Score vs. Number of Clusters (Topics={n_topics})')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.xticks(cluster_values)
    plt.grid(True)
    plt.show()


In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

# Define range of topic values to evaluate
topic_values = range(2, 20)
perplexity_scores = []

# The bag-of-words matrix does not depend on the number of topics, so the
# mails are vectorised once and the matrix is reused for fitting and scoring.
bow_matrix = CountVectorizer().fit_transform(mails)

for n_topics in topic_values:
    print(f"Analyzing topics: {n_topics}")

    # Fit LDA with the current number of topics
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(bow_matrix)

    # Calculate perplexity on the same documents the model was fitted on
    perplexity_scores.append(lda.perplexity(bow_matrix))

# Plot perplexity scores for each topic value
plt.figure(figsize=(10, 5))
plt.plot(topic_values, perplexity_scores, marker='o', linestyle='-')
plt.title('Perplexity vs. Number of Topics')
plt.xlabel('Number of Topics')
plt.ylabel('Perplexity')
plt.grid(True)
plt.show()


In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import numpy as np
import matplotlib.pyplot as plt

# Define range of topic values to evaluate
topic_values = range(2, 25)
perplexity_scores = []
coherence_scores = []
diversity_scores = []

# Number of top words per topic that enter the diversity score
diversity_top_n = 25

# Tokenize each email string into a list of words
tokenized_mails = [mail.split() for mail in mails]

# Create dictionary from the tokenized emails
dictionary = Dictionary(tokenized_mails)

# Create bag-of-words corpus (independent of the topic count, so built once)
corpus = [dictionary.doc2bow(text) for text in tokenized_mails]

for n_topics in topic_values:
    print(f"Analyzing topics: {n_topics}")

    # Fit LDA model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, random_state=42)

    # log_perplexity() returns the per-word likelihood bound; perplexity is 2 ** (-bound)
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity_scores.append(np.exp2(-per_word_bound))

    # Calculate coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_mails, dictionary=dictionary, coherence='c_v')
    coherence_scores.append(coherence_model_lda.get_coherence())

    # Topic diversity: share of distinct words among the top words of all
    # topics (1.0 means no topic shares a top word with another). Counting
    # distinct values in a row of get_topics() would only count distinct
    # probabilities and says nothing about word overlap between topics.
    top_words = [
        word
        for topic_id in range(lda_model.num_topics)
        for word, _ in lda_model.show_topic(topic_id, topn=diversity_top_n)
    ]
    diversity_scores.append(len(set(top_words)) / len(top_words))

# Plot all metrics on one figure with three y-axes sharing the x-axis
fig, ax1 = plt.subplots(figsize=(10, 5))

color = 'tab:red'
ax1.set_xlabel('Number of Topics')
ax1.set_ylabel('Perplexity', color=color)
ax1.plot(topic_values, perplexity_scores, marker='o', linestyle='-', color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('Coherence', color=color)
ax2.plot(topic_values, coherence_scores, marker='o', linestyle='-', color=color)
ax2.tick_params(axis='y', labelcolor=color)

ax3 = ax1.twinx()
color = 'tab:green'
ax3.spines['right'].set_position(('outward', 60))  # move the spine
ax3.set_ylabel('Diversity', color=color)
ax3.plot(topic_values, diversity_scores, marker='o', linestyle='-', color=color)
ax3.tick_params(axis='y', labelcolor=color)

# Set the title before tight_layout() so the layout reserves room for it
ax1.set_title('Evaluation Metrics vs. Number of Topics')
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
# Redraw the three topic metrics on one figure: a primary y-axis plus two
# twin y-axes that share the same x-axis.
fig, ax1 = plt.subplots(figsize=(10, 5))
ax1.set_xlabel('Number of Topics')

ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax3.spines['right'].set_position(('outward', 60))  # move the spine

metric_panels = (
    (ax1, 'Perplexity', perplexity_scores, 'tab:red'),
    (ax2, 'Coherence', coherence_scores, 'tab:blue'),
    (ax3, 'Diversity', diversity_scores, 'tab:green'),
)
for metric_axis, metric_label, metric_series, color in metric_panels:
    # The axis label and tick labels take the colour of the plotted line
    metric_axis.set_ylabel(metric_label, color=color)
    metric_axis.plot(topic_values, metric_series, marker='o', linestyle='-', color=color)
    metric_axis.tick_params(axis='y', labelcolor=color)

fig.tight_layout()
plt.title('Evaluation Metrics vs. Number of Topics')
plt.show()


In [None]:
from gensim.models import TfidfModel
from gensim import corpora
import matplotlib.pyplot as plt
import numpy as np

# LdaModel and CoherenceModel are used below but not imported by this cell's
# import block; import them here so the cell does not depend on another cell
# having run first.
from gensim.models import CoherenceModel, LdaModel

# Define range of topic values to evaluate
topic_values = range(2, 25)
perplexity_scores = []
coherence_scores = []
diversity_scores = []

# Number of top words per topic that enter the diversity score
diversity_top_n = 25

# Tokenize each email string into a list of words
tokenized_mails = [mail.split() for mail in mails]

# Create dictionary from the tokenized emails
dictionary = corpora.Dictionary(tokenized_mails)

# Create TF-IDF representation of the bag-of-words corpus
tfidf = TfidfModel(dictionary=dictionary)
bow_corpus = [dictionary.doc2bow(text) for text in tokenized_mails]
corpus_tfidf = tfidf[bow_corpus]

for n_topics in topic_values:
    print(f"Analyzing topics: {n_topics}")

    # Fit LDA model
    lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=n_topics, random_state=42)

    # log_perplexity() returns the per-word likelihood bound; perplexity is 2 ** (-bound)
    # NOTE(review): the bound is computed on TF-IDF weights rather than raw
    # counts - confirm the values are comparable with the bag-of-words run.
    per_word_bound = lda_model.log_perplexity(corpus_tfidf)
    perplexity_scores.append(np.exp2(-per_word_bound))

    # Calculate coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_mails, dictionary=dictionary, coherence='c_v')
    coherence_scores.append(coherence_model_lda.get_coherence())

    # Topic diversity: share of distinct words among the top words of all
    # topics (1.0 means no topic shares a top word with another). Counting
    # distinct values in a row of get_topics() would only count distinct
    # probabilities and says nothing about word overlap between topics.
    top_words = [
        word
        for topic_id in range(lda_model.num_topics)
        for word, _ in lda_model.show_topic(topic_id, topn=diversity_top_n)
    ]
    diversity_scores.append(len(set(top_words)) / len(top_words))

# Plot all metrics on one figure with three y-axes sharing the x-axis
fig, ax1 = plt.subplots(figsize=(10, 5))

color = 'tab:red'
ax1.set_xlabel('Number of Topics')
ax1.set_ylabel('Perplexity', color=color)
ax1.plot(topic_values, perplexity_scores, marker='o', linestyle='-', color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('Coherence', color=color)
ax2.plot(topic_values, coherence_scores, marker='o', linestyle='-', color=color)
ax2.tick_params(axis='y', labelcolor=color)

ax3 = ax1.twinx()
color = 'tab:green'
ax3.spines['right'].set_position(('outward', 60))  # move the spine
ax3.set_ylabel('Diversity', color=color)
ax3.plot(topic_values, diversity_scores, marker='o', linestyle='-', color=color)
ax3.tick_params(axis='y', labelcolor=color)

# Set the title before tight_layout() so the layout reserves room for it
ax1.set_title('Evaluation Metrics vs. Number of Topics')
fig.tight_layout()
plt.show()


# Using SVM with LDA

In [None]:
import os
# Read every mail in the Corpus directory into `mails`, recording the path of
# each one at the same index in `file_location`.
mails = []
file_location = []
# Sorted so that index i refers to the same file on every run
# (os.listdir returns entries in arbitrary order).
for name in sorted(os.listdir('Corpus')):
    path = f"Corpus/{name}"
    # Skip the concatenated corpus file and anything that is not a regular
    # file (opening a sub-directory would raise).
    if name == 'The_whole_corpus.txt' or not os.path.isfile(path):
        continue
    with open(path, 'r', encoding='utf-8') as f:
        mails.append(f.read())
    file_location.append(path)

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Whitespace-split every mail into its list of tokens
tokenized_mails = list(map(str.split, mails))

# Token <-> id mapping built from the tokenized mails
dictionary = Dictionary(tokenized_mails)

# Bag-of-words representation of every mail
corpus = list(map(dictionary.doc2bow, tokenized_mails))

# Number of topics to learn
num_topics = 9

# Train the LDA model on the bag-of-words corpus
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
)

# Show each topic (numbered from 1) with its top words
for topic_id, topic_words in lda_model.print_topics():
    print(f"Topic {topic_id + 1}: {topic_words}")


In [None]:
import gensim
from gensim import corpora
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Preprocess each email in 'mails' list
preprocessed_mails = [mail.split() for mail in mails]

# Create a dictionary representation of the emails
dictionary = corpora.Dictionary(preprocessed_mails)

# Create a document-term matrix
corpus = [dictionary.doc2bow(mail) for mail in preprocessed_mails]

# Train the LDA model (random_state pinned so the topics are reproducible)
lda_model = gensim.models.LdaModel(corpus, num_topics=9, id2word=dictionary, passes=15, random_state=42)

# Get the topic distributions for each document. minimum_probability=0.0 keeps
# every topic; with the default threshold, low-probability topics are dropped
# and would be stored as exact zeros below, so rows would no longer sum to 1.
topic_distributions = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]

# Extract topic proportions for each document into a dense (documents x topics) array
topic_proportions = np.zeros((len(topic_distributions), lda_model.num_topics))
for i, doc_topics in enumerate(topic_distributions):
    for topic, proportion in doc_topics:
        topic_proportions[i, topic] = proportion

# Perform dimensionality reduction using t-SNE
tsne_model = TSNE(n_components=2, random_state=42)
tsne_topics = tsne_model.fit_transform(topic_proportions)

# Create DataFrame for visualization
df_tsne = pd.DataFrame(tsne_topics, columns=['Dimension 1', 'Dimension 2'])

# Plot the 2D visualization
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_tsne, x='Dimension 1', y='Dimension 2')
plt.title('2D Visualization of LDA Topics')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()


In [None]:
import gensim
from gensim import corpora
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Preprocess each email in 'mails' list
preprocessed_mails = [mail.split() for mail in mails]

# Create a dictionary representation of the emails
dictionary = corpora.Dictionary(preprocessed_mails)

# Create a document-term matrix
corpus = [dictionary.doc2bow(mail) for mail in preprocessed_mails]

# Train the LDA model (random_state pinned so the topics are reproducible)
lda_model = gensim.models.LdaModel(corpus, num_topics=9, id2word=dictionary, passes=15, random_state=42)

# Get the topic distributions for each document. minimum_probability=0.0 keeps
# every topic; with the default threshold, low-probability topics are dropped
# from the returned list, so a dense matrix built from it would hold exact
# zeros and its rows would no longer sum to 1.
topic_distributions = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]

In [None]:
# Dense (documents x topics) matrix; topics missing from a document's list stay at 0
topic_proportions = np.zeros((len(topic_distributions), lda_model.num_topics))
for row, doc_topics in zip(topic_proportions, topic_distributions):
    # `row` is a view into topic_proportions, so the assignment fills the matrix in place
    for topic, proportion in doc_topics:
        row[topic] = proportion

# Show the filled topic proportions, one document per line
for i, topic_dist in enumerate(topic_proportions):
    print(i, ":", topic_dist)


In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Candidate cluster counts: every integer from min_clusters to max_clusters, inclusive
min_clusters = 2
max_clusters = 30
cluster_range = range(min_clusters, 1 + max_clusters)

# One silhouette score and one WCSS (inertia) value per candidate count
silhouette_scores = []
wcss = []

for n_clusters in cluster_range:
    kmeans = KMeans(n_init=20, random_state=42, n_clusters=n_clusters)

    # fit_predict() fits the model on the topic proportions and returns its labels
    cluster_labels = kmeans.fit_predict(topic_proportions)
    cluster_centers = kmeans.cluster_centers_

    silhouette_scores.append(silhouette_score(topic_proportions, cluster_labels))
    wcss.append(kmeans.inertia_)

# Two side-by-side panels: silhouette score on the left, WCSS on the right
plt.figure(figsize=(12, 6))
panels = (
    (silhouette_scores, 'Silhouette Score', 'Silhouette Score vs Number of Clusters'),
    (wcss, 'WCSS', 'WCSS vs Number of Clusters'),
)
for position, (series, y_label, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(cluster_range, series, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel(y_label)
    plt.title(heading)

plt.tight_layout()
plt.show()


In [None]:
# Cluster count chosen by hand for the final model
optimal_num_clusters = 8

# Final K-means model, fitted on the topic proportions
kmeans = KMeans(n_init=40, random_state=42, n_clusters=optimal_num_clusters)

# fit() returns the fitted estimator, whose labels_ hold one cluster id per document
cluster_labels = kmeans.fit(topic_proportions).labels_

# Two-dimensional t-SNE embedding of the same topic proportions
tsne = TSNE(n_components=2, random_state=42)
topic_tsne = tsne.fit_transform(topic_proportions)

# Embedding coordinates plus each document's cluster label, ready for seaborn
df_tsne = pd.DataFrame(topic_tsne, columns=['tsne1', 'tsne2']).assign(Cluster=cluster_labels)

# Scatter plot of the embedding, coloured by cluster
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df_tsne,
    x='tsne1',
    y='tsne2',
    hue='Cluster',
    palette='viridis',
    alpha=0.8,
)
plt.title('t-SNE Visualization of Clusters')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend(title='Cluster')
plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Three-dimensional t-SNE embedding of the topic proportions
tsne = TSNE(n_components=3, random_state=42)
topic_tsne_3d = tsne.fit_transform(topic_proportions)

# Embedding coordinates plus each document's cluster label
df_tsne_3d = pd.DataFrame(
    topic_tsne_3d, columns=['tsne1', 'tsne2', 'tsne3']
).assign(Cluster=cluster_labels)

# 3D scatter plot, one scatter call (and legend entry) per cluster
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

for cluster in range(optimal_num_clusters):
    cluster_data = df_tsne_3d.loc[df_tsne_3d['Cluster'] == cluster]
    xs, ys, zs = (cluster_data[column] for column in ('tsne1', 'tsne2', 'tsne3'))
    ax.scatter(xs, ys, zs, label=f'Cluster {cluster}')

ax.set_title('3D t-SNE Visualization of Clusters')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
ax.legend()

plt.show()


In [None]:
# Mean topic distribution of each cluster: one row per cluster, one column per
# topic. The boolean mask selects the documents assigned to that cluster.
cluster_topic_distributions = np.array([
    np.mean(topic_proportions[cluster_labels == cluster], axis=0)
    for cluster in range(optimal_num_clusters)
])

# For every cluster, show its three heaviest topics with their top words
for cluster, topic_dist in enumerate(cluster_topic_distributions):
    print(f"Cluster {cluster}:")
    # argsort is ascending: walk it backwards and stop after three entries
    top_topics = np.argsort(topic_dist)[:-4:-1]
    for topic_idx in top_topics:
        topic_words = lda_model.show_topic(topic_idx)
        print(f"    Topic {topic_idx}: {topic_words}")


## TF-IDF

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Function to load emails from files
def load_emails_from_files(directory):
    """Read every mail file in ``directory`` as UTF-8 text.

    The aggregate file ``The_whole_corpus.txt`` and entries that are not
    regular files (e.g. sub-directories, which cannot be opened as text) are
    skipped. Files are visited in sorted name order so that the same index
    refers to the same mail on every run; ``os.listdir`` alone returns entries
    in arbitrary order.

    Args:
        directory: path of the directory holding one mail per file.

    Returns:
        Tuple ``(emails, file_locations)`` of two parallel lists: the text of
        each mail and the path it was read from.
    """
    emails = []
    file_locations = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if name == 'The_whole_corpus.txt' or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            emails.append(f.read())
        file_locations.append(path)
    return emails, file_locations

# Read the corpus: mail texts and, in parallel, the file each one came from
emails, file_locations = load_emails_from_files('Corpus')

# TF-IDF vectorizer configured with max_df=0.95, min_df=2 and the built-in
# English stop-word list
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary from the mails and build their TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(emails)

# Feature names reported by the fitted vectorizer
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:

# Number of latent topics to extract
num_topics = 9

# Build the LDA model and fit it to the TF-IDF matrix in one expression
# (fit() returns the fitted estimator itself)
lda_model = LDA(
    random_state=42,
    n_components=num_topics,
).fit(tfidf_matrix)

# Function to display topics and their top words
def display_topics(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in *model*.

    Args:
        model: Object whose ``components_`` attribute iterates over one
            weight vector per topic (each vector must support ``argsort``).
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: How many words to list per topic.

    Topics are numbered from 1. For each topic a header line, the words
    joined by single spaces (heaviest first) and an empty line are printed.
    """
    for number, weights in enumerate(model.components_, start=1):
        print(f"Topic {number}:")
        # argsort ranks ascending: flip it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[idx] for idx in ranked))
        print()

# Print the 10 highest-weighted words of every fitted topic
print("Top words per topic:")
display_topics(lda_model, tfidf_feature_names, 10)

# Per-document topic distribution produced by the fitted LDA model
# (one row per email; the code below indexes it as rows x num_topics)
topic_distributions = lda_model.transform(tfidf_matrix)

# Reduce the topic distributions to 2 dimensions with t-SNE (fixed seed)
tsne_model = TSNE(n_components=2, random_state=42)
tsne_topics = tsne_model.fit_transform(topic_distributions)

# Plot the 2D embedding; each point is one row of topic_distributions,
# i.e. one email rather than one topic, despite the title wording
plt.figure(figsize=(10, 6))
sns.scatterplot(x=tsne_topics[:, 0], y=tsne_topics[:, 1], alpha=0.7)
plt.title('2D Visualization of LDA Topics (TF-IDF)')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()

# Get topic proportions for each document.
# np.array makes an independent float copy in a single vectorized step, so
# later changes to topic_proportions never touch topic_distributions. The
# array is expected to have shape (number of documents, num_topics).
topic_proportions = np.array(topic_distributions, dtype=float)

# Sweep the number of K-Means clusters and record, for every candidate, the
# silhouette score and the within-cluster sum of squares (WCSS, read from
# the fitted model's inertia_ attribute)
min_clusters = 2
max_clusters = 30
cluster_range = range(min_clusters, max_clusters + 1)
silhouette_scores = []
wcss = []

for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
    kmeans.fit(topic_proportions)
    cluster_labels = kmeans.labels_
    silhouette_scores.append(silhouette_score(topic_proportions, cluster_labels))
    wcss.append(kmeans.inertia_)

# Draw both metrics side by side against the number of clusters:
# silhouette score on the left, WCSS on the right
plt.figure(figsize=(12, 6))
panels = [('Silhouette Score', silhouette_scores), ('WCSS', wcss)]
for position, (metric_name, metric_values) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(cluster_range, metric_values, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} vs Number of Clusters')

plt.tight_layout()
plt.show()

# Cluster count fixed by hand (presumably read off the silhouette / WCSS
# plots -- confirm)
optimal_num_clusters = 8

# Final K-Means run with the chosen number of clusters
kmeans = KMeans(n_clusters=optimal_num_clusters, random_state=42, n_init=40)
kmeans.fit(topic_proportions)
cluster_labels = kmeans.labels_

# Project the topic proportions to 2 dimensions with t-SNE for plotting
tsne = TSNE(n_components=2, random_state=42)
topic_tsne = tsne.fit_transform(topic_proportions)

# One row per document: its 2D coordinates plus the assigned cluster
df_tsne = pd.DataFrame(topic_tsne, columns=['tsne1', 'tsne2']).assign(Cluster=cluster_labels)

# Scatter plot of the embedding, coloured by cluster
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df_tsne,
    x='tsne1',
    y='tsne2',
    hue='Cluster',
    palette='viridis',
    alpha=0.8,
)
plt.title('t-SNE Visualization of Clusters (TF-IDF)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend(title='Cluster')
plt.show()


### Test: 3D t-SNE view of the K-Means clusters

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Embed the topic proportions in 3 dimensions with t-SNE (fixed seed)
tsne = TSNE(n_components=3, random_state=42)
topic_tsne_3d = tsne.fit_transform(topic_proportions)

# One row per document: 3D coordinates plus the current cluster label
df_tsne_3d = pd.DataFrame(
    topic_tsne_3d, columns=['tsne1', 'tsne2', 'tsne3']
).assign(Cluster=cluster_labels)

# Single 3D axes for the scatter plot
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# One scatter call per cluster id so each gets its own colour and legend entry
for cluster in range(optimal_num_clusters):
    cluster_data = df_tsne_3d.loc[df_tsne_3d['Cluster'] == cluster]
    ax.scatter(
        *(cluster_data[axis_name] for axis_name in ('tsne1', 'tsne2', 'tsne3')),
        label=f'Cluster {cluster}',
    )

ax.set_title('3D t-SNE Visualization of Clusters (TF-IDF)')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
ax.legend()

plt.show()


## DBSCAN for clustering

In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN hyper-parameters: neighbourhood radius and minimum neighbourhood size
epsilon = 0.3
min_samples = 5
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples)

# Cluster the per-document topic proportions
dbscan.fit(topic_proportions)

# Labels assigned by DBSCAN; this overwrites the labels of the previous
# clustering held in cluster_labels
cluster_labels = dbscan.labels_

# Embed the topic proportions in 3 dimensions with t-SNE (fixed seed)
tsne = TSNE(n_components=3, random_state=42)
topic_tsne_3d = tsne.fit_transform(topic_proportions)

# One row per document: 3D coordinates plus the DBSCAN label
df_tsne_3d = pd.DataFrame(
    topic_tsne_3d, columns=['tsne1', 'tsne2', 'tsne3']
).assign(Cluster=cluster_labels)

# Single 3D axes for the scatter plot
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# One scatter call per label; the noise label -1 is forced to black, every
# other label takes the next colour of the default cycle
unique_labels = np.unique(cluster_labels)
for label in unique_labels:
    cluster_data = df_tsne_3d[df_tsne_3d['Cluster'] == label]
    point_style = {'c': 'black'} if label == -1 else {}
    ax.scatter(
        cluster_data['tsne1'],
        cluster_data['tsne2'],
        cluster_data['tsne3'],
        label=f'Cluster {label}',
        **point_style,
    )

ax.set_title('3D t-SNE Visualization of Clusters with DBSCAN (TF-IDF)')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
ax.legend()

plt.show()

In [None]:
# Distinct labels present in cluster_labels
unique_labels = np.unique(cluster_labels)

# Every distinct label counts as a cluster except -1, which marks noise
num_clusters = len(unique_labels) - int(-1 in unique_labels)

print("Number of clusters:", num_clusters)


In [None]:
# Range of epsilon values to try
epsilon_values = [10]

# The t-SNE embedding depends only on topic_proportions and a fixed seed, so
# it is computed once here instead of once per epsilon value.
tsne = TSNE(n_components=2, random_state=42)
topic_tsne = tsne.fit_transform(topic_proportions)

# Plotting
plt.figure(figsize=(15, 10))

for i, epsilon in enumerate(epsilon_values, 1):
    print("doing",i)
    # Initialize DBSCAN with the current epsilon value
    dbscan = DBSCAN(eps=epsilon, min_samples=5)
    dbscan.fit(topic_proportions)

    # Get cluster labels
    cluster_labels = dbscan.labels_

    # Pair the shared 2D coordinates with this run's labels
    df_tsne = pd.DataFrame(topic_tsne, columns=['tsne1', 'tsne2'])
    df_tsne['Cluster'] = cluster_labels

    # Plot clusters (the 2x3 grid has room for at most six epsilon values)
    plt.subplot(2, 3, i)
    sns.scatterplot(data=df_tsne, x='tsne1', y='tsne2', hue='Cluster', palette='viridis', alpha=0.8)
    plt.title(f'DBSCAN Clustering (epsilon={epsilon})')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.legend(title='Cluster')

plt.tight_layout()
plt.show()


In [None]:
# Range of min_samples values to try
min_samples_values = [10,15,20,25]

# The t-SNE embedding depends only on topic_proportions and a fixed seed, so
# it is computed once here instead of once per min_samples value.
tsne = TSNE(n_components=2, random_state=42)
topic_tsne = tsne.fit_transform(topic_proportions)

# Plotting
plt.figure(figsize=(15, 10))

for i, min_samples in enumerate(min_samples_values, 1):
    # Initialize DBSCAN with the specified min_samples value and epsilon=0.1
    dbscan = DBSCAN(eps=0.1, min_samples=min_samples)
    dbscan.fit(topic_proportions)

    # Get cluster labels
    cluster_labels = dbscan.labels_

    # Pair the shared 2D coordinates with this run's labels
    df_tsne = pd.DataFrame(topic_tsne, columns=['tsne1', 'tsne2'])
    df_tsne['Cluster'] = cluster_labels

    # Plot clusters (the 2x2 grid has room for at most four values)
    plt.subplot(2, 2, i)
    sns.scatterplot(data=df_tsne, x='tsne1', y='tsne2', hue='Cluster', palette='viridis', alpha=0.8)
    plt.title(f'DBSCAN Clustering (epsilon=0.1, min_samples={min_samples})')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.legend(title='Cluster')

plt.tight_layout()
plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Range of min_samples values to try
min_samples_values = [1,2,3,4,5,6,7,8,9,10]

# Size the subplot grid from the number of values. A fixed 2x2 grid has only
# four slots, so plt.subplot raises ValueError as soon as the fifth value is
# plotted. At least two rows are kept, so four or fewer values still give
# the 2x2 layout.
n_cols = 2
n_rows = max(2, -(-len(min_samples_values) // n_cols))  # ceiling division

# The 3D t-SNE embedding depends only on topic_proportions and a fixed seed,
# so it is computed once here instead of once per min_samples value.
tsne = TSNE(n_components=3, random_state=42)
topic_tsne_3d = tsne.fit_transform(topic_proportions)

# Plotting: 5 inches of height per row, i.e. 15x10 for a two-row grid
plt.figure(figsize=(15, 5 * n_rows))

for i, min_samples in enumerate(min_samples_values, 1):
    print(i)
    # Initialize DBSCAN with the specified min_samples value and epsilon=0.1
    dbscan = DBSCAN(eps=0.1, min_samples=min_samples)
    dbscan.fit(topic_proportions)

    # Get cluster labels
    cluster_labels = dbscan.labels_

    # Pair the shared 3D coordinates with this run's labels
    df_tsne_3d = pd.DataFrame(topic_tsne_3d, columns=['tsne1', 'tsne2', 'tsne3'])
    df_tsne_3d['Cluster'] = cluster_labels

    # Plot clusters: one scatter call per label, noise (-1) forced to black
    ax = plt.subplot(n_rows, n_cols, i, projection='3d')
    for cluster in np.unique(cluster_labels):
        members = df_tsne_3d[df_tsne_3d['Cluster'] == cluster]
        noise_style = {'c': 'black'} if cluster == -1 else {}
        ax.scatter(members['tsne1'], members['tsne2'], members['tsne3'],
                   label=f'Cluster {cluster}', alpha=0.8, **noise_style)

    ax.set_title(f'DBSCAN Clustering (epsilon=0.1, min_samples={min_samples})')
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')
    ax.set_zlabel('t-SNE Component 3')
    ax.legend()

plt.tight_layout()
plt.show()
