In [None]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

import torch

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score

from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel

In [None]:
# Read the cleaned Twitch messages CSV (path relative to this notebook) into a DataFrame.
dataset_path = "../../data/processed/cleaned_twitch_messages.csv"
dataset = pd.read_csv(dataset_path)

In [None]:
# Keep only the text column. Rows whose message is missing (for example an
# empty CSV field, which pandas reads back as NaN) are dropped here: NaN is not
# a string, so it cannot be fed to the text vectorizer/encoders used later.
# The index is rebuilt so that rows stay numbered 0..n-1 after the drop.
df = dataset[["message"]].dropna().reset_index(drop=True)

In [None]:
# Show the message DataFrame as this cell's output (sanity check of the loaded data).
df

# Clustering
https://medium.com/@danielafrimi/text-clustering-using-nlp-techniques-c2e6b08b6e95

## TF-IDF vectorization

In [None]:
print("Embedding with method: Tf-Idf")

# TF-IDF with sublinear term-frequency scaling; min_df=5 and max_df=0.95 are the
# document-frequency cut-offs applied when building the vocabulary.
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)

# Learn the vocabulary and vectorize the messages, then densify the sparse
# result so X is a plain 2-D array.
tfidf_sparse = vectorizer.fit_transform(df['message'])
X = tfidf_sparse.toarray()

## Sentence transformer

In [None]:
print("Embedding with method: Sentence transformer")

st = time.time()

# A dedicated name keeps this encoder distinct from the BERT `model` created in
# a later cell (and read by get_cls_sentence), so re-running cells out of order
# cannot swap one model for the other.
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode every message in one batched call instead of one forward pass per
# row; with a list input the encoder returns a 2-D array, one row per message.
X_transformers = sentence_model.encode(df['message'].tolist(), convert_to_numpy=True)

# Keep the per-row column (one 1-D vector per message) so the DataFrame, and
# the CSV export that follows, still carry the embeddings.
df['encode_transforemers'] = list(X_transformers)

et = time.time()

print("Elapsed time: {:.2f} seconds".format(et - st))

## BERT - [CLS] token for sentence context

In [None]:
# Load the tokenizer and the encoder from the same pre-trained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def get_cls_sentence(sentence):
    """Return the BERT [CLS] embedding of a single sentence.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    sentence : str
        Text to embed.

    Returns
    -------
    torch.Tensor
        1-D tensor: the vector at token position 0 of the first model output,
        flattened.
    """
    # Ask for truncation explicitly. Passing max_length on its own leaves the
    # tokenizer to fall back on an implicit strategy (and warn about it);
    # truncation=True guarantees inputs never exceed 512 tokens.
    token_ids = tokenizer.encode(
        sentence,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
    )

    # The model expects a batch dimension, hence the extra list level.
    input_ids = torch.tensor([token_ids])

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(input_ids)

    # First output = per-token hidden states; position 0 is the [CLS] token.
    cls_embedding = outputs[0][:, 0, :]

    return cls_embedding.flatten()

In [None]:
print("Embedding with method: BERT-[CLS]")

# Time the row-by-row BERT pass over every message.
st = time.time()
cls_vectors = df['message'].apply(get_cls_sentence)
df['cls_bert'] = cls_vectors
et = time.time()

print("Elapsed time: {:.2f} seconds".format(et - st))

# Stack the per-message vectors into a single 2-D matrix, one row per message.
X_cls_bert = np.vstack(cls_vectors.tolist())

## Saving the results

In [None]:
# Write the DataFrame to disk without the index column.
# NOTE(review): the embedding columns hold array objects, which a CSV can only
# store as their text representation; confirm downstream readers can parse them.
embeddings_path = "../../data/processed/twitch_messages_with_embeddings.csv"
df.to_csv(embeddings_path, index=False)

## Clustering and visualization

In [None]:
# NOTE: evaluation is currently disabled. It scores the cluster assignments
# against ground-truth labels in df["target"], but df is built from the
# "message" column only, so no such labels exist in this notebook.
#def eval_cluster(embedding, kmeans):
#    y_pred = kmeans.fit_predict(embedding)
#    
#    # Evaluate the performance using ARI, NMI, and FMI
#    ari = adjusted_rand_score(df["target"], y_pred)
#    nmi = normalized_mutual_info_score(df["target"], y_pred)
#    fmi = fowlkes_mallows_score(df["target"], y_pred)
#
#    # Print Metrics scores
#    print("Adjusted Rand Index (ARI): {:.3f}".format(ari))
#    print("Normalized Mutual Information (NMI): {:.3f}".format(nmi))
#    print("Fowlkes-Mallows Index (FMI): {:.3f}".format(fmi))

In [None]:
def dimension_reduction(embedding, method):
    """Project ``embedding`` to two PCA components and store them in ``df``.

    The components are written to the module-level DataFrame ``df`` as the
    columns ``x0_<method>`` and ``x1_<method>``. Nothing is returned.

    Parameters
    ----------
    embedding : array-like
        Matrix with one row per message.
    method : str
        Label used as the suffix of the two new column names.
    """
    # Fixed random_state keeps the projection reproducible between runs.
    projected = PCA(n_components=2, random_state=42).fit_transform(embedding)

    # One DataFrame column per principal component, first component first.
    for axis in (0, 1):
        df[f'x{axis}_{method}'] = projected[:, axis]

In [None]:
def plot_pca(x0_name, x1_name, cluster_name, method):
    """Draw a scatter plot of two ``df`` columns coloured by a cluster column.

    Reads the module-level DataFrame ``df`` and shows the figure immediately.

    Parameters
    ----------
    x0_name, x1_name : str
        Names of the ``df`` columns used for the horizontal and vertical axes.
    cluster_name : str
        Name of the ``df`` column used to colour the points.
    method : str
        Label inserted into the figure title.
    """
    _, ax = plt.subplots(figsize=(12, 7))

    # Title and axis labels are set before plotting so they are the ones shown.
    ax.set_title(f"KMeans clustering with {method}", fontdict={"fontsize": 18})
    ax.set_xlabel("X0", fontdict={"fontsize": 16})
    ax.set_ylabel("X1", fontdict={"fontsize": 16})

    sns.scatterplot(
        data=df,
        x=x0_name,
        y=x1_name,
        hue=cluster_name,
        palette="viridis",
        ax=ax,
    )
    plt.show()

In [None]:
print("Clustering and visualization")

# (embedding matrix, label) pairs; the label ends up in column names and plot titles.
embeddings_by_method = [
    (X, 'tfidf'),
    (X_transformers, 'transformers'),
    (X_cls_bert, 'Bert-CLS'),
]

for embedding, method in embeddings_by_method:
    # Two-cluster KMeans with a fixed seed; fit_predict fits the model and
    # returns the label assigned to each row.
    kmeans = KMeans(n_clusters=2, random_state=79872435)
    clusters = kmeans.fit_predict(embedding)

    # Store the cluster id of every message under a per-method column.
    clusters_result_name = f'cluster_{method}'
    df[clusters_result_name] = clusters

    #eval_cluster(embedding, kmeans)

    # Add the 2-D PCA coordinates to df, then plot them coloured by cluster.
    dimension_reduction(embedding, method)
    plot_pca(f'x0_{method}', f'x1_{method}', cluster_name=clusters_result_name, method=method)