In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import cohen_kappa_score, silhouette_score, confusion_matrix, adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, calinski_harabasz_score, davies_bouldin_score
from itertools import cycle
from scipy.sparse import issparse

In [None]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
from collections import Counter

In [None]:
# Run this ONLY IF Yellowbrick shows "findfont" errors

import matplotlib.font_manager
# Switch matplotlib's global default font family to 'DejaVu Sans'.
plt.rcParams['font.family'] = 'DejaVu Sans'

In [None]:
def combine_features(*arrays):
    """Join several feature arrays side by side.

    Args:
        *arrays: Arrays to join, passed positionally.

    Returns:
        The result of ``np.hstack`` applied to the arrays in the order given.
    """
    stacked = np.hstack(arrays)
    return stacked


In [None]:
DPPpath = '/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/features_'


def load_feature_set(path):
    """Load the pickled feature frames and build the feature matrices.

    Args:
        path: Common prefix of the feature pickle files. The suffixes
            'tfidf.pkl', 'bow.pkl', 'lda.pkl' and 'word2vec.pkl' are
            appended to it (e.g. pass ``DPPpath``).

    Returns:
        tuple: ``(labels, numeric_labels, feature_sets)`` where
            * ``labels`` is the 'label' column of the TF-IDF frame,
            * ``numeric_labels`` is the values of its 'label_num' column,
            * ``feature_sets`` maps a feature-set name to its matrix (each
              frame with the 'label' and 'label_num' columns dropped, plus
              two horizontally stacked combinations with LDA).
    """
    label_columns = ['label', 'label_num']

    # NOTE(review): unpickling can execute arbitrary code; only point `path`
    # at files you produced yourself.
    # The prefix comes from the `path` argument rather than the module-level
    # DPPpath, so the function honours whatever location the caller passes.
    df_tfidf = pd.read_pickle(path + 'tfidf.pkl')
    df_bow = pd.read_pickle(path + 'bow.pkl')
    df_lda = pd.read_pickle(path + 'lda.pkl')
    df_wv = pd.read_pickle(path + 'word2vec.pkl')

    # Labels are read from the TF-IDF frame only.
    # NOTE(review): assumes all four frames share the same row order - confirm.
    labels = df_tfidf['label']
    numeric_labels = df_tfidf['label_num'].values

    # Feature matrices (drop label columns)
    X_tfidf = df_tfidf.drop(label_columns, axis=1).values
    X_bow = df_bow.drop(label_columns, axis=1).values
    X_lda = df_lda.drop(label_columns, axis=1).values
    X_wv = df_wv.drop(label_columns, axis=1).values

    feature_sets = {
        'TF-IDF': X_tfidf,
        'BoW': X_bow,
        'LDA': X_lda,
        'Word2Vec': X_wv,
        'TF-IDF + LDA': combine_features(X_tfidf, X_lda),
        'BoW + LDA': combine_features(X_bow, X_lda)
    }

    return labels, numeric_labels, feature_sets

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import joblib
from sklearn.metrics.pairwise import cosine_similarity


# Load the labeled dataset from the Excel file on the mounted Drive
df = pd.read_excel('/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx')

# Extract the 'Text' column as a plain Python list: the documents (texts) you want to use for LDA
my_texts = df['Text'].astype(str).tolist()  # make sure all texts are strings


In [None]:

from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
def align_cluster_labels(y_true, y_pred):
    """
    Permute y_pred's labels to best match y_true via Hungarian algorithm.

    A contingency table (true label x predicted cluster) is built and the
    assignment that maximises the total overlap is used to rename clusters
    to true label ids.

    Args:
        y_true: 1-D sequence of ground-truth labels.
        y_pred: 1-D sequence of predicted cluster ids, same length as y_true.

    Returns:
        np.ndarray with one relabelled entry per element of y_pred.
        Clusters that could not be matched to a true label (more clusters
        than classes) receive fresh integer ids above every existing id when
        both label sets are integer-typed, so they can never collide with a
        true label or with another cluster. For non-integer labels such
        clusters keep their original id.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    true_ids, true_pos = np.unique(y_true, return_inverse=True)
    pred_ids, pred_pos = np.unique(y_pred, return_inverse=True)

    # overlap[i, j] = number of samples with true label i and cluster j.
    overlap = np.zeros((true_ids.size, pred_ids.size), dtype=int)
    np.add.at(overlap, (true_pos, pred_pos), 1)

    # linear_sum_assignment minimises cost, so negate to maximise overlap.
    row_ind, col_ind = linear_sum_assignment(-overlap)
    mapping = {pred_ids[c]: true_ids[r] for r, c in zip(row_ind, col_ind)}

    # Unmatched clusters previously kept their raw id, which could equal a
    # true label id and silently merge two different clusters.
    leftover = [p for p in pred_ids if p not in mapping]
    both_integer = (np.issubdtype(true_ids.dtype, np.integer)
                    and np.issubdtype(pred_ids.dtype, np.integer))
    if leftover and both_integer:
        fresh_id = int(max(true_ids.max(), pred_ids.max())) + 1
        for p in leftover:
            mapping[p] = fresh_id
            fresh_id += 1

    return np.array([mapping.get(x, x) for x in y_pred])

In [None]:
def plot_clusters(X, labels, method_name):
    """Show a 2-D t-SNE scatter plot of X, coloured by the given labels.

    Args:
        X: Feature matrix; a scipy sparse matrix is converted to a dense
            array before embedding.
        labels: Per-sample values passed as the scatter colour argument.
        method_name: Name used as the prefix of the plot title.
    """
    if issparse(X):
        dense_X = X.toarray()
    else:
        dense_X = X

    embedder = TSNE(n_components=2, random_state=42)
    embedding = embedder.fit_transform(dense_X)
    xs = embedding[:, 0]
    ys = embedding[:, 1]

    plt.figure(figsize=(8, 6))
    points = plt.scatter(xs, ys, c=labels, cmap='viridis')
    plt.title(f'{method_name} Clustering (t-SNE visualization)')
    plt.colorbar(points)
    plt.show()


In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def get_top_words_word2vec(word_vectors, cluster_centers, top_n=10):
    """
    Find top words closest to each cluster center in Word2Vec space

    Args:
        word_vectors: KeyedVectors (from gensim) or dict {word: vector}
        cluster_centers: Array of cluster centroids from clustering
        top_n: Maximum number of top words to return per cluster. It is
            capped at the vocabulary size, because a neighbour query cannot
            return more neighbours than there are fitted vectors.

    Returns:
        List of lists containing top words for each cluster (nearest first,
        by cosine distance). Each inner list is empty when the vocabulary is
        empty or top_n is not positive.
    """
    # Prepare word vectors and vocabulary
    if hasattr(word_vectors, 'key_to_index'):  # Gensim 4.0+ KeyedVectors
        words = list(word_vectors.key_to_index.keys())
        vectors = word_vectors.vectors
    else:  # Plain mapping {word: vector}
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

    # Asking for more neighbours than fitted samples makes kneighbors raise.
    n_neighbors = min(top_n, len(words))
    if n_neighbors <= 0:
        return [[] for _ in range(len(cluster_centers))]

    # Find nearest words to each centroid
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(vectors)
    top_indices = nbrs.kneighbors(cluster_centers, return_distance=False)

    return [[words[idx] for idx in cluster_top] for cluster_top in top_indices]

In [None]:
def em_cohen_single(X, actual_y, k = 5, raw_path = "", reduced_dim = None, plotting=False):
    """Cluster X with a Gaussian mixture (EM) and score the clustering.

    Args:
        X: Feature matrix (dense array or scipy sparse matrix).
        actual_y: Ground-truth labels, one per row of X.
        k: Number of mixture components (clusters).
        raw_path: Path to a pickled DataFrame with a 'text' column holding
            the raw documents.
            NOTE(review): assumes row i of X corresponds to document i of
            that frame - confirm against the feature-building step.
        reduced_dim: If truthy, X is projected onto this many PCA components
            before clustering.
        plotting: If True, show a t-SNE plot of the clusters and a stacked
            bar chart of actual labels versus aligned cluster labels.

    Returns:
        tuple: ``(cluster_labels, results)`` where ``cluster_labels`` are the
        raw (un-aligned) mixture assignments and ``results`` is a dict of
        metric name -> value (also printed).
    """
    doc_raw = pd.read_pickle(raw_path)
    documents = doc_raw['text'].astype(str).tolist()
    # Tokenise once; the token lists are reused for the top words, the
    # coherence texts and the gensim dictionary.
    tokenized_docs = [doc.split() for doc in documents]

    if reduced_dim:
        pca = PCA(n_components=reduced_dim, random_state=42)
        X = pca.fit_transform(X)

    X_dense = X.toarray() if issparse(X) else X

    # A single fit_predict is enough: the former extra model.fit(X) trained
    # the same model a second time (and on the possibly sparse X). With a
    # fixed random_state the resulting labels are unchanged.
    model = GaussianMixture(n_components=k,
                              random_state=42)
    cluster_labels = model.fit_predict(X_dense)

    aligned_labels = align_cluster_labels(actual_y, cluster_labels)

    # Evaluation metrics. Internal indices use the same dense matrix the
    # model was fitted on.
    results = {}
    results['kappa'] = cohen_kappa_score(actual_y, aligned_labels)
    results['silhouette'] = silhouette_score(X_dense, aligned_labels)
    results['adjusted_rand_score'] = adjusted_rand_score(actual_y, aligned_labels)
    results['normalized_mutual_info_score'] = normalized_mutual_info_score(actual_y, aligned_labels)
    results['homogeneity_score'] = homogeneity_score(actual_y, aligned_labels)
    results['completeness_score'] = completeness_score(actual_y, aligned_labels)
    results['v_measure_score'] = v_measure_score(actual_y, aligned_labels)
    results['calinski_harabasz_score'] = calinski_harabasz_score(X_dense, aligned_labels)
    results['davies_bouldin_score'] = davies_bouldin_score(X_dense, aligned_labels)

    # Ten most frequent tokens among the documents of each raw cluster.
    top_words = []
    for cluster in range(k):
        member_indices = np.where(cluster_labels == cluster)[0]
        word_counts = Counter()
        for idx in member_indices:
            word_counts.update(tokenized_docs[idx])
        top_words.append([word for word, _ in word_counts.most_common(10)])

    coherence_model = CoherenceModel(
        topics=top_words,
        texts=tokenized_docs,
        dictionary=Dictionary(tokenized_docs),
        coherence='c_v'
    )
    results['coherence'] = coherence_model.get_coherence()

    if plotting:
        plot_clusters(X_dense, cluster_labels, 'EM')
        label_frame = pd.DataFrame({'Actual': actual_y, 'Cluster': aligned_labels})
        counts = label_frame.groupby(['Actual', 'Cluster']).size().unstack()
        # Plot
        counts.plot(kind='bar', stacked=True, figsize=(8, 4))
        plt.title('Distribution of Actual Labels Across Clusters')
        plt.ylabel('Count')
        plt.xlabel('Actual Label')
        plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1))
        plt.tight_layout()
        plt.show()

    print(results)
    return cluster_labels, results


In [None]:
# Load the label columns and the dict of feature matrices from the feature pickles under DPPpath.
labels, numeric_labels, feat = load_feature_set(DPPpath)

In [None]:
# Single EM run on the TF-IDF features: 5 components, PCA down to 150 dimensions.
predicted_labels, results = em_cohen_single(
    X=feat['TF-IDF'],
    actual_y=numeric_labels,
    k=5,
    raw_path='/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/raw_labeled.pkl',
    reduced_dim=150,
)

{'kappa': 0.5725, 'silhouette': 0.0491038342134897, 'adjusted_rand_score': 0.49039614972403117, 'normalized_mutual_info_score': 0.6152589579006824, 'homogeneity_score': 0.5887919436993898, 'completeness_score': 0.644217424299752, 'v_measure_score': 0.6152589579006824, 'calinski_harabasz_score': 33.6603388563682, 'davies_bouldin_score': 3.9171003321829154, 'coherence': 0.46687523716819224}


In [None]:
from itertools import product

def cohen_grid_search(X, actual_y,
                      k_values=[3, 5, 7],
                      reduced_dims=[None, 50, 100],
                      vect_path="", raw_path="",
                      plotting=False,
                      metric_name = 'adjusted_rand_score',
                      higher_is_better=None):
    """
    Grid search wrapper for em_cohen_single. Evaluates combinations of k and reduced_dim.

    Args:
        X: Feature matrix forwarded to em_cohen_single.
        actual_y: Ground-truth labels forwarded to em_cohen_single.
        k_values: Candidate numbers of mixture components.
        reduced_dims: Candidate PCA dimensionalities (None disables PCA).
        vect_path: Not used by this function; accepted so existing callers
            that pass it keep working.
        raw_path: Path of the raw-documents pickle, forwarded to
            em_cohen_single.
        plotting: Forwarded to em_cohen_single.
        metric_name: Key of the em_cohen_single results dict used to rank
            configurations.
        higher_is_better: Whether a larger metric value is better. When None
            (default) it is inferred: 'davies_bouldin_score' is minimised,
            every other metric is maximised.

    Returns:
        tuple: ``(best_params, all_results)``. ``best_params`` holds 'k',
        'reduced_dim' and the 'results' dict of the best configuration (empty
        dict if no configuration produced a comparable score);
        ``all_results`` is a list with one flat dict per configuration tried.
    """
    if higher_is_better is None:
        # Davies-Bouldin is the only metric reported by em_cohen_single for
        # which smaller values indicate a better clustering.
        higher_is_better = metric_name != 'davies_bouldin_score'

    best_score = None
    best_params = {}
    all_results = []

    for k, dim in product(k_values, reduced_dims):
        print(f"\nTesting k={k}, reduced_dim={dim}")
        _, results = em_cohen_single(X, actual_y, k=k, reduced_dim=dim,
                                     raw_path=raw_path,
                                     plotting=plotting)

        metric = results[metric_name]
        all_results.append({'k': k, 'reduced_dim': dim, **results})

        # A NaN score cannot be ranked; keep it in all_results but never
        # select it as the best configuration.
        if np.isnan(metric):
            continue

        if best_score is None:
            is_better = True
        elif higher_is_better:
            is_better = metric > best_score
        else:
            is_better = metric < best_score

        if is_better:
            best_score = metric
            best_params = {'k': k, 'reduced_dim': dim, 'results': results}

    print("\nBest Parameters:")
    print(best_params)
    return best_params, all_results


In [None]:
# Grid search over k and the PCA dimensionality on the TF-IDF features.
# metric_name is not passed, so cohen_grid_search ranks by its default metric.
rawpath = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/raw_labeled.pkl'
vecpath = DPPpath + 'vectorizer_tfidf.pkl'
best_config, all_trials = cohen_grid_search(
    X=feat['TF-IDF'],
    actual_y=numeric_labels,
    k_values=[4,5,6,7],
    reduced_dims=[50, 100, 150, 200],
    vect_path=vecpath,
    raw_path=rawpath,
    plotting=False
)


Testing k=4, reduced_dim=50
{'kappa': 0.64625, 'silhouette': 0.09035967752888066, 'adjusted_rand_score': 0.5909294156304717, 'normalized_mutual_info_score': 0.6529049714578775, 'homogeneity_score': 0.5973749556723968, 'completeness_score': 0.7198167772210565, 'v_measure_score': 0.6529049714578776, 'calinski_harabasz_score': 69.94123463363604, 'davies_bouldin_score': 2.9925866902553873, 'coherence': 0.45026963290078126}

Testing k=4, reduced_dim=100
{'kappa': 0.65875, 'silhouette': 0.06784786833244377, 'adjusted_rand_score': 0.5567235875882797, 'normalized_mutual_info_score': 0.6480688386997159, 'homogeneity_score': 0.5929749209424997, 'completeness_score': 0.7144490437802022, 'v_measure_score': 0.648068838699716, 'calinski_harabasz_score': 52.24038899425678, 'davies_bouldin_score': 3.5453223449616496, 'coherence': 0.45885083173118413}

Testing k=4, reduced_dim=150
{'kappa': 0.66125, 'silhouette': 0.052522380916958586, 'adjusted_rand_score': 0.5528026649331749, 'normalized_mutual_info_

In [None]:
# Display the Word2Vec feature matrix as the cell output.
feat['Word2Vec']

array([[-0.26928735, -0.1195639 , -0.1158248 , ...,  0.02564578,
        -0.27594167, -0.01948083],
       [-0.29564297, -0.15835945, -0.1122013 , ...,  0.03395563,
        -0.29371753,  0.00676113],
       [-0.2708385 , -0.09308651, -0.12021372, ...,  0.02736022,
        -0.24739093, -0.02630153],
       ...,
       [-0.2529159 , -0.14138374, -0.09418692, ...,  0.05158507,
        -0.26257536,  0.00562867],
       [-0.19395056, -0.05863532, -0.11876706, ...,  0.00516182,
        -0.23402001, -0.0468999 ],
       [-0.17323664, -0.07238069, -0.1126738 , ..., -0.00946106,
        -0.2348315 , -0.03629252]], dtype=float32)