In [None]:
# Mount Google Drive at /content/drive; the data paths used by later cells
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn.metrics import cohen_kappa_score, silhouette_score, confusion_matrix, adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, calinski_harabasz_score, davies_bouldin_score

from itertools import cycle
import joblib
from sklearn.decomposition import PCA

from collections import Counter
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [None]:
# Run this ONLY IF Yellowbrick shows "findfont" errors
# Optional workaround cell: it switches matplotlib's global default font
# family to 'DejaVu Sans' for every figure drawn afterwards.

import matplotlib.font_manager
plt.rcParams['font.family'] = 'DejaVu Sans'

In [None]:
def combine_features(*arrays):
    """Concatenate the given feature arrays horizontally.

    All positional arguments are handed to ``np.hstack`` as one sequence, so
    2-D inputs are joined column-wise and must agree on their row count.
    """
    merged = np.hstack(arrays)
    return merged


In [None]:
DPPpath = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/'
def load_feature_set(path):
    """Load the pickled feature tables stored under ``path``.

    Parameters
    ----------
    path : str
        Directory holding ``features_tfidf.pkl``, ``features_bow.pkl``,
        ``features_lda.pkl`` and ``features_word2vec.pkl``. A trailing
        separator is optional.

    Returns
    -------
    labels : pandas.Series
        The ``label`` column of the TF-IDF table.
    numeric_labels : numpy.ndarray
        The ``label_num`` column of the TF-IDF table.
    feature_sets : dict
        Maps a feature-set name to its matrix (label columns removed). The
        two combined entries are horizontal concatenations built with
        ``combine_features``.
    """
    import os

    def _read(filename):
        # The directory passed by the caller is honoured here; previously the
        # argument was ignored in favour of a module-level global.
        # NOTE: unpickling can execute arbitrary code - only load files you trust.
        return pd.read_pickle(os.path.join(path, filename))

    def _to_matrix(frame):
        # Everything except the two label columns is treated as a feature.
        return frame.drop(['label', 'label_num'], axis=1).values

    df_tfidf = _read('features_tfidf.pkl')
    df_bow = _read('features_bow.pkl')
    df_lda = _read('features_lda.pkl')
    df_wv = _read('features_word2vec.pkl')

    # Labels are taken from the TF-IDF table only.
    # NOTE(review): this presumes all four tables share the same row order - confirm.
    labels = df_tfidf['label']
    numeric_labels = df_tfidf['label_num'].values

    X_tfidf = _to_matrix(df_tfidf)
    X_bow = _to_matrix(df_bow)
    X_lda = _to_matrix(df_lda)
    X_wv = _to_matrix(df_wv)

    feature_sets = {
        'TF-IDF': X_tfidf,
        'BoW': X_bow,
        'LDA': X_lda,
        'Word2Vec': X_wv,
        'TF-IDF + LDA': combine_features(X_tfidf, X_lda),
        'BoW + LDA': combine_features(X_bow, X_lda)
    }

    return labels, numeric_labels, feature_sets

In [None]:

from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
def align_cluster_labels(true_labels, pred_labels):
    """Relabel predicted clusters so they best agree with the true labels.

    A square contingency table is built over the sorted union of every label
    value seen in either input, and the one-to-one assignment that maximises
    the total overlap is found with ``linear_sum_assignment``. Each predicted
    label is then replaced by the label *value* it was matched to.

    The assignment works on positions inside the sorted label union, so those
    positions are translated back to label values before relabelling. This
    keeps the result correct when labels are not exactly ``0..n-1`` (for
    example a cluster id that never occurs, or classes such as 10/20/30).

    Parameters
    ----------
    true_labels : array-like
        Ground-truth label per sample.
    pred_labels : array-like
        Cluster id per sample; must have the same length as ``true_labels``.

    Returns
    -------
    numpy.ndarray
        One aligned label per sample, in the order of ``pred_labels``.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(pred_labels).ravel()
    n_samples = true_arr.shape[0]
    if n_samples != pred_arr.shape[0]:
        raise ValueError(
            f"true_labels and pred_labels must have the same length "
            f"({n_samples} != {pred_arr.shape[0]})"
        )
    if n_samples == 0:
        return pred_arr.copy()

    # Encode both label vectors as positions in the sorted union of label values.
    label_values, codes = np.unique(
        np.concatenate([true_arr, pred_arr]), return_inverse=True
    )
    codes = codes.ravel()
    true_idx = codes[:n_samples]
    pred_idx = codes[n_samples:]

    # contingency[i, j] = number of samples with true label i and predicted label j.
    n_labels = label_values.shape[0]
    contingency = np.zeros((n_labels, n_labels), dtype=np.int64)
    np.add.at(contingency, (true_idx, pred_idx), 1)

    # Negate so that the cost-minimising solver maximises the overlap.
    row_ind, col_ind = linear_sum_assignment(-contingency)

    # The table is square, so every predicted position receives a target value.
    mapping = np.empty(n_labels, dtype=label_values.dtype)
    mapping[col_ind] = label_values[row_ind]
    return mapping[pred_idx]

In [None]:
from sklearn.manifold import TSNE
from scipy.sparse import issparse

def plot_clusters(X, labels, method_name):
    """Draw a 2-D t-SNE scatter plot of ``X`` coloured by ``labels``.

    Sparse input is converted to a dense array before the embedding is
    computed. ``method_name`` is only used in the figure title.
    """
    dense_X = X.toarray() if issparse(X) else X
    embedding = TSNE(n_components=2, random_state=42).fit_transform(dense_X)
    x_coords = embedding[:, 0]
    y_coords = embedding[:, 1]

    plt.figure(figsize=(8, 6))
    points = plt.scatter(x_coords, y_coords, c=labels, cmap='viridis')
    plt.title(f'{method_name} Clustering (t-SNE visualization)')
    plt.colorbar(points)
    plt.show()

In [None]:
def _top_words_per_cluster(tokenized_docs, cluster_labels, k, topn=10):
    """Return the ``topn`` most frequent tokens of each non-empty cluster in ``range(k)``."""
    top_words = []
    for cluster in range(k):
        cluster_indices = np.where(cluster_labels == cluster)[0]
        word_counts = Counter()
        for idx in cluster_indices:
            word_counts.update(tokenized_docs[idx])
        # A cluster without any tokens would add an empty topic, so it is skipped.
        if word_counts:
            top_words.append([word for word, _ in word_counts.most_common(topn)])
    return top_words


def spectral_clustering(X, actual_y, k=5, vect_path="", raw_path="", plotting=False, reduced_dim=None, raw_data=None):
    """Cluster ``X`` with spectral clustering and score the result against ``actual_y``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    actual_y : array-like
        Ground-truth label per sample.
    k : int
        Number of clusters.
    vect_path : str
        Not used by this function; kept so existing calls keep working.
    raw_path : str
        Optional path to a pickled table with a ``text`` column. When given
        (and ``raw_data`` is not), a c_v topic-coherence score is added.
    plotting : bool
        When true, show a t-SNE scatter plot and a stacked bar chart of
        actual labels versus aligned clusters.
    reduced_dim : int or None
        When truthy, ``X`` is first projected to this many PCA components.
    raw_data : iterable of str, optional
        Raw documents, one per row of ``X``. Takes precedence over
        ``raw_path`` as the text source for the coherence score.

    Returns
    -------
    aligned_labels : numpy.ndarray
        Cluster labels after alignment with ``actual_y``.
    results : dict
        Metric name -> value; includes ``'coherence'`` only when raw text
        was supplied.
    """
    if reduced_dim:
        X = PCA(n_components=reduced_dim, random_state=42).fit_transform(X)

    model = SpectralClustering(n_clusters=k, affinity='nearest_neighbors', random_state=42)
    cluster_labels = model.fit_predict(X)

    aligned_labels = align_cluster_labels(actual_y, cluster_labels)

    results = {
        'kappa': cohen_kappa_score(actual_y, aligned_labels),
        'silhouette': silhouette_score(X, aligned_labels),
        'adjusted_rand_score': adjusted_rand_score(actual_y, aligned_labels),
        'normalized_mutual_info_score': normalized_mutual_info_score(actual_y, aligned_labels),
        'homogeneity_score': homogeneity_score(actual_y, aligned_labels),
        'completeness_score': completeness_score(actual_y, aligned_labels),
        'v_measure_score': v_measure_score(actual_y, aligned_labels),
        'calinski_harabasz_score': calinski_harabasz_score(X, aligned_labels),
        'davies_bouldin_score': davies_bouldin_score(X, aligned_labels),
    }

    # Pick the text source for the coherence score, if any.
    if raw_data is not None:
        documents = [str(doc) for doc in raw_data]
    elif raw_path != "":
        documents = pd.read_pickle(raw_path)['text'].astype(str).tolist()
    else:
        documents = None

    if documents is not None:
        # Tokenise once; the same token lists feed the topics, texts and dictionary.
        tokenized_docs = [doc.split() for doc in documents]
        # Topics are built from the raw cluster ids, not the aligned labels.
        top_words = _top_words_per_cluster(tokenized_docs, cluster_labels, k)

        coherence_model = CoherenceModel(
            topics=top_words,
            texts=tokenized_docs,
            dictionary=Dictionary(tokenized_docs),
            coherence='c_v'
        )
        # Evaluate once and reuse the value for both the result and the log line.
        coherence = coherence_model.get_coherence()
        results['coherence'] = coherence
        print(f"Coherence Score: {coherence:.3f}")

    if plotting:
        plot_clusters(X, aligned_labels, "Spectral")

        # --- Plot Cluster Distribution (with aligned labels) ---
        df = pd.DataFrame({'Actual': actual_y, 'Cluster': aligned_labels})
        counts = df.groupby(['Actual', 'Cluster']).size().unstack()

        counts.plot(kind='bar', stacked=True, figsize=(8, 4))
        plt.title('Distribution of Actual Labels Across Clusters (Aligned)')
        plt.ylabel('Count')
        plt.xlabel('Actual Label')
        plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1))
        plt.tight_layout()
        plt.show()

    return aligned_labels, results

In [None]:
# Paths
DPPpath = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/'
vecpath = DPPpath + 'vectorizer_tfidf.pkl'
excel_path = '/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx'

# Load the labelled documents from the Excel sheet
doc_raw = pd.read_excel(excel_path)

# Raw text and integer labels, both as plain Python lists
texts = doc_raw['Text'].astype(str).tolist()
numeric_labels = doc_raw['label_num'].astype(int).tolist()

# Load the precomputed feature matrices
# NOTE(review): ground-truth labels come from the Excel sheet while the features come
# from the pickles; this presumes both share the same row order - TODO confirm
# (numeric_labels_from_features is available for a cross-check).
labels, numeric_labels_from_features, feat = load_feature_set(DPPpath)

# Cluster the TF-IDF features (k=5), passing the Excel texts instead of raw_labeled.pkl.
# NOTE(review): `raw_data` must be a keyword that spectral_clustering accepts - check its
# signature before running. The function returns a 2-tuple (aligned labels, metrics dict),
# so `labelset` holds both.
labelset = spectral_clustering(feat['TF-IDF'], numeric_labels, 5, vect_path=vecpath, raw_data=texts)


In [None]:
from itertools import product

def cohen_grid_search(X, actual_y,
                      k_values=(4, 5, 6, 7),
                      reduced_dims=(50, 100, 200),
                      vect_path="", raw_path="",
                      plotting=False):
    """Grid search over ``k`` and ``reduced_dim`` for ``spectral_clustering``.

    Every combination of ``k_values`` x ``reduced_dims`` is evaluated and the
    one with the highest Cohen's kappa is reported.

    Parameters
    ----------
    X, actual_y
        Forwarded unchanged to ``spectral_clustering``.
    k_values : iterable of int
        Cluster counts to try.
    reduced_dims : iterable
        PCA dimensionalities to try.
    vect_path, raw_path, plotting
        Forwarded unchanged to ``spectral_clustering``.

    Returns
    -------
    best_params : dict
        ``{'k', 'reduced_dim', 'results'}`` of the best run, or an empty dict
        when no run produced a comparable kappa (for example an empty grid).
    all_results : list of dict
        One dict per evaluated combination: ``k``, ``reduced_dim`` and every
        metric returned by ``spectral_clustering``.
    """
    best_score = -np.inf
    best_params = {}
    all_results = []

    for k, dim in product(k_values, reduced_dims):
        print(f"\nTesting k={k}, reduced_dim={dim}")
        # Only the metrics are needed here; the aligned labels are discarded.
        _, results = spectral_clustering(X, actual_y, k=k, reduced_dim=dim,
                                         vect_path=vect_path, raw_path=raw_path,
                                         plotting=plotting)

        kappa = results['kappa']
        all_results.append({'k': k, 'reduced_dim': dim, 'kappa': kappa, **results})

        if kappa > best_score:
            best_score = kappa
            best_params = {'k': k, 'reduced_dim': dim, 'results': results}

    # Nothing was selected (empty grid, or kappa was never comparable): report
    # that instead of failing on the missing keys below.
    if not best_params:
        print("\nNo parameter combination produced a valid kappa.")
        return best_params, all_results

    print("\nBest Parameters:")
    print(f"k: {best_params['k']}, reduced_dim: {best_params['reduced_dim']}, kappa: {best_score:.3f}")
    return best_params, all_results


In [None]:

# Grid-search k and the PCA dimensionality on the TF-IDF features; configurations
# are ranked by Cohen's kappa. A reduced_dim of None skips the PCA step.
# NOTE(review): `rawpath` is not assigned in any cell visible above this one - make
# sure it is defined before running from a fresh kernel.
best_config, all_trials = cohen_grid_search(
    X=feat['TF-IDF'],
    actual_y=numeric_labels,
    k_values=[4,5,6,7],
    reduced_dims=[None, 50, 100, 150, 200],
    vect_path=vecpath,
    raw_path=rawpath,
    plotting=False
)


Testing k=4, reduced_dim=None
Coherence Score: 0.450

Testing k=4, reduced_dim=50
Coherence Score: 0.453

Testing k=4, reduced_dim=100
Coherence Score: 0.453

Testing k=4, reduced_dim=150
Coherence Score: 0.468

Testing k=4, reduced_dim=200
Coherence Score: 0.462

Testing k=5, reduced_dim=None
Coherence Score: 0.463

Testing k=5, reduced_dim=50
Coherence Score: 0.457

Testing k=5, reduced_dim=100
Coherence Score: 0.466

Testing k=5, reduced_dim=150
Coherence Score: 0.465

Testing k=5, reduced_dim=200
Coherence Score: 0.474

Testing k=6, reduced_dim=None
Coherence Score: 0.480

Testing k=6, reduced_dim=50
Coherence Score: 0.478

Testing k=6, reduced_dim=100
Coherence Score: 0.483

Testing k=6, reduced_dim=150
Coherence Score: 0.480

Testing k=6, reduced_dim=200
Coherence Score: 0.490

Testing k=7, reduced_dim=None
Coherence Score: 0.485

Testing k=7, reduced_dim=50
Coherence Score: 0.485

Testing k=7, reduced_dim=100
Coherence Score: 0.483

Testing k=7, reduced_dim=150
Coherence Score:

In [None]:
# Display the per-configuration metric dicts returned by the grid search.
all_trials

[{'k': 4,
  'reduced_dim': None,
  'kappa': 0.6825,
  'silhouette': 0.021831767404411874,
  'adjusted_rand_score': 0.619936691745624,
  'normalized_mutual_info_score': 0.6575079847085625,
  'homogeneity_score': 0.6087519433060182,
  'completeness_score': 0.7147539152876757,
  'v_measure_score': 0.6575079847085626,
  'calinski_harabasz_score': 14.272810193133497,
  'davies_bouldin_score': 6.879555318063597,
  'coherence': 0.4500810084944156},
 {'k': 4,
  'reduced_dim': 50,
  'kappa': 0.6912499999999999,
  'silhouette': 0.0984549672258711,
  'adjusted_rand_score': 0.6212366557807552,
  'normalized_mutual_info_score': 0.6862999632819908,
  'homogeneity_score': 0.6307251898974828,
  'completeness_score': 0.752614728668816,
  'v_measure_score': 0.6862999632819907,
  'calinski_harabasz_score': 79.50596762494177,
  'davies_bouldin_score': 2.8047252269939915,
  'coherence': 0.45287261609643165},
 {'k': 4,
  'reduced_dim': 100,
  'kappa': 0.65375,
  'silhouette': 0.060067425066619856,
  'adjust