In [None]:
# Mount Google Drive at /content/drive; the data paths used by later cells
# all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import cohen_kappa_score, silhouette_score, confusion_matrix, adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, calinski_harabasz_score, davies_bouldin_score

from itertools import cycle
import joblib

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Run this ONLY IF Yellowbrick shows "findfont" errors.
# It sets matplotlib's default font family to 'DejaVu Sans'.

import matplotlib.font_manager
plt.rcParams['font.family'] = 'DejaVu Sans'

In [None]:
def combine_features(*arrays):
    """Join several feature matrices side by side.

    2-D (or higher) inputs are concatenated along their second axis, i.e.
    the columns of each matrix are appended to the previous ones; 1-D inputs
    are simply chained end to end. This mirrors ``numpy.hstack``.
    """
    parts = [np.atleast_1d(block) for block in arrays]
    # Same axis rule as hstack: 1-D inputs join on axis 0, everything else on axis 1.
    join_axis = 0 if parts and parts[0].ndim == 1 else 1
    return np.concatenate(parts, join_axis)


In [None]:
DPPpath = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/'
def load_feature_set(path):
    """Load the pickled feature tables stored in ``path``.

    Parameters
    ----------
    path : str or os.PathLike
        Directory containing ``features_tfidf.pkl``, ``features_bow.pkl``,
        ``features_lda.pkl`` and ``features_word2vec.pkl``. A trailing
        separator is optional.

    Returns
    -------
    labels : pandas.Series
        The ``label`` column of the TF-IDF table.
    numeric_labels : numpy.ndarray
        The ``label_num`` column of the TF-IDF table.
    feature_sets : dict[str, numpy.ndarray]
        Feature matrices keyed by name ('TF-IDF', 'BoW', 'LDA', 'Word2Vec',
        'TF-IDF + LDA', 'BoW + LDA'), with the two label columns removed.
    """
    import os

    label_columns = ['label', 'label_num']

    def _read(filename):
        # NOTE(security): unpickling can execute arbitrary code, so only point
        # `path` at files produced by your own feature-engineering step.
        return pd.read_pickle(os.path.join(path, filename))

    def _features(frame):
        # Everything except the two label columns is treated as a feature.
        return frame.drop(label_columns, axis=1).values

    # Load preprocessed features from the directory the caller asked for
    # (previously the argument was ignored in favour of the global DPPpath).
    df_tfidf = _read('features_tfidf.pkl')
    df_bow = _read('features_bow.pkl')
    df_lda = _read('features_lda.pkl')
    df_wv = _read('features_word2vec.pkl')

    # Labels are taken from the TF-IDF table only.
    # NOTE(review): assumes all four tables share the same row order - confirm.
    labels = df_tfidf['label']
    numeric_labels = df_tfidf['label_num'].values

    X_tfidf = _features(df_tfidf)
    X_bow = _features(df_bow)
    X_lda = _features(df_lda)
    X_wv = _features(df_wv)

    feature_sets = {
        'TF-IDF': X_tfidf,
        'BoW': X_bow,
        'LDA': X_lda,
        'Word2Vec': X_wv,
        'TF-IDF + LDA': combine_features(X_tfidf, X_lda),
        'BoW + LDA': combine_features(X_bow, X_lda)
    }

    return labels, numeric_labels, feature_sets

In [None]:

from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
def align_cluster_labels(true_labels, pred_labels):
    """Relabel cluster ids so they agree with the true labels as well as possible.

    A contingency table is built over the union of the values occurring in
    either input, and the Hungarian algorithm picks the one-to-one
    cluster -> label mapping that maximises the number of matching samples.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth label per sample.
    pred_labels : array-like
        Cluster id per sample, same length as ``true_labels``. Its values
        must be comparable with those in ``true_labels``.

    Returns
    -------
    numpy.ndarray
        ``pred_labels`` with every cluster id replaced by its matched label.
        Cluster ids beyond the number of true classes are mapped to one of
        the remaining values of the union.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(pred_labels).ravel()
    n_samples = true_arr.shape[0]
    if pred_arr.shape[0] != n_samples:
        raise ValueError(
            f"true_labels and pred_labels must have the same length "
            f"(got {n_samples} and {pred_arr.shape[0]})"
        )

    # Encode both label vectors against one shared, sorted vocabulary so the
    # table is square and its row/column positions can be translated back to
    # actual label values afterwards.
    label_values, codes = np.unique(
        np.concatenate([true_arr, pred_arr]), return_inverse=True
    )
    codes = codes.reshape(-1)
    true_codes, pred_codes = codes[:n_samples], codes[n_samples:]

    n_labels = label_values.size
    contingency = np.zeros((n_labels, n_labels), dtype=np.int64)
    # rows = true label position, columns = predicted cluster position
    np.add.at(contingency, (true_codes, pred_codes), 1)

    # Negate to turn the min-cost assignment into a max-overlap assignment.
    row_ind, col_ind = linear_sum_assignment(-contingency)

    # Map label VALUES (not matrix positions) so non-contiguous label sets
    # such as {10, 20} are handled correctly.
    label_map = {label_values[col]: label_values[row] for row, col in zip(row_ind, col_ind)}
    aligned_labels = np.array([label_map[p] for p in pred_arr])
    return aligned_labels

In [None]:
from sklearn.manifold import TSNE
from scipy.sparse import issparse
from sklearn.decomposition import PCA

def plot_clusters(X, labels, method_name):
    """Show a 2-D t-SNE scatter plot of ``X`` coloured by ``labels``.

    Sparse input is densified before the embedding is computed. The
    embedding uses a fixed random_state of 42, and ``method_name`` is used
    as the prefix of the figure title.
    """
    dense_X = X.toarray() if issparse(X) else X
    embedding = TSNE(n_components=2, random_state=42).fit_transform(dense_X)
    xs, ys = embedding[:, 0], embedding[:, 1]

    plt.figure(figsize=(8, 6))
    points = plt.scatter(xs, ys, c=labels, cmap='viridis')
    plt.title(f'{method_name} Clustering (t-SNE visualization)')
    plt.colorbar(points)
    plt.show()

In [None]:


def _top_terms_per_cluster(X_terms, cluster_labels, k, feature_names, n_terms=10):
    """Return, for each cluster id in range(k), its ``n_terms`` highest mean-weight terms."""
    top_words = []
    for cluster in range(k):
        members = X_terms[cluster_labels == cluster]
        if issparse(members):
            members = members.toarray()
        if members.shape[0] == 0:
            # Nothing to average for an empty cluster.
            continue
        centroid = np.asarray(np.mean(members, axis=0)).ravel()
        top_indices = centroid.argsort()[-n_terms:][::-1]
        top_words.append([feature_names[i] for i in top_indices])
    return top_words


def ward_hierarchical(X, actual_y, k=5, vect_path="", raw_path="", reduced_dim = None, plotting=False, raw_data=None):
    """Cluster ``X`` with Ward-linkage agglomerative clustering and score the result.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per document.
    actual_y : array-like
        Ground-truth label per row, used for the external metrics.
    k : int
        Number of clusters.
    vect_path : str
        Path to a joblib-saved vectorizer exposing ``get_feature_names_out()``.
        Together with a text source it enables the topic-coherence score.
    raw_path : str
        Path to a pickled DataFrame with a ``text`` column (one text source).
    reduced_dim : int or None
        If truthy, ``X`` is projected to this many PCA components before
        clustering.
    plotting : bool
        If true, show a truncated dendrogram and the label/cluster bar chart.
    raw_data : list of str, optional
        Raw documents passed in memory; takes precedence over ``raw_path``.

    Returns
    -------
    dict
        Metric name -> value. Always contains kappa, silhouette, ARI, NMI,
        homogeneity, completeness, V-measure, Calinski-Harabasz and
        Davies-Bouldin. ``'coherence'`` (c_v) is added when a vectorizer and
        a text source were supplied.
    """
    # Keep the un-reduced matrix: top words per cluster must be read from the
    # vectorizer's own feature space, not from PCA components.
    X_terms = X

    if reduced_dim:
        pca = PCA(n_components=reduced_dim, random_state=42)
        X = pca.fit_transform(X)

    model = AgglomerativeClustering(n_clusters=k, linkage='ward')
    cluster_labels = model.fit_predict(X)

    # Cluster ids are arbitrary; map them onto the true labels first so that
    # agreement metrics such as kappa are meaningful.
    aligned_labels = align_cluster_labels(actual_y, cluster_labels)

    results = {
        'kappa': cohen_kappa_score(actual_y, aligned_labels),
        'silhouette': silhouette_score(X, aligned_labels),
        'adjusted_rand_score': adjusted_rand_score(actual_y, aligned_labels),
        'normalized_mutual_info_score': normalized_mutual_info_score(actual_y, aligned_labels),
        'homogeneity_score': homogeneity_score(actual_y, aligned_labels),
        'completeness_score': completeness_score(actual_y, aligned_labels),
        'v_measure_score': v_measure_score(actual_y, aligned_labels),
        'calinski_harabasz_score': calinski_harabasz_score(X, aligned_labels),
        'davies_bouldin_score': davies_bouldin_score(X, aligned_labels),
    }

    if plotting:
        # Plot dendrogram (optional)
        linked = linkage(X, 'ward')
        plt.figure(figsize=(10, 7))
        dendrogram(linked, truncate_mode='level', p=5)
        plt.title('Dendrogram')
        plt.xlabel('Sample index')
        plt.ylabel('Distance')
        plt.show()

    has_text_source = raw_data is not None or bool(raw_path)
    if vect_path and has_text_source:
        # NOTE(security): joblib.load / read_pickle unpickle data; only load trusted files.
        vectorizer = joblib.load(vect_path)
        if raw_data is not None:
            documents = [str(doc) for doc in raw_data]
        else:
            doc_raw = pd.read_pickle(raw_path)
            documents = doc_raw['text'].astype(str).tolist()

        # Top-weighted vocabulary terms of each cluster centroid act as its "topic".
        feature_names = vectorizer.get_feature_names_out()
        top_words = _top_terms_per_cluster(X_terms, cluster_labels, k, feature_names)

        tokenized_docs = [doc.split() for doc in documents]
        coherence_model = CoherenceModel(
            topics=top_words,
            texts=tokenized_docs,
            dictionary=Dictionary(tokenized_docs),
            coherence='c_v'
        )
        # Actually evaluate the model so the score reaches the caller.
        results['coherence'] = coherence_model.get_coherence()

    # Plot cluster distribution
    if plotting:
        df = pd.DataFrame({'Actual': actual_y, 'Cluster': aligned_labels})
        counts = df.groupby(['Actual', 'Cluster']).size().unstack()

        counts.plot(kind='bar', stacked=True, figsize=(8, 4))
        plt.title('Distribution of Actual Labels Across Clusters')
        plt.ylabel('Count')
        plt.xlabel('Actual Label')
        plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1))
        plt.tight_layout()
        plt.show()

    return results

In [None]:
import pandas as pd

# Folder with the engineered features and the fitted TF-IDF vectorizer
DPPpath = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/'
vecpath = f'{DPPpath}vectorizer_tfidf.pkl'

# Labels plus the dictionary of feature matrices
labels, numeric_labels, feat = load_feature_set(DPPpath)

# Raw documents come from the labelled Excel workbook; the text column is 'Text'
excel_path = '/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx'
doc_raw = pd.read_excel(excel_path)
texts = doc_raw['Text'].astype(str).tolist()

# Ward clustering on the TF-IDF features with k=5. The raw texts are handed
# over in memory through `raw_data`, so ward_hierarchical has to accept that
# keyword; a signature offering only `raw_path` rejects this call with a
# TypeError.
ward_hierarchical(
    feat['TF-IDF'],
    numeric_labels,
    5,
    vect_path=vecpath,
    raw_data=texts,
)


{'kappa': 0.74,
 'silhouette': 0.017500982135866948,
 'adjusted_rand_score': 0.5505774555753853,
 'normalized_mutual_info_score': 0.5675048378537947,
 'homogeneity_score': 0.5643816595894303,
 'completeness_score': 0.5706627745861218,
 'v_measure_score': 0.5675048378537947,
 'calinski_harabasz_score': 11.409826898364956,
 'davies_bouldin_score': 7.473183307628477}

In [None]:
from itertools import product

def cohen_grid_search(X, actual_y,
                      k_values=[3, 5, 7],
                      reduced_dims=[None, 50, 100],
                      vect_path="", raw_path="",
                      plotting=False,
                      metric_name = 'adjusted_rand_score'):
    """Grid search over ``k`` and PCA size for ``ward_hierarchical``.

    Every combination of ``k_values`` x ``reduced_dims`` is clustered and
    scored; the combination with the highest ``metric_name`` wins.

    Parameters
    ----------
    X, actual_y
        Feature matrix and ground-truth labels, passed through unchanged.
    k_values : iterable of int
        Cluster counts to try (the default list is only read, never mutated).
    reduced_dims : iterable of int or None
        PCA component counts to try; ``None`` means no reduction.
    vect_path, raw_path : str
        Forwarded to ``ward_hierarchical``.
    plotting : bool
        Forwarded to ``ward_hierarchical``.
    metric_name : str
        Key of the result dict used to rank configurations (higher is better).

    Returns
    -------
    best_params : dict
        ``{'k', 'reduced_dim', 'results'}`` of the best run; empty if no run
        produced a score greater than ``-inf`` (e.g. every score was NaN).
    all_results : list of dict
        One flat record per run: ``k``, ``reduced_dim`` and all metrics.

    Raises
    ------
    KeyError
        If ``metric_name`` is not among the metrics returned by a run.
    """
    best_score = -np.inf
    best_params = {}
    all_results = []

    for k, dim in product(k_values, reduced_dims):
        print(f"\nTesting k={k}, reduced_dim={dim}")
        # vect_path must be forwarded too; without it the clustering function
        # never receives the vectorizer it needs alongside raw_path.
        results = ward_hierarchical(X, actual_y, k=k, reduced_dim=dim,
                                    vect_path=vect_path, raw_path=raw_path,
                                    plotting=plotting)

        print(results)
        if metric_name not in results:
            raise KeyError(
                f"metric_name {metric_name!r} not found; available metrics: {sorted(results)}"
            )
        metric = results[metric_name]
        all_results.append({'k': k, 'reduced_dim': dim, **results})

        if metric > best_score:
            best_score = metric
            best_params = {'k': k, 'reduced_dim': dim, 'results': results}

    print("\nBest Parameters:")
    print(best_params)
    return best_params, all_results


In [None]:

# Grid search over k and PCA size on the TF-IDF features.
# Both paths are assigned here so the cell does not rely on variables created
# by other cells: `rawpath` is otherwise only defined further down, which
# raises NameError on Restart & Run All.
rawpath = DPPpath + 'raw_labeled.pkl'
vecpath = DPPpath + 'vectorizer_tfidf.pkl'

best_config, all_trials = cohen_grid_search(
    X=feat['TF-IDF'],
    actual_y=numeric_labels,
    k_values=[4,5,6,7],
    reduced_dims=[None, 50, 100, 150, 200],
    vect_path=vecpath,
    raw_path=rawpath,
    plotting=False
)


Testing k=4, reduced_dim=None
{'kappa': 0.5700000000000001, 'silhouette': 0.016358966410025298, 'adjusted_rand_score': 0.41499100003231326, 'normalized_mutual_info_score': 0.5170356730806692, 'homogeneity_score': 0.4650556890683218, 'completeness_score': 0.5820976113442734, 'v_measure_score': 0.5170356730806692, 'calinski_harabasz_score': 13.118790702393978, 'davies_bouldin_score': 6.960196095378421}

Testing k=4, reduced_dim=50
{'kappa': 0.6275, 'silhouette': 0.08870052574929986, 'adjusted_rand_score': 0.4966696951470031, 'normalized_mutual_info_score': 0.5877587966361044, 'homogeneity_score': 0.5343221813438486, 'completeness_score': 0.653071301143909, 'v_measure_score': 0.5877587966361045, 'calinski_harabasz_score': 74.5731060419425, 'davies_bouldin_score': 2.8911310593257222}

Testing k=4, reduced_dim=100
{'kappa': 0.65125, 'silhouette': 0.053604701238741156, 'adjusted_rand_score': 0.5540307475010341, 'normalized_mutual_info_score': 0.5830330465039794, 'homogeneity_score': 0.54015

In [None]:
# Raw documents and the fitted Bag-of-Words vectorizer used for this run
rawpath = f'{DPPpath}raw_labeled.pkl'
vecpath = f'{DPPpath}vectorizer_bow.pkl'

# Same k / PCA grid as before, this time on the BoW features
best_config, all_trials = cohen_grid_search(
    feat['BoW'],
    numeric_labels,
    k_values=[4, 5, 6, 7],
    reduced_dims=[None, 50, 100, 150, 200],
    vect_path=vecpath,
    raw_path=rawpath,
    plotting=False,
)


Testing k=4, reduced_dim=None
Cohen's Kappa: 0.656
Coherence Score: 0.450

Testing k=4, reduced_dim=50
Cohen's Kappa: 0.591
Coherence Score: 0.600

Testing k=4, reduced_dim=100
Cohen's Kappa: 0.584
Coherence Score: 0.598

Testing k=4, reduced_dim=150
Cohen's Kappa: 0.590
Coherence Score: 0.591

Testing k=4, reduced_dim=200
Cohen's Kappa: 0.554
Coherence Score: 0.577

Testing k=5, reduced_dim=None
Cohen's Kappa: 0.746
Coherence Score: 0.461

Testing k=5, reduced_dim=50
Cohen's Kappa: 0.701
Coherence Score: 0.581

Testing k=5, reduced_dim=100
Cohen's Kappa: 0.490
Coherence Score: 0.595

Testing k=5, reduced_dim=150
Cohen's Kappa: 0.722
Coherence Score: 0.594

Testing k=5, reduced_dim=200
Cohen's Kappa: 0.681
Coherence Score: 0.580

Testing k=6, reduced_dim=None
Cohen's Kappa: 0.659
Coherence Score: 0.472

Testing k=6, reduced_dim=50
Cohen's Kappa: 0.694
Coherence Score: 0.590

Testing k=6, reduced_dim=100
Cohen's Kappa: 0.556
Coherence Score: 0.600

Testing k=6, reduced_dim=150
Cohen's 

In [None]:

# Word2Vec features: only k is varied (no PCA), and no vectorizer or raw-text
# path is supplied for this run.
best_config, all_trials = cohen_grid_search(
    feat['Word2Vec'],
    numeric_labels,
    k_values=[4, 5, 6, 7],
    reduced_dims=[None],
    vect_path=None,
    raw_path=None,
    plotting=False,
)


Testing k=4, reduced_dim=None
Cohen's Kappa: 0.081

Testing k=5, reduced_dim=None
Cohen's Kappa: 0.085

Testing k=6, reduced_dim=None
Cohen's Kappa: 0.108

Testing k=7, reduced_dim=None
Cohen's Kappa: 0.123

Best Parameters:
k: 7, reduced_dim: None, kappa: 0.123
