- App B: News categorisation
  - Traditional: TF-IDF + K-means
  - Neural: BERT-sentence + K-means
  - Metrics: Silhouette, Davies Bouldin, Calinski Harabasz scores

In [47]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

from sentence_transformers import SentenceTransformer

import re
import spacy

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

SAMPLE_SIZE = 10000
NUM_CLUSTERS = 8
MAX_DOCS_FOR_METRICS = 3000  

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)      # remove URLs
    text = re.sub(r"\(cnn\)\s*--", "", text)        # remove minimal CNN header
    text = re.sub(r"[^\w\s]", " ", text)            # remove punctuation
    text = re.sub(r"\d+", " ", text)                # remove numbers
    text = re.sub(r"\s+", " ", text).strip()        # normalize whitespace

    return text

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  
def lemmatise(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if token.lemma_.isalpha()]

In [48]:
trainAll_df = pd.read_csv("../../data/Dataset2/train.csv")

print(trainAll_df.columns)
trainAll_df.head()

train_df = trainAll_df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)

Index(['id', 'article', 'highlights'], dtype='object')


In [49]:
print("Preprocessing text... this may take a few minutes.")

train_df['clean_article'] = train_df['article'].apply(preprocess)

print(train_df[['article', 'clean_article']].head())

Preprocessing text... this may take a few minutes.
                                             article  \
0  By . Mia De Graaf . Britons flocked to beaches...   
1  A couple who weighed a combined 32st were sham...   
2  Video footage shows the heart stopping moment ...   
3  Istanbul, Turkey (CNN) -- About 250 people rac...   
4  By . Daily Mail Reporter . PUBLISHED: . 12:53 ...   

                                       clean_article  
0  by mia de graaf britons flocked to beaches acr...  
1  a couple who weighed a combined st were shamed...  
2  video footage shows the heart stopping moment ...  
3  istanbul turkey about people raced across the ...  
4  by daily mail reporter published est january u...  


Traditional Model

In [50]:
# TF-IDF w stop-word removal & lemmantization
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lemmatise,
    max_features=10000,
    stop_words="english"
)

print("Fitting TF-IDF vectorizer...")
tfidf_features = tfidf_vectorizer.fit_transform(train_df['clean_article'])

print("TF-IDF feature shape:", tfidf_features.shape)

# K-means
kmeans_tfidf = KMeans(
    n_clusters=NUM_CLUSTERS,
    random_state=RANDOM_SEED,
    n_init=10
)

print("Fitting KMeans on TF-IDF features...")
kmeans_tfidf.fit(tfidf_features)

tfidf_cluster_labels = kmeans_tfidf.labels_

Fitting TF-IDF vectorizer...




TF-IDF feature shape: (10000, 10000)
Fitting KMeans on TF-IDF features...


In [51]:
# Subsample for metrics
num_samples_for_metrics = min(MAX_DOCS_FOR_METRICS, tfidf_features.shape[0])
eval_indices_tfidf = np.random.choice(
    tfidf_features.shape[0],
    size=num_samples_for_metrics,
    replace=False
)

tfidf_eval = tfidf_features[eval_indices_tfidf].toarray()
tfidf_labels_eval = tfidf_cluster_labels[eval_indices_tfidf]

silhouette_tfidf = silhouette_score(tfidf_eval, tfidf_labels_eval)
davies_bouldin_tfidf = davies_bouldin_score(tfidf_eval, tfidf_labels_eval)
calinski_harabasz_tfidf = calinski_harabasz_score(tfidf_eval, tfidf_labels_eval)

print("TF-IDF + KMeans metrics:")
print("  Silhouette score       :", silhouette_tfidf)
print("  Davies-Bouldin index   :", davies_bouldin_tfidf)
print("  Calinski-Harabasz index:", calinski_harabasz_tfidf)

TF-IDF + KMeans metrics:
  Silhouette score       : 0.005171798552842779
  Davies-Bouldin index   : 9.410189114642181
  Calinski-Harabasz index: 11.605209830411253


Neural Model   

In [52]:
# Sentence-BERT
sbert_model_name = "all-MiniLM-L6-v2" 
sbert_model = SentenceTransformer(sbert_model_name)

print(f"Encoding articles with Sentence-BERT model: {sbert_model_name}...")
sbert_embeddings = sbert_model.encode(
    train_df['clean_article'],
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True, 
)

print("Sentence-BERT embedding shape:", sbert_embeddings.shape)

# K-means
kmeans_sbert = KMeans(
    n_clusters=NUM_CLUSTERS,
    random_state=RANDOM_SEED,
    n_init=10
)

print("Fitting KMeans on Sentence-BERT embeddings...")
kmeans_sbert.fit(sbert_embeddings)

sbert_cluster_labels = kmeans_sbert.labels_


Encoding articles with Sentence-BERT model: all-MiniLM-L6-v2...


Batches: 100%|██████████| 313/313 [11:26<00:00,  2.19s/it]


Sentence-BERT embedding shape: (10000, 384)
Fitting KMeans on Sentence-BERT embeddings...


In [53]:
num_samples_for_metrics_sbert = min(MAX_DOCS_FOR_METRICS, sbert_embeddings.shape[0])
eval_indices_sbert = np.random.choice(
    sbert_embeddings.shape[0],
    size=num_samples_for_metrics_sbert,
    replace=False
)

sbert_eval = sbert_embeddings[eval_indices_sbert]
sbert_labels_eval = sbert_cluster_labels[eval_indices_sbert]

silhouette_sbert = silhouette_score(sbert_eval, sbert_labels_eval)
davies_bouldin_sbert = davies_bouldin_score(sbert_eval, sbert_labels_eval)
calinski_harabasz_sbert = calinski_harabasz_score(sbert_eval, sbert_labels_eval)

print("Sentence-BERT + KMeans metrics:")
print("  Silhouette score       :", silhouette_sbert)
print("  Davies-Bouldin index   :", davies_bouldin_sbert)
print("  Calinski-Harabasz index:", calinski_harabasz_sbert)

Sentence-BERT + KMeans metrics:
  Silhouette score       : 0.03162845
  Davies-Bouldin index   : 4.552164778063304
  Calinski-Harabasz index: 56.79351636522212


Comparison

In [54]:
metrics_comparison_df = pd.DataFrame(
    {
        "Model": ["TF-IDF + KMeans", "Sentence-BERT + KMeans"],
        "Silhouette": [silhouette_tfidf, silhouette_sbert],
        "Davies_Bouldin": [davies_bouldin_tfidf, davies_bouldin_sbert],
        "Calinski_Harabasz": [calinski_harabasz_tfidf, calinski_harabasz_sbert],
    }
)

metrics_comparison_df

Unnamed: 0,Model,Silhouette,Davies_Bouldin,Calinski_Harabasz
0,TF-IDF + KMeans,0.005172,9.410189,11.60521
1,Sentence-BERT + KMeans,0.031628,4.552165,56.793516
