In [None]:
# scripts/clustering.py
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Shared seed for every stochastic step below, plus input/output folders.
RANDOM_STATE = 42
DATA = Path("data")
FIGS = Path("figures")
FIGS.mkdir(exist_ok=True, parents=True)  # output folder must exist before any plot is saved

# 1) Load both CSVs and tag each row with its source file
#    (label 1 = True.csv, label 0 = Fake.csv), then stack them into one frame.
df_true = pd.read_csv(DATA / "True.csv")
df_true["label"] = 1
df_fake = pd.read_csv(DATA / "Fake.csv")
df_fake["label"] = 0
df = pd.concat([df_true, df_fake], ignore_index=True)
# Fill missing values BEFORE casting to str: casting first can turn NaN into
# the literal string "nan", after which fillna("") has nothing left to replace
# and the vectorizer would see "nan" as a real token.
df["text"] = df["text"].fillna("").astype(str)

# 2) TF-IDF → SVD(50) for dense features (better for clustering high-dim text)
vec = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,
    max_features=50000,
)
X_tfidf = vec.fit_transform(df["text"])

svd = TruncatedSVD(random_state=RANDOM_STATE, n_components=50)
X = svd.fit_transform(X_tfidf)

# Standardized copy of the SVD features; kept separately for the GMM.
scaler = StandardScaler()
X2 = scaler.fit_transform(X)

# small helper to plot 2D scatter (using first two SVD components)
def scatter_2d(points, labels, title, outpath):
    """Save a 2-D scatter plot of ``points`` coloured by ``labels``.

    Args:
        points: Array indexable as ``points[:, 0]`` and ``points[:, 1]``;
            column 0 is drawn on the x-axis, column 1 on the y-axis.
        labels: Per-point values handed to matplotlib as the colour array.
        title: Text shown above the plot.
        outpath: Destination passed to ``savefig``.
    """
    # Work on an explicit Figure/Axes pair instead of pyplot's implicit
    # "current figure", so nothing else drawing in the session can interfere.
    fig, ax = plt.subplots(figsize=(7, 5.4))
    try:
        ax.scatter(points[:, 0], points[:, 1], c=labels, s=10, alpha=0.8)
        ax.set_title(title)
        ax.set_xlabel("SVD-1")
        ax.set_ylabel("SVD-2")
        fig.tight_layout()
        fig.savefig(outpath, dpi=160)
    finally:
        # Release the figure even when drawing or saving raises; otherwise it
        # stays registered with pyplot for the rest of the session.
        plt.close(fig)

# 3) K-means with two clusters (mirrors the Fake/True split), fitted on the SVD features
kmeans = KMeans(
    n_clusters=2,
    n_init="auto",
    random_state=RANDOM_STATE,
)
km_labels = kmeans.fit_predict(X)
km_sil = silhouette_score(X, km_labels)

km_title = f"K-means (k=2), silhouette={km_sil:.3f}"
scatter_2d(X[:, :2], km_labels, km_title, FIGS / "cluster_kmeans.png")

# 4) Gaussian mixture model (two full-covariance components).
#    Fitted and scored on the standardized features X2.
gmm = GaussianMixture(
    n_components=2,
    covariance_type="full",
    random_state=RANDOM_STATE,
)
gmm_labels = gmm.fit_predict(X2)
gmm_sil = silhouette_score(X2, gmm_labels)

# The plot uses the unscaled SVD coordinates (X), not X2.
gmm_title = f"GMM (k=2), silhouette={gmm_sil:.3f}"
scatter_2d(X[:, :2], gmm_labels, gmm_title, FIGS / "cluster_gmm.png")

# 5) Hierarchical clustering: agglomerative, Ward linkage, cut at two clusters.
# NOTE(review): no connectivity constraint is passed, so memory use may grow
# steeply with the number of rows — confirm the full dataset fits.
agg = AgglomerativeClustering(linkage="ward", n_clusters=2)
agg_labels = agg.fit_predict(X)  # Ward expects Euclidean / dense
agg_sil = silhouette_score(X, agg_labels)

agg_title = f"Agglomerative (k=2), silhouette={agg_sil:.3f}"
scatter_2d(X[:, :2], agg_labels, agg_title, FIGS / "cluster_agg.png")

# 6) Summary: one silhouette score per method, then where the figures went
report_lines = (
    "Silhouette scores (higher≈better)",
    f" - KMeans: {km_sil:.3f}",
    f" -   GMM: {gmm_sil:.3f}",
    f" -  Aggl: {agg_sil:.3f}",
    "Saved plots → figures/cluster_*.png",
)
for report_line in report_lines:
    print(report_line)
