# Group -8
# K-means clustering via PCA

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from collections import defaultdict
from sklearn import metrics
from time import time

In [2]:
# Helper that evaluates clustering accuracy.
# Each call to fit_and_evaluate appends one summary dict to each list below:
# `evaluations` receives the per-metric means, `evaluations_std` the standard
# deviations.
evaluations = []
evaluations_std = []


def fit_and_evaluate(km, X, name=None, n_runs=5, y_true=None):
    """Fit ``km`` on ``X`` under several seeds and report clustering metrics.

    For every seed in ``range(n_runs)`` the estimator's ``random_state`` is
    overwritten with the seed, the estimator is refit, and the training time
    plus five scores (homogeneity, completeness, V-measure, adjusted Rand
    index, silhouette coefficient) are recorded. The mean and standard
    deviation of each quantity are printed and appended to the module-level
    ``evaluations`` and ``evaluations_std`` lists.

    Parameters
    ----------
    km : estimator
        Clustering estimator exposing ``set_params``, ``fit`` and ``labels_``.
        It is modified in place: its ``random_state`` ends up set to the last
        seed and it is left fitted with that seed.
    X : array-like or sparse matrix
        Feature matrix that is clustered and used for the silhouette score.
    name : str, optional
        Label stored under the ``"estimator"`` key. Defaults to the class
        name of ``km``.
    n_runs : int, default=5
        Number of seeds (``0 .. n_runs - 1``) to evaluate. Must be >= 1.
    y_true : array-like, optional
        Ground-truth class labels aligned with the rows of ``X``. When
        omitted, the notebook-level ``labels`` variable is read at call time,
        which matches the original behaviour.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    name = km.__class__.__name__ if name is None else name
    # Passing y_true explicitly avoids depending on whichever cell last
    # rebound the global `labels`; the global stays as the fallback so
    # existing calls keep working.
    y_true = labels if y_true is None else y_true

    train_times = []
    scores = defaultdict(list)
    for seed in range(n_runs):
        km.set_params(random_state=seed)
        t0 = time()
        km.fit(X)
        train_times.append(time() - t0)
        scores["Homogeneity"].append(metrics.homogeneity_score(y_true, km.labels_))
        scores["Completeness"].append(metrics.completeness_score(y_true, km.labels_))
        scores["V-measure"].append(metrics.v_measure_score(y_true, km.labels_))
        scores["Adjusted Rand-Index"].append(
            metrics.adjusted_rand_score(y_true, km.labels_)
        )
        # Seed the subsampling so the silhouette estimate is reproducible
        # whenever X has more rows than sample_size.
        scores["Silhouette Coefficient"].append(
            metrics.silhouette_score(
                X, km.labels_, sample_size=2000, random_state=seed
            )
        )
    train_times = np.asarray(train_times)

    print(f"\tclustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s ")
    evaluation = {
        "estimator": name,
        "train_time": train_times.mean(),
    }
    evaluation_std = {
        "estimator": name,
        "train_time": train_times.std(),
    }
    for score_name, score_values in scores.items():
        mean_score, std_score = np.mean(score_values), np.std(score_values)
        print(f"\t{score_name}: {mean_score:.3f} ± {std_score:.3f}")
        evaluation[score_name] = mean_score
        evaluation_std[score_name] = std_score
    evaluations.append(evaluation)
    evaluations_std.append(evaluation_std)

In [3]:
# Load the A5 dataset: five newsgroups taken from the 20 Newsgroups corpus.
categories = [
    "comp.graphics",
    "rec.motorcycles",
    "rec.sport.baseball",
    "sci.space",
    "talk.politics.mideast",
]

# Posts are fetched from both train and test splits ("all") with headers,
# footers and quoted replies removed; the shuffle is seeded so the document
# order is the same on every run.
fetch_options = dict(
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=42,
)
dataset = fetch_20newsgroups(**fetch_options)

# Ground-truth class index per document, plus the number of distinct classes.
labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = len(unique_labels)

print(f"{len(dataset.data)} documents - {true_k} categories")


4890 documents - 5 categories


In [4]:
# Subsample the corpus to the per-category document counts used in the paper.
# req_freq[c] is the number of documents to keep for target index c
# (dataset.target_names gives the index -> category name mapping).
req_freq = [200, 140, 120, 100, 60]
if len(req_freq) != len(dataset.target_names):
    raise ValueError(
        f"req_freq has {len(req_freq)} quotas but the dataset has "
        f"{len(dataset.target_names)} categories"
    )

# Running count of documents kept so far for each target index.
curr_freq = [0] * len(req_freq)
new_data = {
    "data": [],
    "target": [],
}

# Walk the (already shuffled) corpus once, keeping the first req_freq[c]
# documents seen for every category c.
for document, category in zip(dataset.data, dataset.target):
    if curr_freq[category] >= req_freq[category]:
        continue
    curr_freq[category] += 1
    new_data["data"].append(document)
    new_data["target"].append(category)
    if curr_freq == req_freq:
        # Every quota is met; no need to scan the rest of the corpus.
        break

# Fail loudly rather than continuing with fewer documents than intended.
if curr_freq != req_freq:
    raise ValueError(
        f"could not fill category quotas: wanted {req_freq}, got {curr_freq}"
    )

# Ground truth for the subsample; this replaces the full-corpus `labels`.
labels = new_data["target"]

In [5]:
# Build the word-document matrix with the TF-IDF method.
# English stop words are removed; the remaining settings bound the
# vocabulary through document-frequency limits and a 1000-feature cap.
tfidf_options = dict(
    stop_words="english",
    max_df=0.5,
    min_df=5,
    max_features=1000,
)
vectorizer = TfidfVectorizer(**tfidf_options)
X_tfidf = vectorizer.fit_transform(new_data["data"])

shape_report = f"n_samples: {X_tfidf.shape[0]},\nn_features: {X_tfidf.shape[1]}"
print(shape_report)

n_samples: 620,
n_features: 1000


In [6]:
# Sparsity check: fraction of entries in the tf-idf matrix that are stored
# as non-zero (nnz divided by the total number of cells).
density_report = f"{X_tfidf.nnz / np.prod(X_tfidf.shape):.3f}"
print(density_report)

0.023


In [7]:
# Baseline: k-means applied directly to the tf-idf vectors, with no
# dimensionality reduction.
kmeans_options = dict(
    n_clusters=true_k,
    n_init=5,
    max_iter=100,
    random_state=42,
)
kmeans = KMeans(**kmeans_options)

banner = "---------------------------------------------------------------\nK - means without PCA \n"
print(banner)
fit_and_evaluate(kmeans, X_tfidf, name="KMeans\non tf-idf vectors")

---------------------------------------------------------------
K - means without PCA 

	clustering done in 0.08 ± 0.01 s 
	Homogeneity: 0.298 ± 0.049
	Completeness: 0.307 ± 0.040
	V-measure: 0.302 ± 0.044
	Adjusted Rand-Index: 0.224 ± 0.063
	Silhouette Coefficient: 0.005 ± 0.003


In [8]:
# k-means on a reduced-dimension projection of the tf-idf matrix.
# NOTE: the projection is TruncatedSVD followed by per-row normalisation.
# Per the scikit-learn docs, TruncatedSVD does not centre the data, so this
# is LSA-style reduction rather than textbook (centred) PCA; the variable
# name `pca` is kept unchanged.

# Used when no answer can be read, so Restart & Run All and headless
# execution still complete.
DEFAULT_DIMENSION = 100
try:
    raw_dimension = input("Enter the dimension: ").strip()
except (EOFError, NotImplementedError):
    # No usable stdin: EOFError from plain Python; Jupyter kernels without
    # stdin support are expected to raise a NotImplementedError subclass.
    raw_dimension = ""

if raw_dimension:
    dimension = int(raw_dimension)
else:
    dimension = DEFAULT_DIMENSION
    print(f"Using default dimension: {dimension}")

# Reject values the SVD step cannot honour before any fitting starts.
max_dimension = X_tfidf.shape[1]
if not 1 <= dimension <= max_dimension:
    raise ValueError(
        f"dimension must be between 1 and {max_dimension}, got {dimension}"
    )

pca = make_pipeline(
    TruncatedSVD(n_components=dimension, random_state=42),
    Normalizer(copy=False),
)
X_pca = pca.fit_transform(X_tfidf)

kmeans = KMeans(
    n_clusters=true_k,
    random_state=42,
    max_iter=100,
    n_init=5,
)
print("---------------------------------------------------------------\nK - means using PCA \n")
fit_and_evaluate(kmeans, X_pca, name="KMeans\nwith PCA on tf-idf vectors")

Enter the dimension: 100
---------------------------------------------------------------
K - means using PCA 

	clustering done in 0.29 ± 0.07 s 
	Homogeneity: 0.342 ± 0.053
	Completeness: 0.364 ± 0.056
	V-measure: 0.353 ± 0.054
	Adjusted Rand-Index: 0.248 ± 0.066
	Silhouette Coefficient: 0.024 ± 0.002
