# Comparison between our K-means and scikit-learn's K-means++

In [1]:
# IMPORTATIONS
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from scipy.spatial import distance
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score

In [2]:
# PREPROCESSING DATA
# Load the cleaned corpus, keep the first 1000 documents and replace missing
# text with "" so the vectorizer never sees NaN. The fill is done on a new
# Series rather than with a chained `inplace=True` call, which pandas warns
# about and which does not modify the frame under Copy-on-Write.
df_text = pd.read_csv('clean_text.csv')
df_text = df_text['clean_Text'].fillna("")[:1000]

# TF-IDF bag-of-words representation of each document.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(list(df_text))

# Reduce the sparse TF-IDF matrix to 750 dense components. TruncatedSVD's
# default solver is randomized, so the seed is fixed to make every score
# computed from X_svd reproducible across runs.
svd = TruncatedSVD(n_components=750, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

In [4]:
# TRAINING PYTHON KMEANS
# Fit once: fit_predict() both fits the model and returns the labels, so a
# separate fit() call beforehand only doubles the training time.
# random_state makes the run reproducible; n_init is pinned explicitly because
# its default differs between scikit-learn versions.
model = KMeans(n_clusters=5, init="k-means++", n_init=10, random_state=42)
pred = model.fit_predict(X_svd)

In [5]:
# Mean silhouette coefficient of the scikit-learn clustering over all samples.
kmeans_silhouette = silhouette_score(
    X=X_svd,
    labels=model.labels_,
)
kmeans_silhouette

0.013255910915394041

The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The Silhouette Coefficient for a sample is $(b - a) / \max(a, b)$. To clarify, b is the distance between a sample and the nearest cluster that the sample is not a part of. Note that the Silhouette Coefficient is only defined if $2 \le n_{\text{labels}} \le n_{\text{samples}} - 1$.

The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

In [24]:
def k_means(X, k, max_iter=400, random_state=None):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means.

    Initial centroids are ``k`` distinct rows of ``X`` chosen uniformly at
    random. Points are then repeatedly assigned to their nearest centroid
    (Euclidean distance) and centroids recomputed as cluster means, until the
    assignment stops changing or ``max_iter`` iterations have run.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Data to cluster. A DataFrame is converted to its underlying array.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iter : int, default 400
        Maximum number of centroid-update iterations.
    random_state : int or None, default None
        Seed for the initial centroid draw. ``None`` uses NumPy's global
        random state (so ``np.random.seed`` is still honoured).

    Returns
    -------
    C : numpy.ndarray of shape (n_samples,)
        Index of the cluster each sample is assigned to.
    centroids : numpy.ndarray of shape (k, n_features)
        Cluster centres.

    Raises
    ------
    ValueError
        If ``k`` is not between 1 and the number of samples.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    X = np.asarray(X)

    n_samples = len(X)
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (n_samples, k)
        )

    # Pick indices of k random points without replacement
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    idx = rng.choice(n_samples, k, replace=False)
    centroids = X[idx, :]

    # Calculate the class of each point using euclidean distance
    C = np.argmin(distance.cdist(X, centroids, 'euclidean'), axis=1)
    for _ in range(max_iter):
        # A cluster can lose all of its points (e.g. duplicated rows chosen as
        # initial centroids). The mean of an empty slice is NaN, which would
        # poison every later distance computation, so an empty cluster keeps
        # its previous centroid instead.
        centroids = np.vstack([
            X[C == i, :].mean(axis=0) if np.any(C == i) else centroids[i]
            for i in range(k)
        ])
        t = np.argmin(distance.cdist(X, centroids, 'euclidean'), axis=1)

        # Stop when the assignment doesn't change
        if np.array_equal(C, t):
            break

        C = t

    # return an array containing the class of each data point, and the centroids
    return C, centroids

In [30]:
# Silhouette score of our own k-means, using the same number of clusters (5).
# k_means returns (labels, centroids); only the labels are needed here.
silhouette_score(
    X_svd,
    labels=k_means(X_svd, k=5)[0],
)

0.01251597359237147