In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import pandas as pd
import seaborn as sns
from sklearn.datasets.samples_generator import make_blobs, make_circles, make_moons
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score

In [3]:
df_profiles = pd.read_csv('../data/raw/profiles.csv')
df_profiles

Unnamed: 0,pid,p0,p1,p2,p3,p4,p5,p6,p7,p8,...,p56,p57,p58,p59,p60,p61,p62,p63,p64,p65
0,196356,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,204083,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,170667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,115511,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,129719,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,174347,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,143618,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,116999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,194535,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,125275,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [4]:
# Standardize the data
X_std = StandardScaler().fit_transform(df_profiles)

# Run local implementation of kmeans
km = KMeans(n_clusters=4)
km.fit(X_std)
centroids = km.cluster_centers_

In [5]:
centroids

array([[-1.31316396e-02,  1.72089307e+00,  9.15664502e-02,
        -6.92127001e-01, -5.75107608e-01, -3.63450130e-01,
        -1.52423823e-01, -6.27387106e-02, -7.87396062e-02,
         7.81865944e-02, -7.43928687e-01,  8.13567414e-01,
         1.21415875e-01,  4.09406725e-02,  1.66005694e-01,
        -6.96805103e-02, -8.20352404e-02, -1.23581449e-01,
        -1.71271836e-01, -2.09433538e-01, -1.57756453e-01,
        -2.58984098e-02, -2.88156815e-01, -1.59906747e-01,
         6.25276812e-04, -1.83756412e-02,  1.50130718e-01,
         1.56578085e-01,  1.18597724e-01, -2.38330760e-01,
        -8.17576715e-02,  3.04246187e-01, -3.22216372e-01,
        -8.93472534e-02,  6.84875296e-01, -4.54670310e-01,
        -2.53896293e-02, -1.35694270e-01,  1.31573721e-01,
        -3.65681856e-04,  2.91085303e-02, -5.79122146e-03,
         2.18252714e-02, -4.91258249e-02,  9.66964402e-03,
        -1.52739335e-02, -6.69207595e-03, -2.93171247e-02,
        -2.63417376e-02,  4.74452738e-02,  3.75812207e-0

In [6]:
df_silhouette = pd.DataFrame(columns = ['cluster_type', 'n_cluster', 'avg_score'])
df_silhouette

Unnamed: 0,cluster_type,n_cluster,avg_score


In [8]:
rows = df_profiles.index

In [18]:
excerpt_rows = rows[130:500]
excerpt_rows

RangeIndex(start=130, stop=500, step=1)

In [19]:
count = []

for each in excerpt_rows:
    each = int(each)
    
    if each  not in count:
        count.append(each)

In [20]:
'''
Silhouette Analysis used to determine the degree of separation between clusters. For each sample:
1) Compute the average distance from all data points in the same cluster (ai).
2) Compute the average distance from all data points in the closest cluster (bi).
3) Compute the coefficient: Values in the interval [-1, 1]
    * 0 –> the sample is very close to the neighboring clusters.
    * 1 –> the sample is far away from the neighboring clusters.
    * -1 –> the sample is assigned to the wrong clusters.
'''

for i, k in enumerate(count):
    
    # Run the Kmeans algorithm
    km = KMeans(n_clusters=k)
    labels = km.fit_predict(X_std)
    centroids = km.cluster_centers_

    # Get silhouette samples
    silhouette_vals = silhouette_samples(X_std, labels)

    # Silhouette
    for i, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        
    # Get the average silhouette score
    avg_score = np.mean(silhouette_vals)
    
    if k % 50 == 0:
        print(f'Current n_cluster {k} with avg_score = {avg_score}')
    
    df_silhouette.loc[i, 'cluster_type'] = 'KMeans'
    df_silhouette.loc[i, 'n_cluster'] = k
    df_silhouette.loc[i, 'avg_score'] = avg_score


KeyboardInterrupt: 

In [21]:
df_silhouette

Unnamed: 0,cluster_type,n_cluster,avg_score
129,KMeans,130,0.0802393
130,KMeans,131,0.0814365
131,KMeans,132,0.0889016
132,KMeans,133,0.0775118
133,KMeans,134,0.0807942
134,KMeans,135,0.0823189
135,KMeans,136,0.0803586
136,KMeans,137,0.0834806
137,KMeans,138,0.0819528
138,KMeans,139,0.0832413


In [22]:
for i, k in enumerate([148, 149, 150]):
    
    # Run the Kmeans algorithm
    km = KMeans(n_clusters=k)
    labels = km.fit_predict(X_std)
    centroids = km.cluster_centers_

    # Get silhouette samples
    silhouette_vals = silhouette_samples(X_std, labels)

    # Silhouette
    for i, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        
    # Get the average silhouette score
    avg_score = np.mean(silhouette_vals)
    
    if k % 50 == 0:
        print(f'Current n_cluster {k} with avg_score = {avg_score}')
    
    df_silhouette.loc[i, 'cluster_type'] = 'KMeans'
    df_silhouette.loc[i, 'n_cluster'] = k
    df_silhouette.loc[i, 'avg_score'] = avg_score


Current n_cluster 150 with avg_score = 0.08154976785749042


In [23]:
df_silhouette

Unnamed: 0,cluster_type,n_cluster,avg_score
129,KMeans,130,0.0802393
130,KMeans,131,0.0814365
131,KMeans,132,0.0889016
132,KMeans,133,0.0775118
133,KMeans,134,0.0807942
134,KMeans,135,0.0823189
135,KMeans,136,0.0803586
136,KMeans,137,0.0834806
137,KMeans,138,0.0819528
138,KMeans,139,0.0832413


In [None]:
df_silhouette.to_csv("../data/task2/")

In [4]:
df_scores = pd.read_csv('../data/task2/silhouette.csv')
df_scores

Unnamed: 0.1,Unnamed: 0,cluster_type,n_cluster,avg_score
0,1,KMeans,2,0.042845
1,2,KMeans,3,0.035767
2,3,KMeans,4,0.040143
3,4,KMeans,5,0.039844
4,5,KMeans,6,0.039268
5,6,KMeans,7,0.024363
6,7,KMeans,8,0.061844
7,8,KMeans,9,0.041767
8,9,KMeans,10,0.055749
9,10,KMeans,11,0.049665


In [8]:
df_scores[df_scores.avg_score == df_scores['avg_score'].max()]

Unnamed: 0.1,Unnamed: 0,cluster_type,n_cluster,avg_score
34,35,KMeans,36,0.174719
