In [15]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_iris

import plotly.express as px
import plotly.figure_factory as ff

In [16]:
# Load the iris sample dataset that ships with Plotly Express.
df = px.data.iris()

In [3]:
# Number of rows per species (checks how balanced the classes are).
df['species'].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [4]:
# Average sepal length within each species.
df.groupby('species')['sepal_length'].mean()

species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

In [5]:
# Sepal length against sepal width, one colour per species.
px.scatter(df, x='sepal_length', y='sepal_width', color='species')

In [6]:
# Petal length against petal width, one colour per species.
px.scatter(df, x='petal_length', y='petal_width', color='species')

In [7]:
# Preview the first rows to see the available columns.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


In [8]:
# df.drop(['species', 'species_id'], axis=1, inplace=True)

In [9]:
# Standardize only the four measurement columns. Selecting them by name (instead of
# dropping the label columns) keeps X limited to these features even if this cell is
# re-run after a later cell has added columns such as 'prediction' to df.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
sc = StandardScaler()
X = sc.fit_transform(df[feature_columns])

In [22]:
# Fit a 3-cluster KMeans on the standardized features. The fixed random_state makes
# the centroid initialisation repeatable, so the cluster numbering shown in the
# following cells does not change from one run to the next.
km = KMeans(n_clusters=3, n_init='auto', random_state=0)
km.fit(X)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [23]:
# Undo the standardization on the centroids so they read in the original measurement
# units, label them with the first four column names of df, and round to 3 decimals.
centers_in_original_units = sc.inverse_transform(km.cluster_centers_)
matrix = pd.DataFrame(data=centers_in_original_units, columns=df.columns[:4]).round(3)

In [24]:
# Show the centroid table (one row per cluster, one column per feature).
matrix

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.006,3.418,1.464,0.244
1,5.834,2.677,4.421,1.436
2,6.807,3.12,5.523,1.982


In [None]:
# Column labels of the centroid table as a plain Python list.
matrix.columns.to_list()

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [None]:
# Annotated heatmap of the centroid table: features on the x axis, cluster index on the y axis.
ff.create_annotated_heatmap(
    z=matrix.to_numpy(),
    x=list(matrix.columns),
    y=list(matrix.index),
)

In [None]:
# Wrap one new measurement in a single-row frame labelled with the first four column
# names of df, then standardize it with the already-fitted scaler.
new_sample = pd.DataFrame(data=np.array([[4.8, 2.8, 4.9, 1.6]]), columns=df.columns[:4])
to_predict = sc.transform(new_sample)

In [None]:
# Cluster index assigned to the standardized new sample.
km.predict(to_predict)

array([1], dtype=int32)

In [None]:
def get_inertia(model, dataset, distance):
    """Total distance between every sample and the centroid of its assigned cluster.

    Parameters
    ----------
    model : fitted clustering estimator
        Must expose ``predict(dataset)`` (one cluster index per row) and
        ``cluster_centers_`` (one centroid per row), as a fitted ``KMeans`` does.
    dataset : array-like of shape (n_samples, n_features)
        Samples expressed in the same feature space the model was fitted on.
    distance : str
        ``'euclidian'`` (original spelling, kept for existing callers) or
        ``'euclidean'``: sum of squared Euclidean distances.
        ``'manhattan'``: sum of L1 (absolute) distances.

    Returns
    -------
    float
        The accumulated distance over all samples.

    Raises
    ------
    ValueError
        If ``distance`` is not one of the supported names.
    """
    points = np.asarray(dataset, dtype=float)
    labels = np.asarray(model.predict(dataset))
    # Row i of `centroid` is the centre of the cluster that sample i belongs to.
    centroid = np.asarray(model.cluster_centers_, dtype=float)[labels]
    deltas = points - centroid
    if distance in ('euclidian', 'euclidean'):
        inertia = float(np.sum(deltas ** 2))
        return inertia
    elif distance == 'manhattan':
        inertia = float(np.sum(np.abs(deltas)))
        return inertia
    else:
        # Fail loudly instead of silently returning None for an unsupported metric.
        raise ValueError(
            f"Unsupported distance {distance!r}; expected 'euclidean' or 'manhattan'."
        )

In [None]:
# Inertia reported by the fitted model.
km.inertia_

55.94156386606016

In [None]:
# Elbow method: collect the inertia (within-cluster sum of squares) for k = 2..9.
# Each trial model lives only inside the comprehension, so the fitted 3-cluster `km`
# used by other cells is no longer overwritten by a throwaway 9-cluster model.
wcss = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(X).inertia_
    for n_clusters in range(2, 10)
]

In [None]:
# Elbow curve: inertia for each tested number of clusters (2 through 9).
px.line(x=list(range(2, 10)), y=wcss)

In [None]:
# Refit the 3-cluster model. random_state is fixed (same seed as the k-sweep loops)
# so the silhouette score computed from this model is repeatable.
km = KMeans(n_clusters=3, random_state=0)
km.fit(X)

In [None]:
# Silhouette score of the current model, based on the cluster assigned to each row of X.
assigned_clusters = km.predict(X)
silhouette_score(X, assigned_clusters)

0.46214947389312017

In [None]:
# Silhouette score for k = 2..9. The trial model gets its own name so the global `km`
# is not left pointing at the last (9-cluster) model once the loop finishes.
silhouette_scores = []

for n_clusters in range(2, 10):
    trial_model = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    silhouette_scores.append(silhouette_score(X, trial_model.predict(X)))

In [None]:
# Bar chart of the silhouette score for each tested number of clusters (2 through 9).
px.bar(x=range(2,10), y=silhouette_scores)

In [None]:
# Final 3-cluster model; store the cluster index of every row next to the true labels.
# random_state is fixed so the (arbitrary) cluster numbering is the same on every run.
# NOTE: any hard-coded cluster -> species pairing further down is only valid for one
# particular numbering, so re-check it against the per-cluster value_counts cells.
km = KMeans(n_clusters=3, random_state=0)
km.fit(X)
df['prediction'] = km.predict(X)

In [None]:
# Cluster label and true species_id side by side.
df[['prediction', 'species_id']]

Unnamed: 0,prediction,species_id
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
145,0,3
146,2,3
147,0,3
148,0,3


In [None]:
# Mean signed difference between the 0-based cluster label and the 1-based species_id.
# (Indexing df with the integer difference would be treated as a column lookup and
# raise a KeyError, so the difference is aggregated directly.)
(df['prediction'] - df['species_id']).mean()

-0.96

In [None]:
# Number of rows per species_id; the ids run from 1 to 3.
df['species_id'].value_counts()

species_id
1    50
2    50
3    50
Name: count, dtype: int64

In [None]:
# Shift the 0-based KMeans labels to 1-based so they share the 1..3 range of species_id.
# Recomputing from the model (instead of df['prediction'] + 1) keeps this cell
# idempotent: running it twice no longer shifts the labels a second time.
df['prediction'] = km.predict(X) + 1

In [None]:
# True species of the rows assigned to cluster 1.
df.loc[df['prediction'] == 1, 'species_id'].value_counts()

species_id
3    36
2    11
Name: count, dtype: int64

In [None]:
# True species of the rows assigned to cluster 2.
df.loc[df['prediction'] == 2, 'species_id'].value_counts()

species_id
1    50
Name: count, dtype: int64

In [None]:
# True species of the rows assigned to cluster 3.
df.loc[df['prediction'] == 3, 'species_id'].value_counts()

species_id
2    39
3    14
Name: count, dtype: int64

In [None]:
def associate_predictions(x):
    """Translate a 1-based cluster label into its paired species_id.

    The pairing is fixed: 1 -> 3, 2 -> 1, 3 -> 2. Any other value falls
    through and yields None.
    """
    pairings = ((1, 3), (2, 1), (3, 2))
    for cluster_label, species_id in pairings:
        if x == cluster_label:
            return species_id
    return None

In [None]:
# Relabel every cluster with the species_id that is most common among its rows.
# KMeans numbers its clusters arbitrarily, so deriving the pairing from the data keeps
# this column meaningful even when a re-fit returns the clusters in a different order
# (a hard-coded pairing is only right for one specific numbering).
cluster_to_species = df.groupby('prediction')['species_id'].agg(lambda ids: ids.mode().iloc[0])
df['prediction_reassociated'] = df['prediction'].map(cluster_to_species)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Micro-averaged F1 of the raw (not yet re-mapped) cluster labels against the true species.
# scikit-learn's signature is f1_score(y_true, y_pred, ...): the ground truth goes first.
f1_score(df['species_id'], df['prediction'], average='micro')

0.09333333333333334

In [None]:
# Micro-averaged F1 of the re-mapped cluster labels against the true species.
# scikit-learn's signature is f1_score(y_true, y_pred, ...): the ground truth goes first.
f1_score(df['species_id'], df['prediction_reassociated'], average='micro')

0.8333333333333334

In [None]:
# Rows with species_id == 2 whose re-mapped cluster label disagrees with the true species.
is_mismatch = df['prediction_reassociated'] != df['species_id']
is_species_2 = df['species_id'] == 2
df[is_mismatch & is_species_2]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id,prediction,prediction_reassociated
50,7.0,3.2,4.7,1.4,versicolor,2,1,3
51,6.4,3.2,4.5,1.5,versicolor,2,1,3
52,6.9,3.1,4.9,1.5,versicolor,2,1,3
56,6.3,3.3,4.7,1.6,versicolor,2,1,3
65,6.7,3.1,4.4,1.4,versicolor,2,1,3
70,5.9,3.2,4.8,1.8,versicolor,2,1,3
75,6.6,3.0,4.4,1.4,versicolor,2,1,3
76,6.8,2.8,4.8,1.4,versicolor,2,1,3
77,6.7,3.0,5.0,1.7,versicolor,2,1,3
85,6.0,3.4,4.5,1.6,versicolor,2,1,3


In [None]:
# Mean sepal length of the rows with species_id == 2.
df.loc[df['species_id'] == 2, 'sepal_length'].mean()

5.936

In [None]:
# Mean sepal length of the rows with species_id == 3.
df.loc[df['species_id'] == 3, 'sepal_length'].mean()

6.587999999999998

In [None]:
from sklearn.metrics.pairwise import manhattan_distances