In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Read the Iris table from disk; the 'Id' column becomes the row index.
data = pd.read_csv(
    filepath_or_buffer='./Iris.csv',
    index_col='Id',
)

# Show the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Sepal length against sepal width, coloured by the labelled species.
px.scatter(
    data_frame=data,
    x='SepalLengthCm',
    y='SepalWidthCm',
    color='Species',
)

In [4]:
# Build the clustering inputs from the measurement columns only.
# 'Species' is the label column and must be present, so it is dropped strictly.
# A later cell writes a 'cluster' column into `data`; dropping it here (only
# if it exists) keeps this cell idempotent, so re-running it after clustering
# does not feed the previous cluster labels back in as a feature.
features = (
    data
    .drop(columns=['Species'])
    .drop(columns=['cluster'], errors='ignore')
)

In [5]:
# Standardise the feature columns before clustering: first learn the scaling
# parameters from `features`, then apply them to the same data.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

In [6]:
# Elbow method: fit KMeans for k = 1..10 and record each model's inertia
# (the value scikit-learn exposes as `inertia_`, i.e. the sum of squared
# distances of samples to their closest cluster centre).
inertia = []
K_range = range(1, 11)
for k in K_range:
    # n_init='auto' mirrors the configuration of the final model fitted
    # further down, so the curve describes the same kind of fit.
    km = KMeans(n_clusters=k, random_state=42, n_init='auto')
    km.fit(scaled_features)
    inertia.append(km.inertia_)

fig_score = px.line(
    x=list(K_range),
    y=inertia,
    markers=True,
    # The plotted quantity is inertia, not an averaged coefficient.
    labels={'x': 'Amount of clusters (k)', 'y': 'Inertia (within-cluster sum of squares)'},
    title='Find optimal K with Elbow method '
)
fig_score.show()

In [7]:
# Silhouette method: for k = 2..10 cluster the scaled features and record the
# mean silhouette coefficient. The range starts at 2 because this loop feeds
# `silhouette_score`, which needs at least two distinct labels.
silhouette_scores = []
k_range = range(2, 11)
for k in k_range:
    # Use the same n_init as the final model that is refit with the chosen k,
    # so the score used for selection comes from the same configuration.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    labels = kmeans.fit_predict(scaled_features)
    score = silhouette_score(scaled_features, labels)
    silhouette_scores.append(score)

fig_score = px.line(
    x=list(k_range),
    y=silhouette_scores,
    markers=True,
    labels={'x': 'Amount of clusters (k)', 'y': 'Average silhouette coef'},
    title='Find optimal K with Silhouette method '
)
fig_score.show()

In [8]:
# Choose the k whose average silhouette score is highest; `silhouette_scores`
# is positionally aligned with `k_range`.
optimal_k = k_range[int(np.argmax(silhouette_scores))]
print("Оптимальне значення k: ", optimal_k)

# Refit KMeans with the selected k and store each row's cluster id on `data`.
kmeans = KMeans(
    n_clusters=optimal_k,
    n_init='auto',
    random_state=42,
)
cluster_labels = kmeans.fit_predict(scaled_features)
data['cluster'] = cluster_labels

Оптимальне значення k:  2


In [9]:
# Sepal length against sepal width, coloured by the KMeans cluster assignment.
# The cluster ids are integers, which Plotly Express would render as a
# continuous colour scale; casting them to strings on a plotting copy makes
# each cluster a discrete colour with its own legend entry. `data` itself is
# left unchanged.
px.scatter(
    data.assign(cluster=data['cluster'].astype(str)),
    x='SepalLengthCm',
    y='SepalWidthCm',
    color='cluster',
    title='Clusters by KMeans',
)

In [10]:
# Reference plot on the same axes, coloured by the true species label, for
# visual comparison with the KMeans clusters.
px.scatter(
    data_frame=data,
    x='SepalLengthCm',
    y='SepalWidthCm',
    color='Species',
    title='Original Species',
)