# K-Means

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from cluster.KMeans import KMeans

## Import Dataset
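We use the Iris dataset bundled with scikit-learn (four numeric features per sample, three species) to test the K-Means implementation.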

In [2]:
from sklearn.datasets import load_iris

# Load the bundled Iris data and assemble a frame with one column per feature
# followed by a trailing 'species' column holding each sample's class name.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target_names[iris.target]
)

In [3]:
# Show the assembled frame: the four feature columns plus 'species'.
# The rich repr abbreviates the middle rows, as seen in the output below.
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
import plotly.express as px

def plot_iris_clusters(iris, labels, class_names=None):
    """Scatter-plot petal length against petal width, coloured by cluster.

    Parameters
    ----------
    iris : pandas.DataFrame
        Frame containing the 'petal length (cm)' and 'petal width (cm)'
        columns. It is copied, so the caller's frame is not modified.
    labels : array-like
        One predicted cluster label per row of ``iris``.
    class_names : dict, optional
        Mapping from cluster label to the name shown in the legend. Any label
        missing from the mapping (or every label, when this is ``None``) is
        shown as ``"Cluster <label>"``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    iris_df = iris.copy()

    # Add the predicted labels to the iris dataframe
    iris_df['predicted_label'] = labels

    # K-Means cluster indices are arbitrary, so they are not tied to species
    # names by default; unmapped labels get a neutral name instead of NaN,
    # which also keeps the plot complete when more than three clusters are used.
    names = class_names or {}
    iris_df['predicted_class'] = iris_df['predicted_label'].map(
        lambda label: names.get(label, f"Cluster {label}")
    )

    # Plot the clustering result using plotly
    fig = px.scatter(
        iris_df,
        x='petal length (cm)',
        y='petal width (cm)',
        color='predicted_class',
        title='Clustering Result - Iris Dataset',
        labels={'petal length (cm)': 'Petal Length',
                'petal width (cm)': 'Petal Width'},
    )
    fig.show()

In [5]:
# Fit the model
# n_clusters=3 with a fixed random_state of 42 — presumably so the fit is
# repeatable between runs (confirm against cluster.KMeans).
# iloc[:, :-1] drops the trailing 'species' column so only the numeric
# feature columns are passed to fit.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(iris_df.iloc[:, :-1])

In [6]:
# Feature matrix: every column except the trailing 'species' label.
features = iris_df.iloc[:, :-1]
# Predict a cluster for each sample, one row at a time.
# NOTE(review): elbow_method calls predict on a whole 2-D array; if that gives
# the same labels, this loop could be a single call — confirm with KMeans.
predicted_labels = np.array([kmeans.predict(row) for row in features.values])

In [7]:
# Visualise the cluster assignments on the two petal measurements.
plot_iris_clusters(iris_df, predicted_labels)

## Iris Dataset Evaluation

In [8]:
# Report both quality metrics for the 3-cluster model fitted above.

# Inertia, lower is better
print("Inertia:", kmeans.inertia)

# Silhouette score, higher is better
# (computed on the feature columns only; the 'species' column is sliced off)
print("Silhouette Score:", kmeans.silhouette(iris_df.iloc[:, :-1].values))

Inertia: 78.8556658259773
Silhouette Score: 0.551191604619592


In [9]:
# Same silhouette computation as the print in the previous cell, shown here
# as the cell's own output value.
kmeans.silhouette(iris_df.iloc[:, :-1].values)

0.551191604619592

In [10]:
def elbow_method(data, max_clusters=10, verbose=True, random_state=None):
    """Fit one KMeans model per cluster count and collect its quality metrics.

    Parameters
    ----------
    data : array-like
        Samples to cluster. Passed unchanged to ``KMeans.fit``,
        ``KMeans.predict`` and ``KMeans.silhouette``.
    max_clusters : int, default 10
        Largest cluster count to try; the sweep starts at 2.
    verbose : bool, default True
        When true, print one line per cluster count with both metrics.
    random_state : optional, default None
        When given, forwarded to every ``KMeans`` instance so the sweep can be
        repeated with the same seed. When ``None``, ``KMeans`` is constructed
        with ``n_clusters`` only.

    Returns
    -------
    tuple of (list, list)
        ``(inertias, silhouettes)``; entry ``j`` of each list belongs to
        ``j + 2`` clusters. A silhouette entry is ``np.nan`` when the fitted
        model assigns every sample to the same cluster.
    """
    # Forward the seed only when the caller supplied one, so the default call
    # constructs KMeans with n_clusters alone.
    model_kwargs = {} if random_state is None else {"random_state": random_state}

    inertias = []
    silhouettes = []
    for n_clusters in range(2, max_clusters + 1):
        model = KMeans(n_clusters=n_clusters, **model_kwargs)
        model.fit(data)
        inertias.append(model.inertia)

        # Compute the silhouette only when the model actually uses more than
        # one distinct label; otherwise record NaN for this cluster count.
        if np.unique(model.predict(data)).size > 1:
            silhouette = model.silhouette(data)
        else:
            silhouette = np.nan
        silhouettes.append(silhouette)

        if verbose:
            print(f"Clusters: {n_clusters}, Inertia: {model.inertia}, Silhouette Score: {silhouette}")
    return inertias, silhouettes


In [11]:
# Sweep cluster counts 2..10 (elbow_method's default max_clusters) on the Iris
# features; per-count metrics are printed because verbose defaults to True.
inertias, silhouettes = elbow_method(features.values)

Clusters: 2, Inertia: 152.34795176035792, Silhouette Score: 0.6810461692117462
Clusters: 3, Inertia: 78.85144142614601, Silhouette Score: 0.5528190123564095
Clusters: 4, Inertia: 57.25600931571816, Silhouette Score: 0.4974551890173751
Clusters: 5, Inertia: 46.47223015873016, Silhouette Score: 0.4930804067193521
Clusters: 6, Inertia: 47.80062335080588, Silhouette Score: 0.34187325506759103
Clusters: 7, Inertia: 34.62008318478907, Silhouette Score: 0.3494067645448536
Clusters: 8, Inertia: 32.95306240490788, Silhouette Score: 0.34926167135933356
Clusters: 9, Inertia: 34.71368243598506, Silhouette Score: 0.28721335752296095
Clusters: 10, Inertia: 30.917367593191123, Silhouette Score: 0.23536008178407855


In [12]:
# Plot the inertia and silhouette values
import plotly.graph_objects as go

def plot_inertia_and_silhouette(inertias, silhouettes, min_clusters=2):
    """Plot inertia and silhouette score against the number of clusters.

    Parameters
    ----------
    inertias : sequence
        Inertia per cluster count, in increasing order of cluster count.
    silhouettes : sequence
        Silhouette score per cluster count, in the same order.
    min_clusters : int, default 2
        Cluster count that the first entry of each sequence belongs to.
        The default matches the sweep done by ``elbow_method``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    inertia_counts = list(range(min_clusters, min_clusters + len(inertias)))
    silhouette_counts = list(range(min_clusters, min_clusters + len(silhouettes)))

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=inertia_counts, y=inertias,
                             mode='lines+markers', name='Inertia'))
    # The two metrics live on very different scales, so the silhouette curve
    # gets its own right-hand axis instead of being flattened by the inertia.
    fig.add_trace(go.Scatter(x=silhouette_counts, y=silhouettes,
                             mode='lines+markers', name='Silhouette Score',
                             yaxis='y2'))
    fig.update_layout(
        title='Inertia and Silhouette Score vs. Number of Clusters',
        xaxis_title='Number of Clusters',
        yaxis_title='Inertia',
        yaxis2=dict(title='Silhouette Score', overlaying='y', side='right'),
        # Horizontal legend above the plot so it does not cover the right axis.
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
    )
    fig.show()

In [13]:
# Plot the metrics collected by the elbow sweep above.
plot_inertia_and_silhouette(inertias, silhouettes)

## External Dataset Evaluation
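Uncomment the cells below to try your own data. Every column of the CSV is passed to `KMeans`, so the file should contain only the feature columns you want to cluster on.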

In [14]:
# DATA = "DATA/PATH/HERE.csv"
# N_CLUSTERS = 3

In [15]:
# df = pd.read_csv(DATA)
# df.head()

In [16]:
# # Use the elbow method to find the best number of clusters
# inertias, silhouettes = elbow_method(df.values)

# # Plot the inertia and silhouette values
# plot_inertia_and_silhouette(inertias, silhouettes)

In [17]:
# kmeans = KMeans(n_clusters=N_CLUSTERS)
# kmeans.fit(df)

In [18]:
# # Print the inertia and silhouette score
# print("Inertia:", kmeans.inertia)
# print("Silhouette Score:", kmeans.silhouette(df.values))