In [1]:
# Import Required Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Loading and examining the dataset
penguins_df = pd.read_csv("penguins.csv")

In [3]:
# Remove non-numeric colomns (sex , index)
numeric_columns = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
df_numeric = penguins_df[numeric_columns].copy()
df_clean = df_numeric.dropna()

In [4]:
#Feature scalling 
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_clean)
df_scaled = pd.DataFrame(df_scaled, columns=numeric_columns, index=df_clean.index)


In [5]:
#Finding optimal k clusters by elbo method
k_range = range(2,10)
inertias = []

for k in k_range:
    kmeans_temp = KMeans(n_clusters=k , random_state=42 , n_init=10)
    kmeans_temp.fit(df_scaled)
    inertias.append(kmeans_temp.inertia_)

inertia_diff = np.diff(inertias)  # First differences
inertia_diff2 = np.diff(inertia_diff)  # Second differences (curvature)
elbow_index = np.argmax(inertia_diff2) + 2  # +2 because we lost 2 points in double diff
optimal_k = k_range[elbow_index] 

In [6]:
#Final clustring
kmeans_final = KMeans(n_clusters=optimal_k , random_state=42 , n_init=10)
cluster_labels = kmeans_final.fit_predict(df_scaled)

# Add cluster labels to the original clean dataframe
df_clean_with_clusters = df_clean.copy()
df_clean_with_clusters['cluster'] = cluster_labels

In [7]:
#stat_penguin dataframe
stat_penguins = df_clean_with_clusters.groupby('cluster')[numeric_columns].mean()

print(stat_penguins)

         culmen_length_mm  culmen_depth_mm  flipper_length_mm  body_mass_g
cluster                                                                   
0               49.792982        15.738596         221.912281  5519.736842
1               38.330233        18.101550         188.651163  3590.697674
2               45.522581        14.314516         212.935484  4699.596774
3               47.735714        18.751190         196.869048  3897.023810
