In [None]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the raw penguin records from disk.
penguins = pd.read_csv("penguins.csv")

# One-hot encode the categorical "sex" column; drop_first=True leaves out the
# indicator column of the first category.
sex_dummies = pd.get_dummies(penguins["sex"], drop_first=True)

# Append the indicator column(s) to the original frame, then discard the raw
# "sex" column they replace.
penguins_dummies = pd.concat([penguins, sex_dummies], axis=1).drop(columns="sex")

# Standardize every column with StandardScaler and wrap the result back into
# a DataFrame that keeps the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(penguins_dummies)
penguins_scaled = pd.DataFrame(scaled_values, columns=penguins_dummies.columns)

# Elbow method: fit k-means once per candidate cluster count and record the
# inertia reported by each fitted model.
num_clusters = range(1, 10)

inertia = [
    KMeans(n_clusters=k, random_state=42).fit(penguins_scaled).inertia_
    for k in num_clusters
]

# Plot inertia against the very same candidate range that was used for the
# fits, so the x-axis cannot fall out of sync with the collected values when
# the range is edited.
plt.plot(num_clusters, inertia, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

# Number of clusters used for the final model.
n_clusters = 4

# Fit the final k-means model on the scaled data and store each row's cluster
# assignment on the unscaled frame.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(penguins_scaled)
penguins_dummies["label"] = kmeans.labels_

# Visualize the clusters against the second-to-last column of the frame
# (the column sitting just before "label"; plotted on the axis labelled "Male").
cluster_ids = penguins_dummies["label"]
indicator_values = penguins_dummies.iloc[:, -2]
plt.scatter(cluster_ids, indicator_values, c=kmeans.labels_, cmap="viridis")
plt.xlabel("Cluster")
plt.ylabel("Male")

# One tick per cluster id, from the smallest to the largest assigned label.
lowest_id = cluster_ids.min()
highest_id = int(cluster_ids.max())
plt.xticks(range(lowest_id, highest_id + 1))
plt.title(f"K-means Clustering (K={n_clusters})")
plt.show()
          
# Build the final 'stat_penguins' DataFrame: the per-cluster mean of the
# selected measurement columns, indexed by cluster label.
numeric_columns = [
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "label",
]
selected_measurements = penguins_dummies[numeric_columns]
stat_penguins = selected_measurements.groupby("label").mean()
# Bare expression so a notebook cell displays the resulting table.
stat_penguins