In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


Set the random seed for reproducibility (and the meaning of life).


In [None]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(seed=42)


## Load the Dataset


In [None]:
# Load the "penguins" example dataset through seaborn's dataset loader.
penguins_df = sns.load_dataset(name="penguins")


In [None]:
# Preview the first five rows.
penguins_df.head(n=5)


In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
penguins_df.info()


Check for missing values.


In [None]:
# Count the missing entries in each column.
penguins_df.isna().sum()


There are some missing values. For this clustering task, we can simply drop rows with missing values.


In [None]:
# Keep only the rows that have no missing value in any column.
penguins_df = penguins_df.dropna(axis='index', how='any')


In [None]:
# Number of rows left after dropping incomplete records.
penguins_df.shape[0]


## Prepare the Data for Clustering


We can use the numeric features for clustering and save the species labels, so that we may compare them to our clusters later.


In [None]:
# Numeric measurements that k-means will cluster on.
feature_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Keep the true species aside; it is not fed to the clustering and is only
# used afterwards to judge the clusters.
species_labels = penguins_df['species']

# Feature matrix restricted to the numeric columns listed above.
X = penguins_df[feature_columns]


In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)


In [None]:
# Summary statistics per feature; useful for comparing the features' scales.
X.describe()


## Apply K-Means Clustering


We know that there are 3 penguin species, so we will start by creating 3 clusters.


In [None]:
from sklearn.cluster import KMeans


In [None]:
# How many clusters k-means should look for.
NUMBER_OF_CLUSTERS = 3

# Fit k-means on the raw (unscaled) features. The fixed random_state keeps the
# centroid initialisation, and therefore the labels, the same on every run.
kmeans = KMeans(
    n_clusters=NUMBER_OF_CLUSTERS,
    n_init='auto',
    random_state=42,
)

# After fitting, labels_ holds the cluster id assigned to each row of X.
cluster_labels = kmeans.fit(X).labels_


In [None]:
# Show how many penguins landed in each cluster instead of dumping the whole
# label array: a compact table is easier to read and keeps the output short.
pd.Series(cluster_labels, name='cluster').value_counts().sort_index()


## Visualise the Clusters


We can visualise the clusters using a pairplot to see how well they separate.


In [None]:
# Build a plotting frame: the features plus one extra column holding each
# row's cluster id (X itself is left untouched).
plot_df = X.assign(cluster=cluster_labels)

# Pairwise scatter plots of every feature pair, coloured by cluster.
sns.pairplot(data=plot_df, hue='cluster', palette='Set2')
plt.show()


## Compare Clusters to Actual Species


We can create a cross-tabulation to see how our clusters correspond to the actual species.


In [None]:
# Pair every penguin's true species with the cluster it was assigned to.
comparison_df = species_labels.to_frame(name='species')
comparison_df['cluster'] = cluster_labels

# Contingency table: rows are species, columns are cluster ids.
pd.crosstab(index=comparison_df['species'], columns=comparison_df['cluster'])


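We can use the adjusted Rand index (ARI) to measure how closely our clusters match the actual species. A score of 1 means perfect agreement, a score close to 0 means the agreement is no better than we would expect from random assignment, and negative scores are possible when the agreement is worse than chance.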


In [None]:
from sklearn.metrics import adjusted_rand_score


In [None]:
# Agreement between the true species and the clusters from the raw features.
ari = adjusted_rand_score(
    labels_true=species_labels,
    labels_pred=cluster_labels,
)
print(f"Adjusted Rand Index: {ari:.4f}")


## Try with Normalised Features


The features have different scales (e.g. body_mass_g is in the thousands, whilst bill measurements are under 100). Here we can normalise the data by standardising each feature to zero mean and unit variance, and see if that improves the clustering.


In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Learn each feature's mean and standard deviation from X ...
scaler = StandardScaler().fit(X)

# ... then rescale X with those statistics.
X_scaled = scaler.transform(X)


In [None]:
# Preview the first rows of the scaled features with their column names,
# mirroring the earlier X.head(), rather than echoing the raw unlabeled array.
pd.DataFrame(X_scaled, columns=X.columns, index=X.index).head()


Now apply k-means to the scaled data.


In [None]:
# Same k-means configuration as for the raw features, now fitted on the
# scaled feature matrix.
kmeans_scaled = KMeans(
    n_clusters=NUMBER_OF_CLUSTERS,
    n_init='auto',
    random_state=42,
)

# After fitting, labels_ holds the cluster id assigned to each row of X_scaled.
cluster_labels_scaled = kmeans_scaled.fit(X_scaled).labels_


In [None]:
# Pair every penguin's true species with its cluster from the scaled run.
comparison_df_scaled = species_labels.to_frame(name='species')
comparison_df_scaled['cluster'] = cluster_labels_scaled

# Contingency table: rows are species, columns are cluster ids.
pd.crosstab(
    index=comparison_df_scaled['species'],
    columns=comparison_df_scaled['cluster'],
)


In [None]:
# Agreement between the true species and the clusters from the scaled features.
ari_scaled = adjusted_rand_score(
    labels_true=species_labels,
    labels_pred=cluster_labels_scaled,
)
print(f"Adjusted Rand Index (scaled): {ari_scaled:.4f}")


We can now visualise the scaled clustering results.


In [None]:
# Attach the scaled-run cluster ids to a copy of the unscaled features, so the
# plot axes stay in the original measurement units.
plot_df_scaled = X.assign(cluster=cluster_labels_scaled)

# Pairwise scatter plots of every feature pair, coloured by cluster.
sns.pairplot(data=plot_df_scaled, hue='cluster', palette='Set2')
plt.show()


## Compare Results


In [None]:
# Report both ARI scores together; sep="\n" puts each message on its own line.
print(
    "Clustering Results Comparison:",
    f"Without scaling - ARI: {ari:.4f}",
    f"With scaling - ARI: {ari_scaled:.4f}",
    sep="\n",
)


The normalised features give us a better clustering result (a higher ARI). This makes sense because k-means uses distance calculations, and features with larger values (here, body_mass_g) would dominate the distance calculation without normalisation.


## Visualise Actual Species vs Clusters


Here we can create side-by-side plots to compare the actual species to our best clustering result.


In [None]:
# One row, two panels: true species on the left, clusters on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
species_ax, cluster_ax = axes

# Both panels plot the same pair of features so they can be compared directly.
shared_xy = dict(x='flipper_length_mm', y='body_mass_g')

# Left panel: points coloured by the recorded species.
sns.scatterplot(
    data=penguins_df,
    hue='species',
    palette='Set1',
    ax=species_ax,
    **shared_xy,
)
species_ax.set_title('Actual Species')

# Right panel: the same points coloured by the cluster found on the scaled
# features. assign() returns a new frame, so penguins_df is not modified.
plot_comparison = penguins_df.assign(cluster=cluster_labels_scaled)
sns.scatterplot(
    data=plot_comparison,
    hue='cluster',
    palette='Set2',
    ax=cluster_ax,
    **shared_xy,
)
cluster_ax.set_title('Predicted Clusters (Scaled Features)')

plt.tight_layout()
plt.show()
