# Libraries

In [None]:
%pip install scikit-learn

import numpy as np
import pandas as pd
import os
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
from sklearn.cluster import KMeans # type: ignore
import plotly.express as px # type: ignore
import seaborn as sns
import plotly.graph_objects as go # type: ignore
from sklearn.preprocessing import MinMaxScaler # type: ignore
from sklearn.metrics import silhouette_score # type: ignore
from sklearn.impute import SimpleImputer # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore

# Load the data and preview it

In [2]:
# Load the raw penguin table from the CSV that sits next to the notebook.
penguin = pd.read_csv("penguins.csv")

# Rows with missing values are deliberately kept at this point (no dropna).

# Split positionally: every column but the last is a feature, the last column
# is kept apart and is only meant to be used as a label when visualizing.
feature_columns = penguin.iloc[:, :-1]
label_column = penguin.iloc[:, -1]
X = feature_columns.values  # feature matrix handed to the clustering steps
y = label_column.values     # labels, used just for visualization

# Preview the first rows with a cubehelix colour gradient on the cells.
preview_cmap = sns.cubehelix_palette(as_cmap=True)
penguin.head().style.background_gradient(cmap=preview_cmap)

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,39.1,18.7,181.0,3750.0,MALE
1,39.5,17.4,186.0,3800.0,FEMALE
2,40.3,18.0,195.0,3250.0,FEMALE
3,,,,,
4,36.7,19.3,193.0,3450.0,FEMALE


# EDA and data processing

In [3]:
# Pie chart with one slice per distinct value of the `sex` column.
fig = px.pie(
    data_frame=penguin,
    names='sex',
    title='Data Distribution',
    template='plotly',
    color_discrete_sequence=['#491D8B','#7D3AC1','#EB548C','#F1C40F'],
)
fig.show()

In [4]:
# Vertical box plot: distribution of culmen length, one box (and colour) per sex.
fig = px.box(
    penguin,
    x='sex',
    y='culmen_length_mm',
    color='sex',
    orientation='v',
    title='Box Plot of Culmen Length by Sex',
    color_discrete_sequence=['#29066B','#EB548C','#F1C40F', '#F39C12'],
)
fig.show()


In [5]:
# Histogram of culmen length (50 bins requested), coloured by sex.
fig = px.histogram(
    penguin,
    x='culmen_length_mm',
    color='sex',
    nbins=50,
    title='Histogram of Culmen Length by Sex',
    color_discrete_sequence=['#29066B','#EB548C','#F1C40F', '#F39C12'],
)
fig.show()


In [6]:
# Vertical box plot: distribution of culmen depth, one box (and colour) per sex.
fig = px.box(
    penguin,
    x='sex',
    y='culmen_depth_mm',
    color='sex',
    orientation='v',
    title='Box Plot of Culmen Depth by Sex',
    color_discrete_sequence=['#29066B','#EB548C','#F1C40F', '#F39C12'],
)
fig.show()


In [7]:
# Culmen length against culmen depth; colour encodes sex and the marker size
# follows culmen length.
fig = px.scatter(
    penguin,
    x='culmen_length_mm',
    y='culmen_depth_mm',
    color='sex',
    size='culmen_length_mm',
    title='Scatter Plot of Culmen Length vs Depth by Sex',
    template='seaborn',
    color_discrete_sequence=['#29066B','#EB548C','#F1C40F', '#F39C12'],
)

# Fixed canvas size and grey axes.
fig.update_layout(
    width=800,
    height=600,
    xaxis={'color': '#7F8C8D'},
    yaxis={'color': '#7F8C8D'},
)
fig.show()

In [8]:
# Vertical box plot: distribution of flipper length, one box (and colour) per sex.
fig = px.box(
    penguin,
    x='sex',
    y='flipper_length_mm',
    color='sex',
    orientation='v',
    title='Box Plot of Flipper Length by Sex',
    color_discrete_sequence=['#29066B','#EB548C','#F1C40F', '#F39C12'],
)
fig.show()

In [9]:
# Vertical box plot: distribution of body mass, one box (and colour) per sex.
fig = px.box(
    penguin,
    x='sex',
    y='body_mass_g',
    color='sex',
    orientation='v',
    title='Box Plot of Body Mass by Sex',
    color_discrete_sequence=['#29066B','#EB548C','#F1C40F', '#F39C12'],
)
fig.show()

# Finding the optimal number of clusters for the K-Means model

In [10]:
# Pre-processing: fill missing values with the column median, then standardize
# every feature so that all columns are on a comparable scale for K-Means.
imputer = SimpleImputer(strategy='median')   # or 'mean'
scaler  = StandardScaler()

X = imputer.fit_transform(X)
X = scaler.fit_transform(X)

# Candidate numbers of clusters. Kept in a variable so the plot below can use
# the real k values on its x-axis instead of the implicit 0-based list index.
cluster_range = list(range(2, 9))

# `see` collects the inertia (within-cluster sum of squared distances) per k.
see = []
for i in cluster_range:
    # Fixed seed and explicit n_init make the curve reproducible and use the
    # same settings as the final model that is fitted later in the notebook.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300,
                    n_init=10, random_state=0)
    kmeans.fit(X)
    see.append(kmeans.inertia_)

# Elbow curve: inertia as a function of the actual number of clusters.
fig = px.line(x = cluster_range, y = see, template = 'seaborn', title = 'Elbow Method')
fig.update_layout(width = 800, height = 600, xaxis = dict(title = 'Number of Clusters', color = '#7F8C8D'), yaxis = dict(title = 'Cluster Inertia', color = '#7F8C8D'))
fig.show()

# Build K-Means model with optimal number of clusters = 4

In [None]:
# Final model: 4 clusters, k-means++ seeding, 10 restarts, fixed seed.
final_params = dict(
    n_clusters=4,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=0,
)
kmeans = KMeans(**final_params)

# Fit on the feature matrix and keep the cluster index assigned to each row.
y_kmeans = kmeans.fit_predict(X)

# Visualize the clustering results

In [None]:
# Scatter plot of the K-Means result on the first two feature columns
# (culmen length and culmen depth), plus the cluster centroids.
# NOTE: scikit-learn is already installed by the first cell of the notebook,
# so no second `%pip install` is needed here.
fig = go.Figure()

# One colour per cluster index. K-Means cluster ids are arbitrary integers and
# do not correspond to the `sex` categories, so traces are named by id only.
cluster_colors = ['#FF5733', '#33FF57', '#3357FF', '#F39C12']

for cluster_id, cluster_color in enumerate(cluster_colors):
    in_cluster = y_kmeans == cluster_id  # boolean mask of rows in this cluster
    fig.add_trace(go.Scatter(
        x = X[in_cluster, 0], y = X[in_cluster, 1],
        mode = 'markers',
        marker_color = cluster_color, name = f'Cluster {cluster_id}'
    ))

# Centroids of the fitted model, drawn with a larger, distinct marker symbol.
fig.add_trace(go.Scatter(
    x = kmeans.cluster_centers_[:, 0], y = kmeans.cluster_centers_[:, 1],
    mode = 'markers',
    marker_color = '#F1C40F',
    marker_symbol = 4,
    marker_size = 13,
    name = 'Centroids'
))

# X was standardized before clustering, so the axes show standardized values
# rather than millimetres.
fig.update_layout(title = 'K-Means Clustering of Penguins', template = 'seaborn', width = 1000, height = 500, xaxis = dict(title = 'Culmen Length (standardized)', color = '#7F8C8D'), yaxis = dict(title = 'Culmen Depth (standardized)', color = '#7F8C8D'))
fig.show()