# Penguins Dataset

In [65]:
import pandas as pd
import numpy as np
import plotly.express as px
import nbformat

In [66]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("larsen0966/penguins")

print("Path to dataset files:", path)

import os
os.listdir(path)

Path to dataset files: /Users/faisalbalkhair/.cache/kagglehub/datasets/larsen0966/penguins/versions/1


['penguins.csv']

In [67]:
penguins_path = path + '/penguins.csv'
df = pd.read_csv(penguins_path)

df.head(25)

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,6,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
6,7,Adelie,Torgersen,38.9,17.8,181.0,3625.0,female,2007
7,8,Adelie,Torgersen,39.2,19.6,195.0,4675.0,male,2007
8,9,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,2007
9,10,Adelie,Torgersen,42.0,20.2,190.0,4250.0,,2007


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 24.3+ KB


In [69]:
df.isnull().sum()

Unnamed: 0            0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

### The Dataset contains of 333 penguin sample after drop na

In [70]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         333 non-null    int64  
 1   species            333 non-null    object 
 2   island             333 non-null    object 
 3   bill_length_mm     333 non-null    float64
 4   bill_depth_mm      333 non-null    float64
 5   flipper_length_mm  333 non-null    float64
 6   body_mass_g        333 non-null    float64
 7   sex                333 non-null    object 
 8   year               333 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 26.0+ KB


In [71]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])
df['island'] = le.fit_transform(df['island'])
df['sex'] = le.fit_transform(df['sex'])

df.count()

Unnamed: 0           333
species              333
island               333
bill_length_mm       333
bill_depth_mm        333
flipper_length_mm    333
body_mass_g          333
sex                  333
year                 333
dtype: int64

## COPY - PASTE

In [72]:
# Visualize feature correlations using a heatmap
# This helps identify which features are strongly related to each other
# and to the target variable (species)
import plotly.express as px
import plotly.graph_objects as go

# Calculate correlation matrix
corr_matrix = df.corr()

# Create an interactive heatmap with plotly
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='RdBu_r',  # Similar to 'coolwarm'
    zmin=-1, zmax=1,
    text=corr_matrix.round(2).values,
    texttemplate='%{text}',
    textfont={'size': 12},
    hoverongaps=False
))

fig.update_layout(
    title='Feature Correlation Heatmap',
    width=700,
    height=600
)
fig.show()

In [85]:
X = df.drop(['species','Unnamed: 0'], axis=1)
y = df['species']
X

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,2,39.1,18.7,181.0,3750.0,1,2007
1,2,39.5,17.4,186.0,3800.0,0,2007
2,2,40.3,18.0,195.0,3250.0,0,2007
4,2,36.7,19.3,193.0,3450.0,0,2007
5,2,39.3,20.6,190.0,3650.0,1,2007
...,...,...,...,...,...,...,...
339,1,55.8,19.8,207.0,4000.0,1,2009
340,1,43.5,18.1,202.0,3400.0,0,2009
341,1,49.6,18.2,193.0,3775.0,1,2009
342,1,50.8,19.0,210.0,4100.0,1,2009


## Split the data into Train (80%) and Test (20%)

In [86]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42, shuffle=True)

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

logistic_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(random_state=42)
svc_model = SVC()

In [88]:
from sklearn.model_selection import cross_val_score

models = {
    "Logistic Regression": logistic_model,
    "Random Forest": rf_model,
    "SVM": svc_model
}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(name, scores.mean())


lbfgs failed to converge after 1000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge after 1000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge after 1000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS RE

Logistic Regression 0.9886792452830189
Random Forest 0.9849755415793151
SVM 0.7366876310272537


In [89]:
rf_model.fit(X_train,y_train)
score = rf_model.score(X_test,y_test)
score

0.9701492537313433

In [90]:
logistic_model.fit(X_train,y_train)
score = logistic_model.score(X_test,y_test)
score


lbfgs failed to converge after 1000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



1.0

In [91]:
svc_model.fit(X_train,y_train)
score = svc_model.score(X_test,y_test)
score

0.6716417910447762

# Evaluate each model using confusion matrices
- A confusion matrix shows how well the model classifies each species:
- Diagonal values = correct predictions
- Off-diagonal values = misclassifications

In [92]:
# Evaluate each model using confusion matrices
# A confusion matrix shows how well the model classifies each species:
# - Diagonal values = correct predictions
# - Off-diagonal values = misclassifications
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
import numpy as np

def evaluate_model(model, name):
    """Generate and display a confusion matrix heatmap for a given model."""
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    
    # Create labels for the confusion matrix
    labels = ['Adelie', 'Chinstrap', 'Gentoo']
    
    # Create annotated heatmap using plotly
    fig = ff.create_annotated_heatmap(
        z=cm,
        x=labels,
        y=labels,
        colorscale='Blues',
        showscale=True
    )
    
    fig.update_layout(
        title=f'{name} Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        width=500,
        height=400
    )
    # Reverse y-axis to match sklearn convention
    fig.update_yaxes(autorange='reversed')
    fig.show()
    
# Evaluate all three models
evaluate_model(logistic_model, "Logistic Regression")
evaluate_model(rf_model, "Random Forest")
evaluate_model(svc_model, "SVM")

# Compare model performance using multiple metrics
- Accuracy: overall correctness (correct predictions / total predictions)
- Precision: of predicted positives, how many are actually positive
- Recall: of actual positives, how many were correctly predicted
- F1 Score: harmonic mean of precision and recall (balanced metric)

In [93]:

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_metrics(model, name):
    """Calculate classification metrics for a given model."""
    y_pred = model.predict(X_test)
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1 Score": f1_score(y_test, y_pred, average='weighted')
    }

# Collect metrics for all models
metrics = [
    get_metrics(logistic_model, "Logistic Regression"),
    get_metrics(rf_model, "Random Forest"),
    get_metrics(svc_model, "SVM")
]

# Create a summary DataFrame for easy comparison
df_metrics = pd.DataFrame(metrics)
df_metrics.set_index("Model", inplace=True)
df_metrics.round(4)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,1.0,1.0,1.0,1.0
Random Forest,0.9701,0.972,0.9701,0.9697
SVM,0.6716,0.4974,0.6716,0.5705


In [94]:
# Visualize model comparison with an interactive bar chart
# This makes it easy to compare how each model performs across all metrics
import plotly.express as px

# Reset index to use Model as a column for plotting
df_plot = df_metrics.reset_index().melt(id_vars='Model', var_name='Metric', value_name='Score')

fig = px.bar(
    df_plot, 
    x='Metric', 
    y='Score', 
    color='Model',
    barmode='group',
    title='Model Performance Comparison',
    text='Score'
)
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(yaxis_range=[0, 1.1])
fig.show()

## Final observation:
- The dataset contains 333 samples and 7 features
- Logistic regression was better than Random Forest and svc
- The model achieved 100% on the test data