## Import Necessary Libraries

In [46]:
# Data Handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing & Feature Selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.compose import ColumnTransformer, make_column_selector

# Machine Learning Models (Scikit-Learn)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, classification_report

# PyCaret for AutoML
# NOTE(review): wildcard imports hide where names come from, and the second
# wildcard import rebinds any name the first one also exported. Presumably
# both modules expose same-named helpers (e.g. `setup`, `compare_models`), in
# which case only the regression versions stay reachable — TODO confirm and
# consider module aliases instead.
from pycaret.classification import *
from pycaret.regression import *

# Warnings
# Install a blanket "ignore" filter so no warning is shown from here on
# (presumably to keep cell output clean).
# NOTE(review): this also hides warnings that may point at real problems.
import warnings
warnings.filterwarnings("ignore")


## Load Dataset

In [2]:
# Read the penguins CSV, resolved relative to the current working directory
df = pd.read_csv(filepath_or_buffer='./penguins_cleaning.csv')

In [4]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
4,5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [47]:
# Discard the leftover 'Unnamed: 0' column (presumably an index that was saved
# into the CSV); errors='ignore' keeps this cell safe to re-run once it is gone
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [10]:
# Preview the first five rows again to check the remaining columns
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


## Perform EDA 

In [12]:
# Count missing values per column and keep only the columns that have any
df.isna().sum().loc[lambda counts: counts > 0]

Series([], dtype: int64)

In [13]:
# Summary statistics of the numeric columns, one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bill_length_mm,333.0,43.992793,5.468668,32.1,39.5,44.5,48.6,59.6
bill_depth_mm,333.0,17.164865,1.969235,13.1,15.6,17.3,18.7,21.5
flipper_length_mm,333.0,200.966967,14.015765,172.0,190.0,197.0,213.0,231.0
body_mass_g,333.0,4207.057057,805.215802,2700.0,3550.0,4050.0,4775.0,6300.0


## Prepare the Data for Modeling

In [14]:
# Target label: the penguin species
y = df['species']
# Predictors: every remaining column
X = df.drop('species', axis=1)

In [15]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Torgersen,36.7,19.3,193.0,3450.0,Female
4,Torgersen,39.3,20.6,190.0,3650.0,Male


## Split the data into train and test sets

In [18]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on the target — consider whether
# class proportions should be preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [20]:
# (rows, columns) of the training and test feature sets
(X_train.shape, X_test.shape)

((233, 6), (100, 6))

In [23]:
# Column dtypes and non-null counts of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 233 entries, 22 to 102
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   island             233 non-null    object 
 1   bill_length_mm     233 non-null    float64
 2   bill_depth_mm      233 non-null    float64
 3   flipper_length_mm  233 non-null    float64
 4   body_mass_g        233 non-null    float64
 5   sex                233 non-null    object 
dtypes: float64(4), object(2)
memory usage: 12.7+ KB


## Define Column Transformer (i.e. Scaling and One-Hot Encoding)

In [26]:
# Preprocessing by column name: standardize the four numeric measurements and
# one-hot encode the two categorical features (first level of each dropped)
preprocessor = ColumnTransformer([
    (
        'num',
        StandardScaler(),
        ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'],
    ),
    (
        'cat',
        OneHotEncoder(drop='first'),
        ['island', 'sex'],
    ),
])

In [27]:
# Display the configured column transformer as the cell output
preprocessor

In [51]:
# Alternative preprocessing that picks columns by dtype instead of by name:
# min-max scale int64/float64 columns, one-hot encode object/category columns
# (first level of each dropped)
preprocessor_minmax = ColumnTransformer([
    (
        'num_minmax',
        MinMaxScaler(),
        make_column_selector(dtype_include=['int64', 'float64']),
    ),
    (
        'cat_onehot',
        OneHotEncoder(drop='first'),
        make_column_selector(dtype_include=['object', 'category']),
    ),
])

In [52]:
# Display the dtype-based column transformer as the cell output
preprocessor_minmax

## Create the Pipeline using the preprocessor and the chosen Classification Model

In [62]:
# Chain the standard-scaler preprocessor with a k-nearest-neighbours classifier
# (default hyperparameters)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mdl', KNeighborsClassifier()),
])

In [63]:
# Display the standard-scaler pipeline as the cell output
pipeline

In [64]:
# Chain the min-max preprocessor with a k-nearest-neighbours classifier
# (default hyperparameters)
pipeline_mm = Pipeline(steps=[
    ('preprocessor_minmax', preprocessor_minmax),
    ('mdl', KNeighborsClassifier()),
])

In [65]:
# Display the min-max pipeline as the cell output
pipeline_mm

## Train the Model

In [58]:
# Fit the preprocessing steps and the classifier on the training split
pipeline.fit(X=X_train, y=y_train)

In [38]:
# Predict species labels for the held-out test split
y_pred = pipeline.predict(X=X_test)

In [50]:
# Per-class precision / recall / F1 for the current predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        48
   Chinstrap       1.00      0.94      0.97        18
      Gentoo       1.00      1.00      1.00        34

    accuracy                           0.99       100
   macro avg       0.99      0.98      0.99       100
weighted avg       0.99      0.99      0.99       100



In [59]:
# Fit the min-max pipeline on the same training split
pipeline_mm.fit(X=X_train, y=y_train)

In [60]:
# Predict with the min-max pipeline.
# NOTE: this rebinds y_pred, so any earlier predictions stored under that name are replaced
y_pred = pipeline_mm.predict(X=X_test)

In [61]:
# Per-class precision / recall / F1 for the min-max pipeline's predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        48
   Chinstrap       1.00      0.94      0.97        18
      Gentoo       1.00      1.00      1.00        34

    accuracy                           0.99       100
   macro avg       0.99      0.98      0.99       100
weighted avg       0.99      0.99      0.99       100

