### Palmer penguins dataset evaluation

In [3]:
#importing libraries

import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
# Location of the penguins CSV. The default is the path this notebook was
# originally run with; set the PENGUINS_CSV environment variable to point at
# the file on another machine instead of editing this cell.
DEFAULT_PENGUINS_CSV = "/Users/brianshimmer/Desktop/Data science training/Data science training/Data science training/Palmer penguins dataset/penguins.csv"
penguins_csv = Path(os.environ.get("PENGUINS_CSV", DEFAULT_PENGUINS_CSV))

# Fail early with a clear message rather than deep inside the CSV parser.
if not penguins_csv.is_file():
    raise FileNotFoundError(
        f"Penguins CSV not found at {penguins_csv}; set PENGUINS_CSV to its location."
    )

data = pd.read_csv(penguins_csv)

In [5]:
# Display the loaded frame; the rendered output elides the middle rows.
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [6]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [7]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,342.0,342.0,342.0,342.0,344.0
mean,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,5.459584,1.974793,14.061714,801.954536,0.818356
min,32.1,13.1,172.0,2700.0,2007.0
25%,39.225,15.6,190.0,3550.0,2007.0
50%,44.45,17.3,197.0,4050.0,2008.0
75%,48.5,18.7,213.0,4750.0,2009.0
max,59.6,21.5,231.0,6300.0,2009.0


In [8]:
# Print each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


In [9]:
# Rows per species; `species` is used as the classification target later on.
data['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

In [10]:
# Count missing values in each column.
data.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

### Data preprocessing

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler

In [13]:
# Fill missing values in the four numeric measurement columns with each
# column's median.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split further down, so the medians also reflect rows that end up
# in the test set. Consider fitting it on the training rows only.
# NOTE(review): this overwrites columns of `data` in place, so the
# missing-value counts shown earlier no longer describe `data` after this cell.

num_features=['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']
num_imputer=SimpleImputer(strategy='median')
data[num_features]=num_imputer.fit_transform(data[num_features])

In [14]:
# Fill missing `sex` values with the most frequent value in that column.
# NOTE(review): this imputer is also fitted on all rows, before the
# train/test split further down.

cat_features=['sex']
cat_imputer=SimpleImputer(strategy='most_frequent')
data[cat_features]=cat_imputer.fit_transform(data[cat_features])

In [15]:
# One-hot encode `sex` and `island`; drop_first=True omits the first level of
# each column so the remaining dummies are not redundant.
# NOTE(review): `data` is reassigned to the encoded frame, so re-running this
# cell on its own will fail because `sex` and `island` no longer exist.

data=pd.get_dummies(data, columns=['sex','island'], drop_first=True)

In [16]:
# Split the frame into the feature matrix X and the target y.
# `year` is left out of the features along with the target column itself.

target_column = 'species'
X = data.drop(columns=[target_column, 'year'])
y = data[target_column]

In [17]:
# Show the first rows of the features and of the target, one item per line.
print(
    "Features after preprocessing:",
    X.head(),
    "Target variable",
    y.head(),
    sep="\n",
)

Features after preprocessing:
   bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex_male  \
0           39.10           18.7              181.0       3750.0      True   
1           39.50           17.4              186.0       3800.0     False   
2           40.30           18.0              195.0       3250.0     False   
3           44.45           17.3              197.0       4050.0      True   
4           36.70           19.3              193.0       3450.0     False   

   island_Dream  island_Torgersen  
0         False              True  
1         False              True  
2         False              True  
3         False              True  
4         False              True  
Target variable
0    Adelie
1    Adelie
2    Adelie
3    Adelie
4    Adelie
Name: species, dtype: object


### Splitting data and feature scaling

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows for testing. stratify=y keeps the species
# proportions the same in both parts, and random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
# Candidate scalers keyed by a short label; None means "leave features as is".
scalers = {
    "no_scaling": None,
    "standard": StandardScaler(),
    "minmax": MinMaxScaler(),
    "robust": RobustScaler(),
}

# label -> (train features, test features)
X_scaled_versions = {}

for name, scaler in scalers.items():
    if scaler is not None:
        X_train_scaled = X_train.copy()
        X_test_scaled = X_test.copy()
        # Only the numeric columns are scaled. The scaler is fitted on the
        # training rows and the fitted scaler is then applied to the test rows.
        X_train_scaled[num_features] = scaler.fit_transform(X_train[num_features])
        X_test_scaled[num_features] = scaler.transform(X_test[num_features])
        X_scaled_versions[name] = (X_train_scaled, X_test_scaled)
    else:
        # Unscaled baseline: store independent copies of the split.
        X_scaled_versions[name] = (X_train.copy(), X_test.copy())

### Training and evaluating the models

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score

# Classifiers to compare, keyed by display name. random_state is fixed on
# every estimator that is given one here.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(kernel='rbf', random_state=42),
    "Decision tree": DecisionTreeClassifier(random_state=42),
    "Random forest": RandomForestClassifier(random_state=42),
}

# These models are trained on the unscaled features; every other model is
# trained on the standardized features.
unscaled_models = ('Decision tree', 'Random forest')

# model name -> dict of test-set metrics
results = {}

for name, model in models.items():
    scaling = 'no_scaling' if name in unscaled_models else 'standard'
    X_train_model, X_test_model = X_scaled_versions[scaling]

    # Fit on the training split, then predict the held-out test split.
    model.fit(X_train_model, y_train)
    y_pred = model.predict(X_test_model)

    # Test-set metrics.
    # NOTE(review): recall averaged with support weights is arithmetically the
    # same number as accuracy, so "Recall score" repeats "Accuracy".
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    cm = confusion_matrix(y_test, y_pred)

    results[name] = dict(zip(
        ("Accuracy", "Classification report", "Recall score", "F1 score", "Confusion matrix"),
        (acc, report, recall, f1, cm),
    ))

# Print every stored metric for each model, one block per model.

for name, metrics in results.items():
    print(
        f"{name} Accuracy: {metrics['Accuracy']:.4f}\n",
        f"{name} Classification report: {metrics['Classification report']}\n",
        f"{name} Recall score:{metrics['Recall score']:.4f}",
        f"{name} f1 score:{metrics['F1 score']:.4f}",
        f"{name} confusion matrix:{metrics['Confusion matrix']}\n\n",
        sep="\n",
    )



Logistic Regression Accuracy: 1.0000

Logistic Regression Classification report: {'Adelie': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 30.0}, 'Chinstrap': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 14.0}, 'Gentoo': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 25.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 69.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 69.0}}

Logistic Regression Recall score:1.0000
Logistic Regression f1 score:1.0000
Logistic Regression confusion matrix:[[30  0  0]
 [ 0 14  0]
 [ 0  0 25]]


KNN Accuracy: 0.9855

KNN Classification report: {'Adelie': {'precision': 1.0, 'recall': 0.9666666666666667, 'f1-score': 0.9830508474576272, 'support': 30.0}, 'Chinstrap': {'precision': 0.9333333333333333, 'recall': 1.0, 'f1-score': 0.9655172413793104, 'support': 14.0}, 'Gentoo': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support'

In [69]:
# Feature importance for tree based models

import pandas as pd

features = X_train.columns

# Both tree models were fitted above on the unscaled training features, so
# their importances line up one-to-one with X_train's columns.
important_features_dt = models["Decision tree"].feature_importances_
important_features_rf = models["Random forest"].feature_importances_

# One table per model, most important feature first.
dt_df, rf_df = (
    pd.DataFrame({"features": features, "Importance": scores}).sort_values(
        by="Importance", ascending=False
    )
    for scores in (important_features_dt, important_features_rf)
)

print("Decision tree feature importance:\n", dt_df)
print("\n")
print("Random forest classifier feature importance:\n", rf_df)

Decision tree feature importance:
             features  Importance
2  flipper_length_mm    0.517310
0     bill_length_mm    0.334621
5       island_Dream    0.089844
4           sex_male    0.028583
1      bill_depth_mm    0.023915
6   island_Torgersen    0.005726
3        body_mass_g    0.000000


Random forest classifier feature importance:
             features  Importance
0     bill_length_mm    0.350240
1      bill_depth_mm    0.212423
2  flipper_length_mm    0.195243
3        body_mass_g    0.111465
5       island_Dream    0.096848
6   island_Torgersen    0.026634
4           sex_male    0.007148
