In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv("penguins_size.csv")
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


## Data preprocessing

In [4]:
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [5]:
#dropping the missing values
df=df.dropna()

In [7]:
df=df[df['sex']!='.']
df.shape

(333, 7)

In [8]:
#X,y

In [9]:
X=pd.get_dummies(df.drop('species',axis=1),drop_first=True)
y=df['species']

In [10]:
#Train_test_split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=42)

## Modelling & evalutation

In [11]:
#Random forest classifier - with default parameters
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(random_state=0)
model.fit(X_train,y_train)

#Prediction
ypred_train=model.predict(X_train)
ypred_test=model.predict(X_test)

#Evaluation
#1.Train & test accuracy
from sklearn.metrics import accuracy_score
print("Train accuracy",accuracy_score(ypred_train,y_train))
print("Test accuracy",accuracy_score(ypred_test,y_test))

#2.Cross Validation Score
from sklearn.model_selection import cross_val_score
scores=cross_val_score(model,X,y,cv=5)
print("cross validation score:",scores.mean())

Train accuracy 1.0
Test accuracy 0.98
cross validation score: 0.9849841700587969


### Importance of each feature given by this model

In [12]:
model.feature_importances_

array([0.32389535, 0.21880749, 0.24194752, 0.08391905, 0.10207013,
       0.02255358, 0.00680688])

In [13]:
pd.DataFrame(index=X.columns,data=model.feature_importances_,columns=['Feature Importance'])

Unnamed: 0,Feature Importance
culmen_length_mm,0.323895
culmen_depth_mm,0.218807
flipper_length_mm,0.241948
body_mass_g,0.083919
island_Dream,0.10207
island_Torgersen,0.022554
sex_MALE,0.006807


### Hyper parameter tuning

In [14]:
from sklearn.model_selection import GridSearchCV

estimator=RandomForestClassifier(random_state=0)

param_grid={'n_estimators':list(range(1,101))}

grid=GridSearchCV(estimator,param_grid,scoring='accuracy',cv=5)
grid.fit(X_train,y_train)
grid.best_params_

{'n_estimators': 6}

## Random forest model with best hyper parameter

In [15]:
model=RandomForestClassifier(n_estimators=8, random_state=0)
model.fit(X_train,y_train)

ypred_train=model.predict(X_train)
ypred_test=model.predict(X_test)

print("Train_accuracy:",accuracy_score(ypred_train,y_train))
print("Test_accuracy:",accuracy_score(ypred_test,y_test))

scores=cross_val_score(model,X,y,cv=5)
print("Cross validation score:", scores.mean())

Train_accuracy: 1.0
Test_accuracy: 0.98
Cross validation score: 0.9819990954319312
