# Penguin Random Forest Classification

## Library Import

In [62]:
import pandas as pd
import numpy as np
import seaborn as sns

## Loading the Data set

In [63]:
data = sns.load_dataset('penguins')
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [64]:
data.shape

(344, 7)

In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [66]:
data.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

## Dropping the null values

In [67]:
data.dropna(inplace=True)

In [68]:
data.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

## Feature Engineering

### Transforming categorical data into numeric

In [69]:
data.sex.unique()

array(['Male', 'Female'], dtype=object)

In [70]:
pd.get_dummies(data['sex'],dtype=int).head()

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


In [71]:
sex = pd.get_dummies(data['sex'],dtype=int, drop_first=True)
sex.head()

Unnamed: 0,Male
0,1
1,0
2,0
4,0
5,1


### One-hot encoding to island

In [72]:
data.island.unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [73]:
pd.get_dummies(data['island']).head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,False,False,True
1,False,False,True
2,False,False,True
4,False,False,True
5,False,False,True


In [74]:
island = pd.get_dummies(data['island'], drop_first=True,dtype=int)
island.head()

Unnamed: 0,Dream,Torgersen
0,0,1
1,0,1
2,0,1
4,0,1
5,0,1


In [75]:
data = pd.concat([data,island,sex],axis=1)
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen,Male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,1,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,1,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,1,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0,1,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,0,1,1


In [76]:
data.drop(['sex','island'], axis=1,inplace=True)

In [77]:
data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,Adelie,39.1,18.7,181.0,3750.0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,1,1


## Setup target varibles

In [78]:
Y = data.species
Y.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [79]:
Y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [80]:
Y = Y.map({'Adelie':0,'Chinstrap':1,'Gentoo':2})

In [81]:
Y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [82]:
data.drop('species',inplace=True,axis=1)

In [83]:
data.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,39.1,18.7,181.0,3750.0,0,1,1
1,39.5,17.4,186.0,3800.0,0,1,0
2,40.3,18.0,195.0,3250.0,0,1,0
4,36.7,19.3,193.0,3450.0,0,1,0
5,39.3,20.6,190.0,3650.0,0,1,1


In [84]:
X = data

## Splitting the dataa into Train and Test

In [85]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3,random_state=0)

In [86]:
X_train.shape

(233, 7)

In [87]:
X_test.shape

(100, 7)

In [88]:
y_train.head

<bound method NDFrame.head of 165    1
156    1
112    0
220    2
96     0
      ..
332    2
198    1
123    0
53     0
178    1
Name: species, Length: 233, dtype: int64>

## Training Random Forest 

In [89]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=5,criterion='entropy',random_state=0)
classifier.fit(X_train, y_train)

### Predicting the test results

In [91]:
y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 2, 0, 0, 0, 1, 2, 2, 1, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1, 0, 2, 2, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 0,
       2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 0, 2, 0, 2, 1, 2, 2, 2, 1, 2, 1, 0,
       0, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2], dtype=int64)

### Confusion Matrix

In [93]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,accuracy_score

In [94]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[48,  0,  0],
       [ 2, 14,  0],
       [ 0,  0, 36]], dtype=int64)

In [95]:
accuracy_score(y_test,y_pred)

0.98

In [96]:
classification_report(y_test,y_pred)

'              precision    recall  f1-score   support\n\n           0       0.96      1.00      0.98        48\n           1       1.00      0.88      0.93        16\n           2       1.00      1.00      1.00        36\n\n    accuracy                           0.98       100\n   macro avg       0.99      0.96      0.97       100\nweighted avg       0.98      0.98      0.98       100\n'