<h1 style="color:red; text-align:center;font-weight:bold;">Building & Training the Penguins Classifier Model</h1>
<h3 style="color:green;text-align:center;font-weight:bold;">Balav Sha</h3>

Required Libraires

In [28]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

Importing and Reading the Dataset

In [19]:
# Import Penguins Data to get familiar
penguins_df = pd.read_csv("../Dataset/penguins.csv")
penguins_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


Basic Data Exploration

In [20]:
# Check for missing values in each column
penguins_df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [21]:
# Drop all the missing/Null values
penguins_df.dropna(inplace=True)

# Check again for missing values
penguins_df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

In [22]:
# Columns in the Penguins Dataframe
penguins_df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='object')

In [23]:
# Define features and Target variables
features = penguins_df[['island', 'bill_length_mm', 'bill_depth_mm',
                        'flipper_length_mm', 'body_mass_g', 'sex']]

output = penguins_df['species']

# check features
features.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,male
1,Torgersen,39.5,17.4,186.0,3800.0,female
2,Torgersen,40.3,18.0,195.0,3250.0,female
4,Torgersen,36.7,19.3,193.0,3450.0,female
5,Torgersen,39.3,20.6,190.0,3650.0,male


In [24]:
# Check for ouput variable
output.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [25]:
# One-Hot Encoding to convert categorical variable to dummy/numerical variable
features = pd.get_dummies(features)
features.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,40.3,18.0,195.0,3250.0,0,0,1,1,0
4,36.7,19.3,193.0,3450.0,0,0,1,1,0
5,39.3,20.6,190.0,3650.0,0,0,1,0,1


In [26]:
# display unique values in target variable
output, uniques = pd.factorize(output)
uniques

Index(['Adelie', 'Gentoo', 'Chinstrap'], dtype='object')

Model Training --> **RandomForestClassifier** Model

In [27]:
# Splitting Dataset into train set and test set
x_train, x_test, y_train, y_test = train_test_split(features, output, test_size=0.8)

# Instantiate the RandomForestClassifier model
rfc = RandomForestClassifier(random_state=15)

# Fit the model with traing set
rfc.fit(x_train, y_train)

# Make a prediction with fitted model
y_pred =  rfc.predict(x_test)

# Measure the accuracy of model by comparing predicted outcome with acutal outcome
score = accuracy_score(y_pred, y_test)

# Display the accuracy of RandomForestClassifier model
print("Model Accuracy --> {}".format(score))

Model Accuracy --> 0.9625468164794008


Save the Trained Model for the Future use

In [31]:
# File -->"random_forest_penguin.pickle" contains our Model
rf_pickle = open("random_forest_penguin.pickle", "wb") # "wb" -> write bytes
# write python files to specified file
pickle.dump(rfc, rf_pickle)
# close the file
rf_pickle.close()

In [32]:
# File-->"output_penguin.pickle" has mapping b/w penguin species and output of our model
output_pickle = open("output_penguin.pickle", "wb") # "wb" -> write bytes
# write python file to specified file
pickle.dump(uniques, output_pickle)
# close the file
output_pickle.close()