# Building a Classification Model

* Load data from a CSV file
* Build a Random Forest Classifier
* Pickle the model

In [1]:
import pandas as pd

In [2]:
# Read the penguin records from the CSV file and display the resulting frame
penguins = pd.read_csv(filepath_or_buffer='penguins_cleaned.csv')
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,male
1,Adelie,Torgersen,39.5,17.4,186,3800,female
2,Adelie,Torgersen,40.3,18.0,195,3250,female
3,Adelie,Torgersen,36.7,19.3,193,3450,female
4,Adelie,Torgersen,39.3,20.6,190,3650,male
...,...,...,...,...,...,...,...
328,Chinstrap,Dream,55.8,19.8,207,4000,male
329,Chinstrap,Dream,43.5,18.1,202,3400,female
330,Chinstrap,Dream,49.6,18.2,193,3775,male
331,Chinstrap,Dream,50.8,19.0,210,4100,male


In [3]:
# One-hot encode the categorical predictors: each column listed in `encode`
# is replaced by indicator columns named "<column>_<category>".

target = 'species' # to predict
encode = ['sex', 'island']

# get_dummies drops the listed columns and appends their indicator columns
# (in the order given) to a new frame; `penguins` itself is left unchanged.
df = pd.get_dummies(penguins, columns=encode)

df

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181,3750,0,1,0,0,1
1,Adelie,39.5,17.4,186,3800,1,0,0,0,1
2,Adelie,40.3,18.0,195,3250,1,0,0,0,1
3,Adelie,36.7,19.3,193,3450,1,0,0,0,1
4,Adelie,39.3,20.6,190,3650,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
328,Chinstrap,55.8,19.8,207,4000,0,1,0,1,0
329,Chinstrap,43.5,18.1,202,3400,1,0,0,1,0
330,Chinstrap,49.6,18.2,193,3775,0,1,0,1,0
331,Chinstrap,50.8,19.0,210,4100,0,1,0,1,0


In [4]:
# Integer class label assigned to each species name

target_mapper = {
    'Chinstrap': 0,
    'Gentoo': 1,
    'Adelie': 2,
}

def target_encode(val):
    """Return the integer label that target_mapper assigns to species `val`.

    Raises KeyError when `val` is not one of the keys of target_mapper.
    """
    label = target_mapper[val]
    return label

# Encode from the untouched `penguins` frame rather than from `df` itself, so
# re-running this cell does not feed already-encoded integers back into
# target_encode (which would raise KeyError). `df` was derived from `penguins`
# without changing the row index, so the assignment aligns row for row.
df['species'] = penguins['species'].apply(target_encode)

df


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,2,39.1,18.7,181,3750,0,1,0,0,1
1,2,39.5,17.4,186,3800,1,0,0,0,1
2,2,40.3,18.0,195,3250,1,0,0,0,1
3,2,36.7,19.3,193,3450,1,0,0,0,1
4,2,39.3,20.6,190,3650,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
328,0,55.8,19.8,207,4000,0,1,0,1,0
329,0,43.5,18.1,202,3400,1,0,0,1,0
330,0,49.6,18.2,193,3775,0,1,0,1,0
331,0,50.8,19.0,210,4100,0,1,0,1,0


In [5]:
# Feature matrix: every column of df except the 'species' label
X = df.drop(columns='species')
X

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181,3750,0,1,0,0,1
1,39.5,17.4,186,3800,1,0,0,0,1
2,40.3,18.0,195,3250,1,0,0,0,1
3,36.7,19.3,193,3450,1,0,0,0,1
4,39.3,20.6,190,3650,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
328,55.8,19.8,207,4000,0,1,0,1,0
329,43.5,18.1,202,3400,1,0,0,1,0
330,49.6,18.2,193,3775,0,1,0,1,0
331,50.8,19.0,210,4100,0,1,0,1,0


In [6]:
# Target vector: the 'species' column of df
Y = df.loc[:, 'species']
Y

0      2
1      2
2      2
3      2
4      2
      ..
328    0
329    0
330    0
331    0
332    0
Name: species, Length: 333, dtype: int64

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed: a random forest draws bootstrap samples and feature subsets
# at random, so without random_state every run of the notebook would fit
# (and later pickle) a different model.
clf = RandomForestClassifier(random_state=42)
clf.fit(X, Y)

RandomForestClassifier()

In [8]:
import pickle

# Serialize the fitted classifier. The with-block closes the file handle, so
# the pickle is flushed to disk before "Success!" is printed.
with open('penguins_clf.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
print("Success!")

Success!
