In [1]:
import numpy as np
import pandas as pd
from pydataset import data

from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.metrics import accuracy_score, classification_report

**Planning phase** 
Let's examine the Iris dataset

Can we accurately predict the species of Iris flower based on measurements recorded of the sepal and petals of the flowers?

**Acquisition** Let's get the iris dataset

In [3]:
# get iris from the pydataset
df = data('iris')

In [4]:
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa


**Prep** Let's clean up our data set

Rename the columns into something a little more legible

In [5]:
# rename columns by assigning them into the df.columns property
# we could also use the rename method
df.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'species']

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sepal_len  150 non-null    float64
 1   sepal_wid  150 non-null    float64
 2   petal_len  150 non-null    float64
 3   petal_wid  150 non-null    float64
 4   species    150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [7]:
# split the data

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# split our data into train+validate set and our test set
# we establish a training size of 80% of our data 
#"train" size here is train AND validate
# set a random seed to replicate results in the future
# stratify on our target to balance our data set for classification
train_val, test = train_test_split(df,
                                  train_size = 0.8,
                                  random_state=1349,
                                  stratify=df.species)

In [10]:
#repeat the process above to split train + validate
# into train, and validate separately
train, val = train_test_split(train_val,
                                  train_size = 0.7,
                                  random_state=1349,
                                  stratify=train_val.species)

In [11]:
train.head()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,species
139,6.0,3.0,4.8,1.8,virginica
8,5.0,3.4,1.5,0.2,setosa
80,5.7,2.6,3.5,1.0,versicolor
75,6.4,2.9,4.3,1.3,versicolor
98,6.2,2.9,4.3,1.3,versicolor


In [12]:
#split my X from my y

In [13]:
# x train is everything in the dataframe train that is NOT our target (y)
X_train = train.drop(columns=['species'])
# y train is everything in train that is our target (one column)
# 
# using double brackets to define my y_train:
# this allows us to define a single Series as a DataFrame, 
# rather than just a pd.Series object
y_train = train[['species']]

**Notes from explore** :
Petal length and width seemed to be the biggest drivers based on visual and statistical exploration, so we will chooose these two features as our primary/first features to feed into our models

**Modeling**

In [14]:
type(y_train)

pandas.core.frame.DataFrame

In [15]:
type(train['species'])

pandas.core.series.Series

In [16]:
y_train.species.value_counts()

setosa        28
versicolor    28
virginica     28
Name: species, dtype: int64

In [17]:
y_train['baseline'] = 'setosa'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['baseline'] = 'setosa'


In [18]:
baseline_score = accuracy_score(y_train.species, y_train.baseline)

In [19]:
baseline_score

0.3333333333333333

**Model creation**

step one:
**Create the thing**
step two:
**fit the thing**
step three:
**use the thing**

In [20]:
# create the thing

In [21]:
selected_feats = ['petal_wid', 'petal_len']

In [22]:
# create the classifier object
clf = RandomForestClassifier(random_state=1349)

## Big important note:

**ONLY FIT YOUR MODEL ON YOUR TRAINING DATA**

In [24]:
selected_feats

['petal_wid', 'petal_len']

In [25]:
# fit the thing:
# invoke the fit method, don't assign it to anything
# this will train our model
clf.fit(X_train[selected_feats], y_train.species)

RandomForestClassifier(random_state=1349)

In [28]:
# use the thing!
y_train['y_pred'] = clf.predict(X_train[selected_feats])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['y_pred'] = clf.predict(X_train[selected_feats])


In [29]:
y_train.head()

Unnamed: 0,species,baseline,y_pred
139,virginica,setosa,virginica
8,setosa,setosa,setosa
80,versicolor,setosa,versicolor
75,versicolor,setosa,versicolor
98,versicolor,setosa,versicolor


In [30]:
accuracy_score(y_train.species, y_train.y_pred)

1.0

In [31]:
# check the results against our holdout data:

In [32]:
# use the thing again, but on validate:

In [33]:
# separate X from y in validate
X_val = val.drop(columns=['species'])
y_val = val[['species']]

In [34]:
# JUST use predict, not refitting the model

In [36]:
y_pred_val = clf.predict(X_val[selected_feats])

In [37]:
# assign the array y_pred_val that holds our predictions on 
# the validation set, into our y_validation target dataframe
y_val['y_pred_val'] = y_pred_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val['y_pred_val'] = y_pred_val


In [38]:
accuracy_score(y_val.species, y_val.y_pred_val)

0.9722222222222222

In [42]:
pd.DataFrame(classification_report(y_val.species, 
                      y_val.y_pred_val, 
                      output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
setosa,1.0,1.0,1.0,12.0
versicolor,0.923077,1.0,0.96,12.0
virginica,1.0,0.916667,0.956522,12.0
accuracy,0.972222,0.972222,0.972222,0.972222
macro avg,0.974359,0.972222,0.972174,36.0
weighted avg,0.974359,0.972222,0.972174,36.0
