In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
from pydataset import data
from sklearn.model_selection import train_test_split

In [3]:
# Acquire: load the 'iris' dataset through pydataset's `data` helper.
# The result is a pandas DataFrame with four measurements plus `Species`.
df = data('iris')

In [4]:
# prepare our data

In [5]:
# Quick structural check: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [6]:
# Nothing alarming in the summary above, so we proceed (with caution).
# First split: keep 80% for train (+validate), hold out the rest as test,
# stratified on the target so class proportions are preserved.
train, test = train_test_split(
    df,
    train_size=0.8,
    random_state=1349,
    stratify=df['Species'],
)

In [7]:
# Second split: carve validate (30%) out of the remaining train data.
# NOTE: this rebinds `train`, so re-running this cell on its own will split
# the already-reduced train set again; re-run the previous cell first.
train, validate = train_test_split(
    train,
    train_size=0.7,
    random_state=1349,
    stratify=train['Species'],
)

In [8]:
# Sanity-check the (rows, columns) of each split.
tuple(part.shape for part in (train, validate, test))

((84, 5), (36, 5), (30, 5))

In [9]:
# Features (independent variables): everything except the target column.
X_train = train.drop(columns=['Species'])

In [10]:
# Target (dependent variable), kept as a one-column DataFrame;
# later cells read the labels through `y_train.Species`.
y_train = train.loc[:, ['Species']]

In [14]:
# Let's make our model!

In [15]:
# Create the (unfitted) model object.
# A random forest is stochastic (bootstrap samples, random feature subsets),
# so pin random_state -- same seed as the splits -- to make the results
# reproducible under Restart & Run All.
clf = RandomForestClassifier(random_state=1349)

In [16]:
# Fit the model (ONLY ON TRAIN!!).
# Pass the labels as a 1-D Series: handing sklearn a one-column DataFrame
# triggers a DataConversionWarning, which the global warnings filter at the
# top of this notebook would otherwise hide.
clf.fit(X_train, y_train.Species)

RandomForestClassifier()

In [17]:
# Use the fitted model to predict on the in-sample (training) data.
# These are predictions for rows the model has already seen, so they only
# tell us how well the model memorized train, not how well it generalizes.
y_pred = clf.predict(X_train)

In [18]:
# Display the predicted class label for every row of X_train.
y_pred

array(['virginica', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'setosa',
       'setosa', 'setosa', 'virginica', 'setosa', 'virginica',
       'virginica', 'versicolor', 'virginica', 'versicolor', 'versicolor',
       'setosa', 'versicolor', 'setosa', 'virginica', 'versicolor',
       'setosa', 'virginica', 'setosa', 'virginica', 'setosa',
       'versicolor', 'virginica', 'setosa', 'setosa', 'virginica',
       'setosa', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'versicolor', 'virginica', 'versicolor', 'setosa', 'virginica',
       'virginica', 'setosa', 'virginica', 'virginica', 'virginica',
       'versicolor', 'setosa', 'setosa', 'setosa', 'virginica',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'versicolor',
       'setosa', 'virginica', 'versicolor', 'virginica', 'setosa',
       'virginica', 'virginica', 'setosa', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'setosa

In [22]:
import pandas as pd

In [23]:
# Per-class precision / recall / f1 / support on the training data,
# rendered as a DataFrame for readability.
pd.DataFrame(
    classification_report(
        y_true=y_train.Species,
        y_pred=y_pred,
        output_dict=True,
    )
)

Unnamed: 0,setosa,versicolor,virginica,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,28.0,28.0,28.0,1.0,84.0,84.0


In [24]:
# Accuracy on the training data (in-sample). A perfect score here only shows
# the forest fits the rows it was trained on; check validate before trusting it.
clf.score(X_train, y_train)

1.0

In [26]:
# Predict on the validate split (features only; drop the target first).
y_pred_val = clf.predict(validate.drop(columns=['Species']))

In [27]:
# Out-of-sample accuracy: score against the TRUE validate labels.
# Scoring against y_pred_val would compare the model with its own predictions
# and always return 1.0, which says nothing about real performance.
clf.score(validate.drop(columns='Species'), validate.Species)

1.0

In [28]:
# Display the training features (the four measurement columns, no target).
X_train

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
139,6.0,3.0,4.8,1.8
8,5.0,3.4,1.5,0.2
80,5.7,2.6,3.5,1.0
75,6.4,2.9,4.3,1.3
98,6.2,2.9,4.3,1.3
95,5.6,2.7,4.2,1.3
84,6.0,2.7,5.1,1.6
14,4.3,3.0,1.1,0.1
86,6.0,3.4,4.5,1.6
43,4.4,3.2,1.3,0.2


In [33]:
# Second example: load the 'mpg' dataset through pydataset.
# NOTE: this rebinds `df`, which previously held the iris data.
df = data('mpg')

In [34]:
# Preview the first rows to see the available columns and value formats.
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [35]:
# Same two-step split as for iris (80/20, then 70/30), now stratified on the
# target `drv`. This replaces the iris train/validate/test splits.
train, test = train_test_split(
    df, train_size=0.8, random_state=1349, stratify=df['drv']
)
train, validate = train_test_split(
    train, train_size=0.7, random_state=1349, stratify=train['drv']
)

In [43]:
# Create the model object: a shallow forest (max_depth=2) to limit overfitting.
# Pin random_state (same seed as the splits) so scores and probabilities are
# reproducible under Restart & Run All.
clf = RandomForestClassifier(max_depth=2, random_state=1349)

In [44]:
# Fit on the train split only -- never on the entire dataset.
# Features: displ, cty, hwy; target: drv.
clf.fit(train[['displ', 'cty', 'hwy']], train['drv'])

RandomForestClassifier(max_depth=2)

In [45]:
# In-sample predictions of `drv` for the train split.
y_pred = clf.predict(train.loc[:, ['displ', 'cty', 'hwy']])

In [46]:
# Accuracy on the train split (in-sample).
clf.score(train[['displ', 'cty', 'hwy']], train['drv'])

0.7615384615384615

In [47]:
# Accuracy on the validate split (out-of-sample), scored against the true labels.
clf.score(validate[['displ', 'cty', 'hwy']], validate['drv'])

0.8771929824561403

In [48]:
# Class-membership probabilities for every row of the train split.
y_pred_proba = clf.predict_proba(train.loc[:, ['displ', 'cty', 'hwy']])

In [50]:
# Class balance of the target in the train split.
train['drv'].value_counts()

f    59
4    57
r    14
Name: drv, dtype: int64

In [49]:
# Display the predicted probabilities: one row per train sample, one column
# per class (column order should follow clf.classes_ -- confirm before labeling).
y_pred_proba

array([[0.84170648, 0.01793948, 0.14035405],
       [0.27514261, 0.67765344, 0.04720394],
       [0.85471497, 0.01948254, 0.12580249],
       [0.20425698, 0.76900917, 0.02673385],
       [0.20103922, 0.76971312, 0.02924766],
       [0.47899004, 0.1887132 , 0.33229677],
       [0.20081682, 0.75612978, 0.04305339],
       [0.45804124, 0.19404653, 0.34791223],
       [0.27493622, 0.66868315, 0.05638063],
       [0.36065778, 0.56414932, 0.07519291],
       [0.64203477, 0.04889035, 0.30907487],
       [0.3606881 , 0.49450305, 0.14480885],
       [0.79520073, 0.11094417, 0.09385511],
       [0.11000543, 0.88261442, 0.00738016],
       [0.25422909, 0.59221148, 0.15355943],
       [0.33511057, 0.59894045, 0.06594898],
       [0.48463599, 0.30057783, 0.21478618],
       [0.86542563, 0.01785463, 0.11671974],
       [0.40601983, 0.36256466, 0.23141552],
       [0.31340408, 0.60558038, 0.08101554],
       [0.76250472, 0.02067188, 0.2168234 ],
       [0.48354766, 0.32565149, 0.19080084],
       [0.