In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Shared seed: passed as random_state to the random-forest models below so their fits are repeatable.
state = 2310

# Introduction to Model Validation

To check that a model generalizes well, we need to evaluate it on new, unseen data that was not used to train it.

In [3]:
# Read the candy data set from CSV and preview its first five rows.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
candy_path = '/Users/Dillon/OneDrive/Documents/DataCampML/candy-data.csv'
candy = pd.read_csv(candy_path)
candy.head(n=5)

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [7]:
# Features: every column except the first (competitorname) and the last.
# Target: the last column (winpercent).
X = candy.iloc[:, 1:-1]
y = candy.iloc[:, -1]

# Seed the shuffle so the train/test partition - and every error reported
# further down - is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=state)

In [20]:
# Fit a depth-limited random forest on the training split (fit() returns the estimator itself).
rf = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=state).fit(X_train, y_train)

# Predict on both splits so the two errors can be compared side by side.
train_preds = rf.predict(X_train)
test_preds = rf.predict(X_test)

# Mean absolute error on the data the model was fit on vs. the held-out data.
train_error = mean_absolute_error(y_train, train_preds)
test_error = mean_absolute_error(y_test, test_preds)

# One value per line: training error first, then test error.
print(train_error, test_error, sep='\n')

4.283806791204942
8.94438703338014


In [24]:
# Print each training feature next to the importance the fitted forest assigned to it.
for feature_name, importance in zip(X_train.columns, rf.feature_importances_):
    print(feature_name, importance)

chocolate 0.318043330696778
fruity 0.04272466630789982
caramel 0.022847021564037277
peanutyalmondy 0.03783001935704593
nougat 0.00792058805985727
crispedricewafer 0.013867773049893613
hard 0.014855955224786976
bar 0.04118561327382765
pluribus 0.03598249148958255
sugarpercent 0.19032326964377164
pricepercent 0.2744192713325194


In [26]:
# Read the tic-tac-toe data set from CSV.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
ttt_path = '/Users/Dillon/OneDrive/Documents/DataCampML/tic-tac-toe.csv'
ttt = pd.read_csv(ttt_path)

Unnamed: 0,Top-Left,Top-Middle,Top-Right,Middle-Left,Middle-Middle,Middle-Right,Bottom-Left,Bottom-Middle,Bottom-Right,Class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [30]:
# One-hot encode the categorical columns of ttt (board squares and Class) into indicator columns.
model_ttt = pd.get_dummies(data=ttt)

In [31]:
# Features: every indicator column except the last two.
# Target: the last indicator column.
# NOTE(review): presumably the last two columns are the Class indicators
# produced by get_dummies - confirm against model_ttt.columns.
X = model_ttt.iloc[:, :-2]
y = model_ttt.iloc[:, -1]

# Seed the shuffle so the split, and the prediction counts reported below,
# are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=state)

In [33]:
# Fit a random-forest classifier with default hyperparameters (fit() returns the estimator itself).
rfc = RandomForestClassifier(random_state=state).fit(X_train, y_train)
y_preds = rfc.predict(X_test)

# Count how many test rows were assigned to each predicted class.
predicted_labels = pd.Series(y_preds)
predicted_labels.value_counts()

1    155
0     85
dtype: int64

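When tuning hyperparameters and comparing models, we should use a separate validation set so that the test set stays untouched until the final evaluation. One way to create it is to call `train_test_split` twice: first hold out the test set, then split the remaining data into training and validation sets.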

In [39]:
# First split: hold out 30% of the rows as the final test set.
# Both splits are seeded so the three partitions are identical on every run.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=state
)

# Second split: carve 20% of the remaining 70% off as the validation set
# (about 14% of all rows); the rest (about 56%) is used for training.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=state
)

# Display the sizes of the three partitions.
X_train.shape, X_test.shape, X_valid.shape

((536, 27), (288, 27), (134, 27))