# Cross Validation

In [21]:
#imports
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
### Read CSV
cars = pd.read_csv('cars.csv')
cars.columns = [c.lower() for c in cars]
cars.set_index('id', inplace=True)

print('{} rows x {} cols'.format(*cars.shape))

297899 rows x 8 cols


In [3]:
cars.head()

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [4]:
# Feature Engeneering, average salesprice, if price is greather than ave sales price
cars['avg_saleprice'] = cars.groupby(['year', 'make', 'model']).price.transform('mean')
cars['gt_avg'] = (cars.price > cars.avg_saleprice).astype(int)

In [5]:
cars.gt_avg.value_counts()

0    158520
1    139379
Name: gt_avg, dtype: int64

In [6]:
# Remove Features we do not neet
cars.drop(columns=['price', 'city', 'vin', 'avg_saleprice'], inplace=True)

In [7]:
cars.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2015,18681,MO,Buick,EncoreConvenience,0
2,2015,27592,IN,Buick,EncoreFWD,0
3,2015,13650,NC,Buick,EncoreLeather,0
4,2015,25195,LA,Buick,EncoreFWD,0
5,2015,22800,NV,Buick,EncoreConvenience,0


In [8]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297899 entries, 1 to 297899
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   year     297899 non-null  int64 
 1   mileage  297899 non-null  int64 
 2   state    297899 non-null  object
 3   make     297899 non-null  object
 4   model    297899 non-null  object
 5   gt_avg   297899 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 15.9+ MB


In [9]:
# Convert to categorical columns

for col in ['state', 'make', 'model', 'year']:
    le = LabelEncoder().fit(cars[col])
    cars[col] = le.transform(cars[col])

In [10]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297899 entries, 1 to 297899
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   year     297899 non-null  int64
 1   mileage  297899 non-null  int64
 2   state    297899 non-null  int64
 3   make     297899 non-null  int64
 4   model    297899 non-null  int64
 5   gt_avg   297899 non-null  int64
dtypes: int64(6)
memory usage: 15.9 MB


In [11]:
cars.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [12]:
# Split Data
X, y = cars.drop(columns='gt_avg'), cars.gt_avg

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [14]:
# Define our classifer with max_depth of 2, provide cross val score with three splits

tree = DecisionTreeClassifier(max_depth=2)

cross_val_score(tree, X_train, y_train, cv=3)

array([0.59118832, 0.57484894, 0.59344906])

In [15]:
# Defcault scoring metric is accuracy, but others can be specified

cross_val_score(tree, X_train, y_train, cv=3, scoring='precision')

array([0.59657894, 0.67405422, 0.58969621])

### Grid Search CV

Sklearn's grid search cross validation (GridSearchCV) class lets us quickly try out many different combinations of hyper parameters.

For our example, we'll try out different for max_depth and max_features with a decision tree classifier.

We'll specify the parameters we wish to use as a dictionary, then use that dictionary when we create the class.

In [22]:
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

tree = DecisionTreeClassifier()

grid = GridSearchCV(tree, params, cv=3, iid=True)

grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(), iid=True,
             param_grid={'max_depth': [2, 3, 4], 'max_features': [None, 1, 3]})

In [23]:
# We can now see the cross validation results in the cv_results_ property of the object we created.

results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

`There are a lot of properties here, but we will focus on two:`

**mean_test_score: the average test scores for each mode**
   
**params: a dictionary containing the parameters used to train each model**


In [24]:
test_scores = results['mean_test_score']
test_scores

array([0.58649541, 0.55459699, 0.58663808, 0.62741955, 0.55855387,
       0.60402234, 0.63373881, 0.58919767, 0.63412065])

In [25]:
params = results['params']
params

[{'max_depth': 2, 'max_features': None},
 {'max_depth': 2, 'max_features': 1},
 {'max_depth': 2, 'max_features': 3},
 {'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 1},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 1},
 {'max_depth': 4, 'max_features': 3}]

In [26]:
# We can combine these features together into a data frame to see how our different models perform:

for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
1,2,1.0,0.554597
4,3,1.0,0.558554
0,2,,0.586495
2,2,3.0,0.586638
7,4,1.0,0.589198
5,3,3.0,0.604022
3,3,,0.62742
6,4,,0.633739
8,4,3.0,0.634121
