# Cross Validation

## Data Prep

In [1]:
import pandas as pd
import env

url = f'mysql+pymysql://{env.user}:{env.password}@{env.host}/numbers'

In [2]:
df = pd.read_sql('SELECT * FROM numbers_with_more_groups', url)
df

Unnamed: 0,n,category,supergroup
0,1,a,one
1,2,b,two
2,3,c,one
3,4,a,two
4,5,b,one
5,6,c,two
6,7,a,one
7,8,b,two
8,9,c,one
9,10,a,two


We can use `.transform` to return a dataframe / series with the same number of rows as the original dataframe. This means we can use `.transform` to add a new column to our data frame, based on some sub-group aggreate.

In [3]:
supergroup_means = df.groupby('supergroup').n.transform('mean')
df['supergroup_mean'] = supergroup_means
df

Unnamed: 0,n,category,supergroup,supergroup_mean
0,1,a,one,5
1,2,b,two,6
2,3,c,one,5
3,4,a,two,6
4,5,b,one,5
5,6,c,two,6
6,7,a,one,5
7,8,b,two,6
8,9,c,one,5
9,10,a,two,6


## Exercise

1. Obtain the `cars.csv` file from the [google classroom](https://classroom.google.com/u/1/c/Mjc3NjgxNDE5NjJa) and read it into python with pandas.
1. Create a feature named `gt_avg`, which should be either 1 or 0. The value should indicate whether or not a given price is greater than the average price for that car's combination of year, make, and model.
1. Drop the `Id`, `City`, and `Vin` columns.
1. Encode the categorical features as necessary. You might wish to use a `sklearn.preprocessing.LabelEncoder`.
1. Split the data into training and test sets.

In [4]:
cars = pd.read_csv('./cars.csv')
cars.columns = [col.lower() for col in cars]
print('%d rows X %d columns' % cars.shape)
cars.head()

297899 rows X 9 columns


Unnamed: 0,id,price,year,mileage,city,state,vin,make,model
0,1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
1,2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
2,3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
3,4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
4,5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [13]:
cars['avg_saleprice'] = cars.groupby(['year', 'make', 'model']).price.transform('mean')
cars['gt_avg'] = (cars.price > cars.avg_saleprice).astype(int)

cars.head()

Unnamed: 0,id,price,year,mileage,city,state,vin,make,model,avg_saleprice,gt_avg
0,1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
1,2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0
2,3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911,0
3,4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598,0
4,5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786,0


In [14]:
cars.drop(columns=['id', 'price', 'avg_saleprice', 'city', 'vin'], inplace=True)

In [18]:
from sklearn.preprocessing import LabelEncoder

for col in ['year', 'state', 'make', 'model']:
    le = LabelEncoder().fit(cars[col])
    cars[col] = le.transform(cars[col])

cars.head()

Unnamed: 0,year,mileage,state,make,model,gt_avg
0,18,18681,28,7,523,0
1,18,27592,19,7,525,0
2,18,13650,32,7,526,0
3,18,25195,22,7,525,0
4,18,22800,38,7,523,0


In [20]:
from sklearn.model_selection import train_test_split

X, y = cars.drop(columns='gt_avg'), cars.gt_avg

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

In [21]:
# The "manual" way
X_ttrain, X_validate, y_ttrain, y_validate = train_test_split(X_train, y_train)

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=4)

cross_val_score(tree, X_train, y_train, cv=3)

array([0.64409617, 0.63872105, 0.64158663])

In [38]:
tree = DecisionTreeClassifier(max_depth=2)
cross_val_score(tree, X_train, y_train, cv=3, scoring='recall')

array([0.43113837, 0.43108457, 0.28007317])

## What happens in the multi-class scenario?

In [48]:
from pydataset import data
from sklearn.metrics import classification_report

iris = data('iris')
iris.columns = [c.lower().replace('.', '_') for c in iris]
iris.head()

X, y = iris.drop(columns='species'), iris.species

tree = DecisionTreeClassifier(max_depth=3)

tree.fit(X, y)

actual = y
predictions = tree.predict(X)

print(classification_report(actual, predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        50
  versicolor       0.98      0.94      0.96        50
   virginica       0.94      0.98      0.96        50

   micro avg       0.97      0.97      0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150



In [50]:
cross_val_score(tree, X, y, scoring='precision_macro')



array([0.98148148, 0.92156863, 0.98039216])

## Grid Search

In [62]:
from sklearn.model_selection import GridSearchCV

hyperparameters = {
    'max_depth': [3, 4, 5],
    'max_features': [None, 2, 3],
}

grid = GridSearchCV(DecisionTreeClassifier(), param_grid=hyperparameters, cv=3, scoring='precision')
grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [3, 4, 5], 'max_features': [None, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='precision', verbose=0)

In [63]:
results = grid.cv_results_

results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'mean_train_score', 'std_train_score'])

In [64]:
scores = results['mean_test_score']
scores

array([0.59311668, 0.61483331, 0.58703022, 0.62607496, 0.59110922,
       0.5983772 , 0.64353578, 0.59749773, 0.62181384])

In [65]:
params = results['params']
params

[{'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 2},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 2},
 {'max_depth': 4, 'max_features': 3},
 {'max_depth': 5, 'max_features': None},
 {'max_depth': 5, 'max_features': 2},
 {'max_depth': 5, 'max_features': 3}]

In [66]:
for s, p in zip(scores, params):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
2,3,3.0,0.58703
4,4,2.0,0.591109
0,3,,0.593117
7,5,2.0,0.597498
5,4,3.0,0.598377
1,3,2.0,0.614833
8,5,3.0,0.621814
3,4,,0.626075
6,5,,0.643536
