In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


import warnings
warnings.filterwarnings('ignore')

In [4]:
## reading in the cars csv as a pandas dataframe

cars = pd.read_csv('cars.csv')
cars.columns = [c.lower() for c in cars]
cars.set_index('id', inplace=True)

## printing shape and previewing data frame

print('{} rows x {} cols\n'.format(*cars.shape))
cars.head()

297899 rows x 8 cols



Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


### Data Prepare

In [7]:
## making average saleprice and gt average columns

cars['avg_saleprice'] = cars.groupby(['year', 'make', 'model']).price.transform('mean')
cars['gt_avg'] = (cars.price > cars.avg_saleprice).astype(int)

In [8]:
cars.head() ## previewing new columns

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model,avg_saleprice,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911,0
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598,0
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786,0


###  Removing Unwanted Features

In [9]:
## removing features we aren't going to use

cars.drop(columns=['price', 'city', 'vin', 'avg_saleprice'], inplace=True)

In [10]:
cars.head() ## previewing new dataframe

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2015,18681,MO,Buick,EncoreConvenience,0
2,2015,27592,IN,Buick,EncoreFWD,0
3,2015,13650,NC,Buick,EncoreLeather,0
4,2015,25195,LA,Buick,EncoreFWD,0
5,2015,22800,NV,Buick,EncoreConvenience,0


In [11]:
## encoding our categorical columns

from sklearn.preprocessing import LabelEncoder

for col in ['state', 'make', 'model', 'year']:
    le = LabelEncoder().fit(cars[col])
    cars[col] = le.transform(cars[col])

In [12]:
cars.head() ## previewing new dataframe

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [13]:
## doing our train test split

X, y = cars.drop(columns='gt_avg'), cars.gt_avg

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [14]:
## splitting our data further

X_train, X_validate, y_train, y_validate = train_test_split(X_train, 
                                                            y_train, 
                                                            test_size=.3333)


#### Cross Validating Decision Tree

In [15]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=2)

cross_val_score(tree, X_train, y_train, cv=3)

array([0.57455205, 0.58921491, 0.59129187])

In [16]:
## scoring precision with cross validation

cross_val_score(tree, X_train, y_train, cv=3, scoring='precision')

array([0.6530837 , 0.58419842, 0.5940618 ])

#### Grid Search Cross Validation

In [17]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

tree = DecisionTreeClassifier()

grid = GridSearchCV(tree, params, cv=3)

grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4], 'max_features': [None, 1, 3]})

In [19]:
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [20]:
test_scores = results['mean_test_score']
test_scores

array([0.58501961, 0.53629937, 0.58260265, 0.63178234, 0.55640812,
       0.61724362, 0.63583555, 0.589312  , 0.62947879])

In [21]:
params = results['params']
params

[{'max_depth': 2, 'max_features': None},
 {'max_depth': 2, 'max_features': 1},
 {'max_depth': 2, 'max_features': 3},
 {'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 1},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 1},
 {'max_depth': 4, 'max_features': 3}]

In [22]:
## building a dataframe of our features

In [23]:
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
1,2,1.0,0.536299
4,3,1.0,0.556408
2,2,3.0,0.582603
0,2,,0.58502
7,4,1.0,0.589312
5,3,3.0,0.617244
8,4,3.0,0.629479
3,3,,0.631782
6,4,,0.635836


#### Cross Validating Random Forest

In [26]:
forest = RandomForestClassifier()

cross_val_score(tree, X_train, y_train, cv=3)

array([0.61199328, 0.61257128, 0.61325101])

In [27]:
## scoring precision with cross validation

cross_val_score(forest, X_train, y_train, cv=3, scoring='precision')

array([0.6549763 , 0.64995671, 0.65221191])

#### Grid Search Cross Validation

In [None]:
params = {'max_leaf_nodes': range(1, 10),
          'max_features': [None, 1, 3]}

tree = DecisionTreeClassifier()

grid = GridSearchCV(forest, params, cv=3)

grid.fit(X_train, y_train)