# Exercises

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

---
## Acquire

In [2]:
# read local csv to df
cars = pd.read_csv('cars.csv')
# lowercase column names
cars.columns = [c.lower() for c in cars]
# set index to id
cars.set_index('id', inplace=True)

print('{} rows x {} cols'.format(*cars.shape))
cars.head()

297899 rows x 8 cols


Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


---
## Prepare

In [3]:
# avg price for specific car
cars['avg_saleprice'] = cars.groupby(['year', 'make', 'model']).price.transform('mean')
# boolean, whether car was sold for more than avg or not
cars['gt_avg'] = (cars.price > cars.avg_saleprice).astype(int)

In [4]:
# remove features that won't be used
cars.drop(columns=['price', 'city', 'vin', 'avg_saleprice'], inplace=True)

In [5]:
# encode categorical variables
from sklearn.preprocessing import LabelEncoder

for col in ['state', 'make', 'model', 'year']:
    le = LabelEncoder().fit(cars[col])
    cars[col] = le.transform(cars[col])

In [6]:
cars.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [80]:
# split data
X, y = cars.drop(columns='gt_avg'), cars.gt_avg

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

In [81]:
# validate and training sets
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=.3333, random_state=123)

In [82]:
round(X_train.shape[0]/len(cars), 2), round(X_validate.shape[0]/len(cars), 2), round(X_test.shape[0]/len(cars), 2)

(0.53, 0.27, 0.2)

In [83]:
X_train.columns.to_list(), y_train.name

(['year', 'mileage', 'state', 'make', 'model'], 'gt_avg')

### Decision Tree

#### Basic Cross Validation

In [11]:
# cross_val_score
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=2)

#default to accuracy
cross_val_score(tree, X_train, y_train, cv=3)

array([0.5915828 , 0.5926891 , 0.59055549])

#### GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

tree = DecisionTreeClassifier()

grid = GridSearchCV(tree, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()



dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [13]:
# property 1
test_scores = results['mean_test_score']

In [14]:
# property 2
params = results['params']

In [15]:
#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
1,2,1.0,0.555634
7,4,1.0,0.556118
4,3,1.0,0.564086
2,2,3.0,0.577379
0,2,,0.591609
5,3,3.0,0.615739
8,4,3.0,0.620164
3,3,,0.628226
6,4,,0.637824


Use the cross validation techniques discussed in the lesson to figure out what kind of model works best with the cars dataset used in the lesson.

### K-Nearest Neighbors

#### Basic Cross Validation

In [16]:
# cross_val_score
knn = KNeighborsClassifier(n_neighbors=5)

#default to accuracy
cross_val_score(knn, X_train, y_train, cv=3)

array([0.55219682, 0.55537933, 0.55492617])

#### GridSearchCV

In [17]:
params = {'n_neighbors': [5, 10, 20]}

knn = KNeighborsClassifier()

grid = GridSearchCV(knn, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()



dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [18]:
# property 1
test_scores = results['mean_test_score']

In [19]:
# property 2
params = results['params']

In [20]:
#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,n_neighbors,score
0,5,0.554167
1,10,0.5642
2,20,0.57235


### Logistic Regression

#### Basic Cross Validation

In [21]:
# cross_val_score
logit = LogisticRegression(random_state=123)

#default to accuracy
cross_val_score(logit, X_train, y_train, cv=3)

array([0.58391707, 0.58997017, 0.58947925])

#### GridSearchCV

In [38]:
params = {'penalty': ['l2', 'l1', 'none'],
          'solver': ['lbfgs', 'liblinear'],
         'C': [1, .0001, 1000]}

logit = LogisticRegression(random_state=123)

grid = GridSearchCV(logit, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: penalty='none' is not supported for the liblinear solver

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
ValueError: penalty='none' is not supported for the liblinear solver

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
ValueError: penalty='none' is not supported for the liblinear solver



dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_penalty', 'param_solver', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [39]:
# property 1
test_scores = results['mean_test_score']

In [40]:
# property 2
params = results['params']

In [41]:
#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,C,penalty,solver,score
0,1.0,l2,lbfgs,0.587789
4,1.0,none,lbfgs,0.587789
10,0.0001,none,lbfgs,0.587789
12,1000.0,l2,lbfgs,0.587789
16,1000.0,none,lbfgs,0.587789
9,0.0001,l1,liblinear,0.587845
7,0.0001,l2,liblinear,0.587883
1,1.0,l2,liblinear,0.587896
13,1000.0,l2,liblinear,0.587896
6,0.0001,l2,lbfgs,0.587921


### Random Forest

#### Basic Cross Validation

In [47]:
from sklearn.ensemble import RandomForestClassifier

# cross_val_score
rf = RandomForestClassifier(random_state=123) 

#default to accuracy
cross_val_score(rf, X_train, y_train, cv=3)

array([0.67171422, 0.67072618, 0.66931007])

#### GridSearchCV

In [56]:
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 3],
         'n_estimators': [8, 10]}

rf = RandomForestClassifier(random_state=123) 

grid = GridSearchCV(rf, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()



dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [57]:
# property 1
test_scores = results['mean_test_score']

In [58]:
# property 2
params = results['params']

In [59]:
#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,n_estimators,score
1,2,,10,0.60757
0,2,,8,0.609899
3,2,3.0,10,0.611988
2,2,3.0,8,0.616791
4,3,,8,0.638013
5,3,,10,0.638492
7,3,3.0,10,0.644206
6,3,3.0,8,0.645704
9,4,,10,0.650399
10,4,3.0,8,0.6505


## Evaluate

In [93]:
# cross_val_score
rf = RandomForestClassifier(max_depth=4,
                            max_features=3,
                            random_state=123) 

# cross_val_score
logit = LogisticRegression(penalty='l1',
                           solver='liblinear',
                           random_state=123)

tree = DecisionTreeClassifier(max_depth=4,
                              random_state=123)

**Train**

In [88]:
#default to accuracy
round(cross_val_score(logit, X_train, y_train, cv=3).mean(), 4)

0.6591

In [89]:
#default to accuracy
round(cross_val_score(rf, X_train, y_train, cv=3).mean(), 4)

0.6554

In [90]:
#default to accuracy
round(cross_val_score(tree, X_train, y_train, cv=3).mean(), 4)

0.6424

In [98]:
logit.fit(X_train, y_train)
rf.fit(X_train, y_train)
tree.fit(X_train, y_train)

print(logit.score(X_train, y_train))
print(rf.score(X_train, y_train))
print(tree.score(X_train, y_train))

0.6592609842214907
0.6577819456594939
0.642462882425875


**Validate**

In [99]:
y_pred_lr = logit.predict(X_validate)
y_pred_rf = rf.predict(X_validate)
y_pred_dt = tree.predict(X_validate)

In [100]:
print(logit.score(X_validate, y_validate))
print(rf.score(X_validate, y_validate))
print(tree.score(X_validate, y_validate))

0.6593438412730386
0.6571658777318965
0.6429776412528956


**Test**

In [101]:
logit.score(X_test, y_test)

0.6571332661967103

> **Conclusion**: the best model I found through cross validation was a Logistic Regression Model (hyperparmeters: penalty='l1', solver='liblinear', all else defaults) the accuracy of this model is correct about 66% of the time.