# Exercise 11

## Car Price Prediction

Predict if the price of a car is low or high

In [21]:
import numpy as np
import random
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
import math

In [2]:
%matplotlib inline
import pandas as pd

data = pd.read_csv('https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTrain_carListings.zip')
data = data.loc[data['Model'].str.contains('Camry')].drop(['Make', 'State'], axis=1)
data = data.join(pd.get_dummies(data['Model'], prefix='M'))
data['HighPrice'] = (data['Price'] > data['Price'].mean()).astype(int)
data = data.drop(['Model', 'Price'], axis=1)

data.head()

Unnamed: 0,Year,Mileage,M_Camry,M_Camry4dr,M_CamryBase,M_CamryL,M_CamryLE,M_CamrySE,M_CamryXLE,HighPrice
15,2016,29242,0,0,0,0,1,0,0,1
47,2015,26465,0,0,0,0,1,0,0,1
85,2012,46739,0,1,0,0,0,0,0,1
141,2017,41722,0,0,0,0,0,1,0,1
226,2014,77669,0,0,0,0,0,0,1,0


In [3]:
data.shape

(13150, 10)

In [4]:
y = data['HighPrice']
X = data.drop(['HighPrice'], axis=1)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [6]:
def gini(y):
    if y.shape[0] == 0:
        return 0
    else:
        return 1 - (y.mean()**2 + (1 - y.mean())**2)

In [7]:
def gini_impurity(X_col, y, split):
    "Calculate the gain of an split k on feature j"
    
    filter_l = X_col < split
    y_l = y.loc[filter_l]
    y_r = y.loc[~filter_l]
    
    n_l = y_l.shape[0]
    n_r = y_r.shape[0]
    
    gini_y = gini(y)
    gini_l = gini(y_l)
    gini_r = gini(y_r)
    
    gini_impurity_ = gini_y - (n_l / (n_l + n_r) * gini_l + n_r / (n_l + n_r) * gini_r)
    
    return gini_impurity_

In [8]:
def best_split(X, y, num_pct=10):
    
    features = range(X.shape[1])
    
    best_split = [0, 0, 0]  # j, split, gain
    
    # For all features
    for j in features:
        
        splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / (num_pct+1)).tolist())
        splits = np.unique(splits)[1:]
        
        # For all splits
        for split in splits:
            gain = gini_impurity(X.iloc[:, j], y, split)
                        
            if gain > best_split[2]:
                best_split = [j, split, gain]
    
    return best_split

In [18]:
def tree_grow(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10, n_features=None, seed=None):
    
    #print (level)
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    
    if seed is not None:
        np.random.seed(seed)
    
    # Calculate the best split
    if n_features is not None:
        samples = np.random.choice(a=len(X.columns)-1, size=n_features, replace=False) 
       # print ("sal",samples)
        j, split, gain = best_split(X.iloc[:,samples], y,  num_pct)
        j = samples [j]
    else:
        j, split, gain = best_split(X, y, num_pct)
    
   
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree  
    
    # No stooping criteria was meet, then continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,n_features=n_features, seed=seed)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,n_features=n_features, seed=seed)
    
    return tree

In [10]:
# Aquí no se dividió X_train y test, por eso se metió el X. Debería ser X_test
def tree_predict(X, tree, proba=False):
    
    predicted = np.ones(X.shape[0])

    # Check if final node
    if tree['split'] == -1:
        if not proba:
            predicted = predicted * tree['y_pred']
        else:
            predicted = predicted * tree['y_prob']
            
    else:
        
        j, split = tree['split']
        filter_l = (X.iloc[:, j] < split)
        X_l = X.loc[filter_l]
        X_r = X.loc[~filter_l]

        if X_l.shape[0] == 0:  # If left node is empty only continue with right
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
        elif X_r.shape[0] == 0:  # If right node is empty only continue with left
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
        else:
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)

    return predicted

# Exercise 11.1

Estimate a Decision Tree Classifier Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

In [23]:
tree = tree_grow(X_train, y_train, level=0, min_gain=0.001, max_depth=None, num_pct=10)

In [24]:
y_pred = tree_predict(X_test, tree)

In [25]:
metrics.accuracy_score(y_test, y_pred)

0.8433179723502304

# Exercise 11.2

Estimate a Bagging of 10 Decision Tree Classifiers Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

In [14]:
np.random.seed(7351)
n_samples = X_train.shape[0]
n_B = 10

samples = [np.random.choice(a=n_samples, size=n_samples, replace=True) for _ in range(1, n_B +1 )]

In [15]:
y_pred = pd.DataFrame(index=X_test.index, columns=[list(range(n_B))])
for i, sample in enumerate(samples):
    X_trainB = X_train.iloc[sample]
    y_trainB = y_train.iloc[sample]
    tree = tree_grow(X_trainB, y_trainB, level=0, min_gain=0.001, max_depth=None, num_pct=10)
    y_pred[i] = tree_predict(X_test, tree)

In [16]:
y_pred_ = (y_pred.sum(axis=1) >= (10 / 2)).astype(np.int)

print(f'''Accuracy Score: {metrics.accuracy_score(y_pred_, y_test)}''')

Accuracy Score: 0.8555299539170507


# Exercise 11.3

Implement the variable max_features on the Decision Tree Classifier created in 11.1.

Compare the impact in the results by varing the parameter max_features

Evaluate the accuracy on the testing set

In [29]:
for i in range(1,6):
    tree = tree_grow(X_train, y_train, level=0, min_gain=0.001, max_depth=5, num_pct=10, n_features=i, seed=12)
    y_pred =tree_predict(X_test, tree)
    print(f'''Accuracy Score: {round(metrics.accuracy_score(y_test , y_pred),3)} con n_features = {i}''')
    print('-'*20)

Accuracy Score: 0.582 con n_features = 1
--------------------
Accuracy Score: 0.582 con n_features = 2
--------------------
Accuracy Score: 0.857 con n_features = 3
--------------------
Accuracy Score: 0.859 con n_features = 4
--------------------
Accuracy Score: 0.874 con n_features = 5
--------------------


# Exercise 11.4

Estimate a Bagging of 10 Decision Tree Classifiers with `max_features = log(n_features)`

Evaluate the accuracy on the testing set

In [32]:
features = int(round(math.log(X_train.shape[1]),0))
accuracy_score = []
for i in range(10):
    X_train_n = X_train.iloc[samples[i],:]
    y_train_n = y_train.iloc[samples[i]]
    tree = tree_grow(X_train_n, y_train_n, level=0, min_gain=0.001, max_depth=5, num_pct=10, n_features=features)
    y_pred=tree_predict(X_test, tree)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    accuracy_score.append(accuracy)

accuracy_score=pd.DataFrame(accuracy_score,columns=['Accuracy_Score'])
print(accuracy_score.loc[accuracy_score['Accuracy_Score'].idxmax()])

Accuracy_Score    0.863825
Name: 0, dtype: float64


# Exercise 11.5

Using sklearn, train a RandomForestClassifier

Evaluate the accuracy on the testing set

In [17]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
y_pred = rf.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.8377880184331797

# Exercise 11.6

Find the best parameters of the RandomForestClassifier (max_depth, max_features, n_estimators)

Evaluate the accuracy on the testing set

In [19]:
max_depth_range = range(1, 21)
accuracy_scores = []
for depth in max_depth_range:
    rf = RandomForestClassifier(max_depth=depth, random_state=7531)
    accuracy_scores.append([depth, cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy').mean()])

In [20]:
accuracy_scores = pd.DataFrame(accuracy_scores, columns=['Depth', 'Accuracy Score'])
accuracy_scores.loc[accuracy_scores['Accuracy Score'].idxmax()]

Depth             8.000000
Accuracy Score    0.883764
Name: 7, dtype: float64

In [21]:
feature_range = range(1, len(X.columns)+1)
accuracy_scores = []
for feature in feature_range:
    rf = RandomForestClassifier(max_features=feature, random_state=7531, n_jobs=-1)
    accuracy_scores.append([feature, cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy').mean()])

In [22]:
accuracy_scores = pd.DataFrame(accuracy_scores, columns=['N Features', 'Accuracy Score'])
accuracy_scores.loc[accuracy_scores['Accuracy Score'].idxmax()]

N Features        7.00000
Accuracy Score    0.83984
Name: 6, dtype: float64

In [23]:
estimator_range = range(10, 310, 10)
accuracy_scores = []
for estimator in estimator_range:
    rf = RandomForestClassifier(n_estimators=estimator, random_state=7531, n_jobs=-1)
    accuracy_scores.append([estimator, cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy').mean()])

In [24]:
accuracy_scores = pd.DataFrame(accuracy_scores, columns=['N Estimators', 'Accuracy Score'])
accuracy_scores.loc[accuracy_scores['Accuracy Score'].idxmax()]

N Estimators      20.000000
Accuracy Score     0.837455
Name: 1, dtype: float64

In [25]:
accuracy_scores = []
rf = RandomForestClassifier(max_depth=8, max_features=7, n_estimators=20,random_state=7531, n_jobs=-1)
accuracy_scores.append([cross_val_score(rf, X_train, y_train, cv=100, scoring='accuracy').mean()])

In [26]:
accuracy_scores

[[0.8812297177509305]]

In [27]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=7531, verbose=0,
            warm_start=False)

In [28]:
y_pred = rf.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.8794930875576037