# Chapter 5: Evaluating Predictive Performance

> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck 
>
> Code included in
>
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) 
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.


## Import required packages

In [None]:
%matplotlib inline
import math
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pylab as plt


!pip install scikit-plot
import scikitplot as skplt
!pip install dmba
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart, adjusted_r2_score, exhaustive_search, backward_elimination, forward_selection, AIC_score, BIC_score




## Load file and generate training and validation sets. 


* load the Toyota Corolla file
* remove variables Model, ID, Price, Fuel Type and Color from the model
* create training and validation data sets
* fit a regression model predicting the price

In [None]:
#run for class
# Load the Toyota Corolla data; the relative path means the CSV must sit in
# the notebook's working directory.
car_df = pd.read_csv("ToyotaCorolla.csv")

In [None]:
car_df.shape

In [None]:
car_df.head()

In [None]:
#run for class
# The outcome column, and the columns that are left out of the model
# (the outcome itself plus the variables the exercise says to remove).
outcome = 'Price'
excluded_columns = (
    'Price',
    'Id',
    'Model',
    'Color',
    'Fuel_Type',
)
# Every remaining column, in its original order, is used as a predictor.
predictors = [column for column in car_df.columns if column not in excluded_columns]

In [None]:
outcome

In [None]:
predictors

In [None]:
#run for class
# Separate the data frame into the predictor columns (X) and the outcome column (y).
X = car_df[predictors]
y = car_df[outcome]

In [None]:
X

In [None]:
#run for class
# Hold out 30% of the rows as the validation partition; the fixed random_state
# makes the split reproducible from run to run.
train_X, valid_X, train_y, valid_y = train_test_split(X,y, test_size=.3, random_state=10)

In [None]:
print(train_X.shape)
valid_X.shape

In [None]:
#run for class
# Fit a linear regression of the outcome on all predictors, using only the
# training partition.
car_lm = LinearRegression()
car_lm.fit(train_X, train_y)

LinearRegression()

In [None]:
car_lm.intercept_

In [None]:
car_lm.coef_

In [None]:
regressionSummary(train_y, car_lm.predict(train_X))

In [None]:
# Predictions of the full model on the training partition, reused by later cells.
pred_y = car_lm.predict(train_X)

In [None]:
pred_y

In [None]:
adjusted_r2_score(train_y, pred_y, car_lm)

In [None]:
# Training-partition predictions next to the actual values and the residuals
# (actual minus predicted).
result_train = pd.DataFrame({'predicted': pred_y,
                       'actual': train_y,
                       'residuals': train_y - pred_y})

In [None]:
result_train

In [None]:
#run for class
# Predictions of the full model on the validation partition (rows not used for fitting).
car_lm_predict = car_lm.predict(valid_X)
# One prediction per validation row.
car_lm_predict.shape

(431,)

In [None]:
#run for class
# Validation-partition predictions next to the actual values and the residual
# (actual minus predicted).
result = pd.DataFrame({'predicted': car_lm_predict,
                       'actual': valid_y,
                       'residual': valid_y - car_lm_predict})

In [None]:
result.head(10)

In [None]:
# error metrics of the full model on the training ("build") data
regressionSummary(train_y, pred_y)

In [None]:
# error metrics of the full model on the validation data
regressionSummary(valid_y, car_lm_predict)

In [None]:
# adjusted R2 of the full model on the training ("build") data
adjusted_r2_score(train_y, pred_y, car_lm)

In [None]:
# adjusted R2 of the full model on the validation data
adjusted_r2_score(valid_y, car_lm_predict, car_lm)

In [None]:
car_df.columns

In [None]:
predictors_2 = ['KM','Automatic','CC','Doors','Boardcomputer','Color']

In [None]:
# Dummy-code the non-numeric columns of the reduced predictor set; drop_first=True
# omits the first level of each encoded column so the dummies are not redundant.
X2 = pd.get_dummies(car_df[predictors_2], drop_first=True)

In [None]:
X2.head()

In [None]:
y2=car_df[outcome]

In [None]:
# Hold out 25% of the rows as the validation partition for the reduced model.
# NOTE(review): the full model was split with test_size=.3 / random_state=10,
# so the two validation partitions contain different rows — confirm that
# comparing the full and reduced models' validation metrics is intended.
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X2, y2, test_size=.25, random_state=123)

In [None]:
# Fit the reduced linear regression on its own training partition.
car_lm2 = LinearRegression()
car_lm2.fit(train_X2, train_y2)

In [None]:
#reduced model validation summary
regressionSummary(valid_y2,car_lm2.predict(valid_X2))

In [None]:
#full model validation summary
regressionSummary(valid_y,car_lm.predict(valid_X))

In [None]:
#reduced model r2
adjusted_r2_score(valid_y2, car_lm2.predict(valid_X2), car_lm2)

In [None]:
#full model r2
adjusted_r2_score(valid_y, car_lm.predict(valid_X), car_lm)

## Create a distribution of the residuals

* calculate the residual values for the training set
* calculate the residual values for the validation set
* create a histogram of the residual values for normality check
* create a boxplot of the residuals for possible outlier check

In [None]:
# Residuals (actual minus predicted) of the full model on each partition. Every
# row is tagged with its partition name so the frames can be stacked and grouped.
pred_error_train = pd.DataFrame({'residual': train_y-car_lm.predict(train_X),
                                 'data_set': 'training'})
pred_error_valid = pd.DataFrame({'residual': valid_y-car_lm.predict(valid_X),
                                 'data_set': 'validation'})

In [None]:
pred_error_train.head(10)

In [None]:
# Stack the training and validation residuals into one frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append was removed in
# pandas 2.0 and raises AttributeError there.
pred_error_both = pd.concat([pred_error_train, pred_error_valid], ignore_index=True)

In [None]:
pred_error_both

In [None]:
# Side-by-side residual histograms on one 10x5 inch figure:
# training residuals on the left axis, validation residuals on the right.
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(10,5)
pred_error_train.hist(ax=ax[0])
pred_error_valid.hist(ax=ax[1])

In [None]:
# Box plots of the residuals, one box per partition, to look for possible outliers.
pred_error_both.boxplot(by='data_set')

## Begin Part 2
## Variable Reduction



*   Perform an exhaustive search for the best model.
*   Perform a backward elimination.
*   Perform a forward selection.
*   Examine the best model and its coefficients.





In [None]:
def train_model(variables):
    """Fit a linear regression on the training data restricted to `variables`.

    Returns the fitted LinearRegression estimator.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly
    return LinearRegression().fit(train_X[variables], train_y)

def score_model(model, variables):
    """Score `model` on the training data as the negated adjusted R2.

    The sign is flipped because the search optimizes the score to be as low
    as possible, whereas a higher adjusted R2 is better.
    """
    return -adjusted_r2_score(train_y, model.predict(train_X[variables]), model)

# Run the exhaustive search over every training column.
# NOTE(review): exhaustive_search presumably fits a model for every subset of
# allVariables, which grows exponentially with the number of columns — confirm
# the run time is acceptable for the full predictor list.
allVariables = train_X.columns
results = exhaustive_search(allVariables, train_model, score_model)

# Build one summary row per model returned by the search. The loop variable is
# named `candidate` (not `result`) so it does not overwrite the `result`
# DataFrame of validation predictions created earlier in the notebook.
data = []
for candidate in results:
    chosen = candidate['variables']
    fitted = candidate['model']
    row = {
        'n': candidate['n'],
        # the search was scored with the negated adjusted R2, so flip the sign back
        'r2adj': -candidate['score'],
        'AIC': AIC_score(train_y, fitted.predict(train_X[chosen]), fitted),
    }
    # one boolean column per variable: True when the model uses that variable
    row.update({var: var in chosen for var in allVariables})
    data.append(row)

# Widen the display only while printing; option_context restores whatever
# width was configured before instead of resetting it to the pandas default.
with pd.option_context('display.width', 100):
    print(pd.DataFrame(data, columns=('n', 'r2adj', 'AIC') + tuple(sorted(allVariables))))

In [None]:
def train_model(variables):
    """Return a linear regression fitted on the `variables` columns of train_X."""
    selected_columns = train_X[variables]
    # fit() returns the estimator itself, so the fitted model is returned directly
    return LinearRegression().fit(selected_columns, train_y)

def score_model(model, variables):
    """Return the AIC_score of `model` on the training data for `variables`."""
    fitted_values = model.predict(train_X[variables])
    return AIC_score(train_y, fitted_values, model)

# Backward elimination over all training columns, using train_model to fit each
# candidate and score_model (AIC) to rank it. verbose=True presumably makes dmba
# report each step — confirm against the dmba documentation.
best_model, best_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)

# Variables kept in the selected model.
print(best_variables)

In [None]:
# Forward selection starts from the constant model (no variables), which
# needs special handling in both train_model and score_model.
def train_model(variables):
    """Fit a linear regression on `variables`; return None when there are none."""
    if len(variables) == 0:
        # constant model: there is nothing to fit
        return None
    # fit() returns the estimator itself, so the fitted model is returned directly
    return LinearRegression().fit(train_X[variables], train_y)

def score_model(model, variables):
    """Return the AIC_score of `model` on the training data.

    With no variables, the constant model is scored instead: every prediction
    is the mean of train_y and AIC_score is called with df=1.
    """
    if len(variables) > 0:
        return AIC_score(train_y, model.predict(train_X[variables]), model)
    constant_prediction = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, constant_prediction, model, df=1)

# Forward selection over all training columns, using train_model to fit each
# candidate and score_model (AIC) to rank it. verbose=True presumably makes dmba
# report each step — confirm against the dmba documentation.
best_model, best_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)

# Variables included in the selected model.
print(best_variables)

In [None]:
# Pair each selected variable with its coefficient, using best_model and
# best_variables from the most recently run selection cell.
# NOTE(review): assumes best_model.coef_ is ordered like best_variables — confirm.
insights = pd.DataFrame({'variable': best_variables,
                       'coefficient': best_model.coef_})
insights