In [None]:
import numpy as np
from numpy import arange
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

<h1>Import the relevant model below, then delete this cell</h1>

In [None]:
from sklearn.tree import DecisionTreeRegressor

<h1>Update the model name and display name below, then delete this cell</h1>

In [None]:
# Identifier used as the file-name prefix when save_model pickles a fitted model.
model_name = "decision_tree"
# Human-readable label used in console messages and plot titles.
display_name = "Decision Tree"

In [None]:
# Test-set targets; read as a module-level name by get_best_model when scoring predictions.
y_test = pd.read_csv('y_test.csv')
# "Selection Methods" stores the dataset suffixes that are later indexed as
# sel_methods[0] .. sel_methods[4] (presumably a NumPy .npy array of strings -- confirm).
# The context manager guarantees the handle is closed once the array is in memory.
with open("Selection Methods","rb") as file:
    sel_methods = np.load(file)

<h1>Update the model inside the Pipeline([...]) call in get_best_model below, then delete this cell</h1>

In [None]:
# train and test machine learning model using GridSearchCV on all combinations of provided hyperparameters
def get_best_model(method, args, _cv, is_sample, n_jobs=6):
    """Grid-search the model pipeline on one dataset variant and score it on the test set.

    Training targets are read from ``y_train[_sample].csv``, training features
    from ``x_train<method>[_sample].csv`` and test features from
    ``x_test<method>.csv``. The test targets and the label used in console
    output come from the notebook-level names ``y_test`` and ``display_name``.

    Parameters
    ----------
    method : str
        Suffix that selects the feature-selected dataset files. An empty
        string is reported as "no feature selection".
    args : dict
        Parameter grid handed to ``GridSearchCV``; keys prefixed ``clf__``
        address the model step of the pipeline.
    _cv
        Forwarded unchanged to ``GridSearchCV`` as ``cv`` (the notebook passes
        a fold count).
    is_sample : bool
        When truthy, the ``_sample`` variants of the training files are used.
        The test files are never sampled.
    n_jobs : int, optional
        Number of parallel jobs for the grid search. Defaults to 6, the value
        that used to be hard-coded.

    Returns
    -------
    tuple
        ``(best_estimator, best_cv_score, mse, mae, r2)``; the last three are
        computed from predictions on the test features.
    """
    sample_string = '_sample' if is_sample else ''
    y_train = pd.read_csv('y_train'+ sample_string + '.csv')
    method_string = "no feature selection" if method == '' else method
    print('\n\nRunning cross fold validation for',display_name,'with',method_string,'dataset')
    x_train = pd.read_csv('x_train' + method + sample_string + '.csv').values
    x_test = pd.read_csv('x_test' + method + '.csv').values

    # Template step: swap in the model under test here. The step name 'clf'
    # must match the 'clf__' prefix used in the parameter grid.
    pipe = Pipeline([('clf', DecisionTreeRegressor())])
    grid = GridSearchCV(pipe, args, cv = _cv, verbose = 1, scoring = 'r2', n_jobs=n_jobs)
    # ravel() flattens the single-column target frame to the 1-D shape fit() expects.
    grid.fit(x_train, y_train.values.ravel())
    print('\nBest score for',display_name,'with',method_string,"dataset:",grid.best_score_)

    # get_params() lists every parameter of the refitted pipeline, not only the searched ones.
    best_args = grid.best_estimator_.get_params()
    print('\nBest hyperparameters for',display_name,'with',method_string,'dataset:')
    for arg, value in best_args.items():
        print(arg,":",value)

    # Score the refitted best estimator on the held-out test set.
    y_test_predict = grid.predict(x_test)
    mse = mean_squared_error(y_test,y_test_predict)
    mae = mean_absolute_error(y_test,y_test_predict)
    r2 = r2_score(y_test,y_test_predict)
    print('\nPrediction scores for',display_name,'using',method_string,':')
    print('Mean Squared error:',mse)
    print('Mean Absolute error:',mae)
    print('R^2:',r2)
    return grid.best_estimator_,grid.best_score_,mse,mae,r2

In [None]:
# save trained machine learning model with best hyperparameters
def save_model(model,method):
    """Pickle ``model`` to ``CrimeGUI/Models/<model_name><method>``.

    ``model_name`` is the notebook-level identifier and ``method`` is the
    dataset suffix appended to it to form the file name.
    """
    target_path = 'CrimeGUI/Models/' + model_name + method
    with open(target_path, 'wb') as handle:
        pickle.dump(model, handle)

<h3>F-Regression feature selected dataset</h3>

In [None]:
# Search settings for the F-Regression dataset.
# True -> get_best_model trains on the '_sample' variants of the training CSVs.
is_sample = True
# Passed to get_best_model as the GridSearchCV cv value.
num_folds = 5
# Hyperparameter grid; an empty grid evaluates only the model's default settings.
args = {}

In [None]:
# Grid-search on the F-Regression dataset; keep the fitted model, its best
# cross-validation score and its test-set metrics.
(
    f_reg_model,
    f_reg_train_score,
    f_reg_mse,
    f_reg_mae,
    f_reg_r2,
) = get_best_model(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
#example args
#is_sample = True
#num_folds = 5
#args = {'clf__criterion':('mse','friedman_mse','mae','poisson'),
#        'clf__splitter':('best','random'),
#        'clf__min_samples_split':(30,31,32,33,34,35,36,37,38,39,40),
#        'clf__min_samples_leaf':((arange(10, 20, 1))), #from 10 to 19 (the stop value 20 is excluded), increment by 1
#        'clf__max_features':('sqrt','log2'),
#        'clf__random_state':([7]),
#        'clf__min_impurity_decrease':(0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9)}

In [None]:
# Pickle the tuned F-Regression model under CrimeGUI/Models/.
save_model(model=f_reg_model, method=sel_methods[0])

<h3>Chi-Squared feature selected dataset</h3>

In [None]:
# Search settings for the Chi-Squared dataset.
# True -> get_best_model trains on the '_sample' variants of the training CSVs.
is_sample = True
# Passed to get_best_model as the GridSearchCV cv value.
num_folds = 5
# Hyperparameter grid; an empty grid evaluates only the model's default settings.
args = {}

In [None]:
# Grid-search on the Chi-Squared dataset; keep the fitted model, its best
# cross-validation score and its test-set metrics.
(
    chi2_model,
    chi2_train_score,
    chi2_mse,
    chi2_mae,
    chi2_r2,
) = get_best_model(method=sel_methods[1], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Pickle the tuned Chi-Squared model under CrimeGUI/Models/.
save_model(model=chi2_model, method=sel_methods[1])

<h3>AdaBoost feature selected dataset</h3>

In [None]:
# Search settings for the AdaBoost dataset.
# True -> get_best_model trains on the '_sample' variants of the training CSVs.
is_sample = True
# Passed to get_best_model as the GridSearchCV cv value.
num_folds = 5
# Hyperparameter grid; an empty grid evaluates only the model's default settings.
args = {}

In [None]:
# Grid-search on the AdaBoost dataset; keep the fitted model, its best
# cross-validation score and its test-set metrics.
(
    ada_model,
    ada_train_score,
    ada_mse,
    ada_mae,
    ada_r2,
) = get_best_model(method=sel_methods[2], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Pickle the tuned AdaBoost-dataset model under CrimeGUI/Models/.
save_model(model=ada_model, method=sel_methods[2])

<h3>Equal business and crime feature selected dataset</h3>

In [None]:
# Search settings for the equal business-and-crime dataset.
# True -> get_best_model trains on the '_sample' variants of the training CSVs.
is_sample = True
# Passed to get_best_model as the GridSearchCV cv value.
num_folds = 5
# Hyperparameter grid; an empty grid evaluates only the model's default settings.
args = {}

In [None]:
# Grid-search on the equal business-and-crime dataset; keep the fitted model,
# its best cross-validation score and its test-set metrics.
(
    equal_model,
    equal_train_score,
    equal_mse,
    equal_mae,
    equal_r2,
) = get_best_model(method=sel_methods[3], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Pickle the tuned equal business-and-crime model under CrimeGUI/Models/.
save_model(model=equal_model, method=sel_methods[3])

<h3>Business only feature selected dataset</h3>

In [None]:
# Search settings for the business-only dataset.
# True -> get_best_model trains on the '_sample' variants of the training CSVs.
is_sample = True
# Passed to get_best_model as the GridSearchCV cv value.
num_folds = 5
# Hyperparameter grid; an empty grid evaluates only the model's default settings.
args = {}

In [None]:
# Grid-search on the business-only dataset; keep the fitted model, its best
# cross-validation score and its test-set metrics.
(
    bus_model,
    bus_train_score,
    bus_mse,
    bus_mae,
    bus_r2,
) = get_best_model(method=sel_methods[4], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Pickle the tuned business-only model under CrimeGUI/Models/.
save_model(model=bus_model, method=sel_methods[4])

<h3>Visualise scores</h3>

In [None]:
# One tuple per dataset, in the same order as `datasets`:
# (best cross-validation score, test MSE, test MAE, test R^2).
# zip() regroups the rows into one sequence per metric for plotting.
training_scores, mse_scores, mae_scores, r2_scores = map(list, zip(
    (f_reg_train_score, f_reg_mse, f_reg_mae, f_reg_r2),
    (chi2_train_score, chi2_mse, chi2_mae, chi2_r2),
    (ada_train_score, ada_mse, ada_mae, ada_r2),
    (equal_train_score, equal_mse, equal_mae, equal_r2),
    (bus_train_score, bus_mse, bus_mae, bus_r2),
))
# Tick labels for the bar charts; the embedded newline wraps the long label.
datasets = ["F-Regression","Chi-Squared","AdaBoost","Equal Business\nand Crime","All Business"]
# Integer x positions, one bar per dataset.
y_select = np.arange(len(datasets))

In [None]:
# Bar chart of the best cross-validated r-squared reached on each dataset.
fig, ax = plt.subplots(figsize = (10,10))
ax.set_xticks(y_select)
ax.set_xticklabels(datasets)
# display_name already names the model, so no extra "tree" is appended
# (the old title read "Decision Tree tree training score").
ax.set_title(display_name + " training score")
ax.set_xlabel("Dataset")
ax.set_ylabel("Training score (r-squared)")
ax.bar(y_select, training_scores)

In [None]:
# Bar chart of the test-set mean-squared error for each dataset.
fig, ax = plt.subplots(figsize = (10,10))
ax.set_xticks(y_select)
ax.set_xticklabels(datasets)
ax.set_title(display_name + " testing mean-squared error")
ax.set_xlabel("Dataset")
ax.set_ylabel("Mean-squared error")
ax.bar(y_select, mse_scores)

In [None]:
# Bar chart of the test-set mean-absolute error for each dataset.
fig, ax = plt.subplots(figsize = (10,10))
ax.set_xticks(y_select)
ax.set_xticklabels(datasets)
ax.set_title(display_name + " testing mean-absolute error")
ax.set_xlabel("Dataset")
ax.set_ylabel("Mean absolute error")
ax.bar(y_select, mae_scores)

In [None]:
# Bar chart of the test-set r-squared score for each dataset.
fig, ax = plt.subplots(figsize = (10,10))
ax.set_xticks(y_select)
ax.set_xticklabels(datasets)
ax.set_title(display_name + " testing r-squared scores")
ax.set_xlabel("Dataset")
ax.set_ylabel("R-Squared Score")
ax.bar(y_select, r2_scores)