In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

In [30]:
from sklearn.linear_model import Ridge

In [31]:
# Identifiers for this notebook's model: a file-friendly key and the
# human-readable label used in printed messages.
model_name, display_name = "ridge_regression", "Ridge Regression"

In [32]:
# Test-set targets, read once and reused by every evaluation below.
y_test = pd.read_csv('y_test.csv')

# Array of dataset suffixes (one per feature-selection method).
# The context manager closes the handle once the array has been read,
# instead of leaving it open for the lifetime of the kernel.
with open("Selection Methods","rb") as sel_file:
    sel_methods = np.load(sel_file)

In [33]:
def get_best_model(method, args, _cv, is_sample):
    """Grid-search a Ridge pipeline on one dataset variant and score it on the test set.

    Parameters
    ----------
    method : str
        Suffix inserted into the CSV file names ('x_train<method>...csv',
        'x_test<method>.csv'). An empty string is reported as
        "no feature selection".
    args : dict
        Parameter grid handed to GridSearchCV (keys use the 'clf__' prefix
        of the single pipeline step).
    _cv
        Value forwarded to GridSearchCV's ``cv`` argument.
    is_sample : bool
        When truthy, the training files with the '_sample' suffix are read;
        the test files are never suffixed.

    Returns
    -------
    tuple
        (best estimator, best cross-validation score, test MSE, test MAE,
        test R^2).

    Notes
    -----
    Reads the notebook-level names ``y_test`` and ``display_name``.
    """
    suffix = '_sample' if is_sample else ''
    y_train = pd.read_csv('y_train'+ suffix + '.csv')

    method_string = "no feature selection" if method == '' else method
    print('\n\nRunning cross fold validation for',display_name,'with',method_string,'dataset')

    x_train = pd.read_csv('x_train' + method + suffix + '.csv').values
    x_test = pd.read_csv('x_test' + method + '.csv').values

    # Single-step pipeline so grid keys are addressed as 'clf__<param>'.
    search = GridSearchCV(
        Pipeline([('clf', Ridge())]),
        args,
        cv = _cv,
        verbose = 1,
        scoring = 'r2',
        n_jobs=6,
    )
    search.fit(x_train, y_train.values.ravel())

    winner = search.best_estimator_
    cv_score = search.best_score_
    print('\nBest score for',display_name,'with',method_string,"dataset:",cv_score)

    print('\nBest hyperparameters for',display_name,'with',method_string,'dataset:')
    for param_name, param_value in winner.get_params().items():
        print(param_name,":",param_value)

    # Evaluate the refit best estimator against the held-out targets.
    predictions = search.predict(x_test)
    mse = mean_squared_error(y_test,predictions)
    mae = mean_absolute_error(y_test,predictions)
    r2 = r2_score(y_test,predictions)

    print('\nPrediction scores for',display_name,'using',method_string,':')
    print('Mean Squared error:',mse)
    print('Mean Absolute error:',mae)
    print('R^2:',r2)
    return winner,cv_score,mse,mae,r2

In [34]:
def save_model(model,method):
    """Pickle a fitted model under CrimeGUI/Models/.

    Parameters
    ----------
    model
        The object to serialise with ``pickle.dump``.
    method : str
        Dataset suffix appended to the file name.

    The file name is built from the notebook-level ``model_name`` so that
    models saved here are not written over the 'decision_tree...' files
    (the prefix was previously hard-coded to 'decision_tree').
    """
    with open('CrimeGUI/Models/' + model_name + method,'wb') as model_file:
        pickle.dump(model,model_file)

In [35]:
# Show the available dataset suffixes; the cells below pick them by position.
print(sel_methods)

['_f_regression' '_chi2' '_adaboost' '_equal_crime_and_business'
 '_all_business']


<h3>F-Regression feature selected dataset</h3>

In [36]:
# Baseline configuration: full (non-sample) training files, 10 folds,
# and an empty grid so only the default Ridge settings are evaluated.
is_sample, num_folds = False, 10
args = dict()

In [37]:
# Run the search on the first selection method ('_f_regression') with the
# configuration from the cell above.
f_reg_model, f_reg_train_score, f_reg_mse, f_reg_mae, f_reg_r2 = get_best_model(
    method=sel_methods[0],
    args=args,
    _cv=num_folds,
    is_sample=is_sample,
)



Running cross fold validation for Ridge Regression with _f_regression dataset
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.



Best score for Ridge Regression with _f_regression dataset: 0.7440061247913899

Best hyperparameters for Ridge Regression with _f_regression dataset:
memory : None
steps : [('clf', Ridge())]
verbose : False
clf : Ridge()
clf__alpha : 1.0
clf__copy_X : True
clf__fit_intercept : True
clf__max_iter : None
clf__normalize : False
clf__random_state : None
clf__solver : auto
clf__tol : 0.001

Prediction scores for Ridge Regression using _f_regression :
Mean Squared error: 4.842362147889453
Mean Absolute error: 1.3790856484948688
R^2: 0.751574939883525


[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    1.3s finished


In [38]:
# Search configuration for the F-regression dataset.
is_sample = False   # use the full training files, not the '_sample' ones
num_folds = 10      # forwarded as GridSearchCV's cv

# Exhaustive Ridge grid: 40 * 2 * 2 * 2 * 7 * 1 * 28 = 62,720 candidates.
# NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2;
# this grid assumes an older release - confirm the pinned version.
args = dict(
    clf__alpha=(
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
        1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2,
        2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3,
        3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4,
    ),
    clf__copy_X=(True, False),
    clf__fit_intercept=(True, False),
    clf__normalize=(True, False),
    clf__solver=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'),
    clf__random_state=[7],
    clf__tol=(
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
        1,
    ),
)

In [39]:
# Re-run the F-regression search with the hyperparameter grid defined above;
# this overwrites the results of the earlier baseline run.
f_reg_model, f_reg_train_score, f_reg_mse, f_reg_mae, f_reg_r2 = get_best_model(
    method=sel_methods[0],
    args=args,
    _cv=num_folds,
    is_sample=is_sample,
)



Running cross fold validation for Ridge Regression with _f_regression dataset
Fitting 10 folds for each of 62720 candidates, totalling 627200 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 100 tasks      | elapsed:    0.4s
[Parallel(n_jobs=6)]: Done 1300 tasks      | elapsed:    6.4s
[Parallel(n_jobs=6)]: Done 2808 tasks      | elapsed:   20.1s
[Parallel(n_jobs=6)]: Done 5390 tasks      | elapsed:   43.9s
[Parallel(n_jobs=6)]: Done 9026 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 13482 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done 17944 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done 23486 tasks      | elapsed:  3.4min
[Parallel(n_jobs=6)]: Done 31114 tasks      | elapsed:  4.4min
[Parallel(n_jobs=6)]: Done 38234 tasks      | elapsed:  5.3min
[Parallel(n_jobs=6)]: Done 45196 tasks      | elapsed:  6.4min
[Parallel(n_jobs=6)]: Done 52836 tasks      | elapsed:  7.6min
[Parallel(n_jobs=6)]: Done 60800 tasks      | elapsed:  8.9min
[Parallel(n_jobs=6)]: Done 68438 tasks      | elapsed: 10.1min
[Parallel(n_jobs=6)]: Done 78762 tasks      | ela


Best score for Ridge Regression with _f_regression dataset: 0.7441120478633996

Best hyperparameters for Ridge Regression with _f_regression dataset:
memory : None
steps : [('clf', Ridge(alpha=4, random_state=7, solver='saga', tol=0.09))]
verbose : False
clf : Ridge(alpha=4, random_state=7, solver='saga', tol=0.09)
clf__alpha : 4
clf__copy_X : True
clf__fit_intercept : True
clf__max_iter : None
clf__normalize : False
clf__random_state : 7
clf__solver : saga
clf__tol : 0.09

Prediction scores for Ridge Regression using _f_regression :
Mean Squared error: 4.834785382725304
Mean Absolute error: 1.375431242708831
R^2: 0.7519636465279076


In [61]:
# Persist the best estimator found for the F-regression dataset.
save_model(model=f_reg_model, method=sel_methods[0])

<h3>Chi-Squared feature selected dataset</h3>

In [40]:
# Search configuration for the chi-squared dataset.
is_sample = False   # use the full training files, not the '_sample' ones
num_folds = 10      # forwarded as GridSearchCV's cv

# Exhaustive Ridge grid: 40 * 2 * 2 * 2 * 7 * 1 * 28 = 62,720 candidates.
# NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2;
# this grid assumes an older release - confirm the pinned version.
args = dict(
    clf__alpha=(
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
        1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2,
        2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3,
        3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4,
    ),
    clf__copy_X=(True, False),
    clf__fit_intercept=(True, False),
    clf__normalize=(True, False),
    clf__solver=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'),
    clf__random_state=[7],
    clf__tol=(
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
        1,
    ),
)

In [None]:
# Grid-search Ridge on the second selection method ('_chi2').
chi2_model, chi2_train_score, chi2_mse, chi2_mae, chi2_r2 = get_best_model(
    method=sel_methods[1],
    args=args,
    _cv=num_folds,
    is_sample=is_sample,
)



Running cross fold validation for Ridge Regression with _chi2 dataset
Fitting 10 folds for each of 62720 candidates, totalling 627200 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 100 tasks      | elapsed:    0.4s
[Parallel(n_jobs=6)]: Done 1300 tasks      | elapsed:    7.1s
[Parallel(n_jobs=6)]: Done 2634 tasks      | elapsed:   21.0s
[Parallel(n_jobs=6)]: Done 4324 tasks      | elapsed:   42.2s
[Parallel(n_jobs=6)]: Done 7502 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 11376 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done 16126 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done 20680 tasks      | elapsed:  3.2min
[Parallel(n_jobs=6)]: Done 26550 tasks      | elapsed:  4.1min
[Parallel(n_jobs=6)]: Done 33253 tasks      | elapsed:  5.2min
[Parallel(n_jobs=6)]: Done 40510 tasks      | elapsed:  6.2min
[Parallel(n_jobs=6)]: Done 49182 tasks      | elapsed:  7.5min
[Parallel(n_jobs=6)]: Done 59318 tasks      | elapsed:  9.0min
[Parallel(n_jobs=6)]: Done 70328 tasks      | elapsed: 10.6min
[Parallel(n_jobs=6)]: Done 81564 tasks      | ela

In [62]:
# Persist the best estimator found for the chi-squared dataset.
save_model(model=chi2_model, method=sel_methods[1])

<h3>AdaBoost feature selected dataset</h3>

In [None]:
# Search configuration for the AdaBoost-selected dataset.
is_sample = False   # use the full training files, not the '_sample' ones
num_folds = 10      # forwarded as GridSearchCV's cv

# Exhaustive Ridge grid: 40 * 2 * 2 * 2 * 7 * 1 * 28 = 62,720 candidates.
# NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2;
# this grid assumes an older release - confirm the pinned version.
args = dict(
    clf__alpha=(
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
        1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2,
        2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3,
        3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4,
    ),
    clf__copy_X=(True, False),
    clf__fit_intercept=(True, False),
    clf__normalize=(True, False),
    clf__solver=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'),
    clf__random_state=[7],
    clf__tol=(
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
        1,
    ),
)

In [None]:
# Grid-search Ridge on the third selection method ('_adaboost').
ada_model, ada_train_score, ada_mse, ada_mae, ada_r2 = get_best_model(
    method=sel_methods[2],
    args=args,
    _cv=num_folds,
    is_sample=is_sample,
)

In [63]:
# Persist the best estimator found for the AdaBoost-selected dataset.
save_model(model=ada_model, method=sel_methods[2])

<h3>Equal business and crime feature selected dataset</h3>

In [None]:
# Search configuration for the equal business-and-crime dataset.
is_sample = False   # use the full training files, not the '_sample' ones
num_folds = 10      # forwarded as GridSearchCV's cv

# Exhaustive Ridge grid: 40 * 2 * 2 * 2 * 7 * 1 * 28 = 62,720 candidates.
# NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2;
# this grid assumes an older release - confirm the pinned version.
args = dict(
    clf__alpha=(
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
        1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2,
        2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3,
        3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4,
    ),
    clf__copy_X=(True, False),
    clf__fit_intercept=(True, False),
    clf__normalize=(True, False),
    clf__solver=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'),
    clf__random_state=[7],
    clf__tol=(
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
        1,
    ),
)

In [None]:
# Grid-search Ridge on the fourth selection method ('_equal_crime_and_business').
equal_model, equal_train_score, equal_mse, equal_mae, equal_r2 = get_best_model(
    method=sel_methods[3],
    args=args,
    _cv=num_folds,
    is_sample=is_sample,
)

In [64]:
# Persist the best estimator found for the equal business-and-crime dataset.
save_model(model=equal_model, method=sel_methods[3])

<h3>Business only feature selected dataset</h3>

In [None]:
# Search configuration for the business-only dataset.
is_sample = False   # use the full training files, not the '_sample' ones
num_folds = 10      # forwarded as GridSearchCV's cv

# Exhaustive Ridge grid: 40 * 2 * 2 * 2 * 7 * 1 * 28 = 62,720 candidates.
# NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2;
# this grid assumes an older release - confirm the pinned version.
args = dict(
    clf__alpha=(
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
        1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2,
        2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3,
        3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4,
    ),
    clf__copy_X=(True, False),
    clf__fit_intercept=(True, False),
    clf__normalize=(True, False),
    clf__solver=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'),
    clf__random_state=[7],
    clf__tol=(
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
        1,
    ),
)

In [None]:
# Grid-search Ridge on the fifth selection method ('_all_business').
bus_model, bus_train_score, bus_mse, bus_mae, bus_r2 = get_best_model(
    method=sel_methods[4],
    args=args,
    _cv=num_folds,
    is_sample=is_sample,
)

In [65]:
# Persist the best estimator found for the business-only dataset.
save_model(model=bus_model, method=sel_methods[4])

<h3>Visualise scores</h3>

In [70]:
# Bar labels, in the same order as the per-dataset result rows below.
datasets = ["F-Regression","Chi-Squared","AdaBoost","Equal Business\nand Crime","All Business"]

# One row per dataset: (cross-validation score, test MSE, test MAE, test R^2).
_metrics_by_dataset = [
    (f_reg_train_score, f_reg_mse, f_reg_mae, f_reg_r2),
    (chi2_train_score, chi2_mse, chi2_mae, chi2_r2),
    (ada_train_score, ada_mse, ada_mae, ada_r2),
    (equal_train_score, equal_mse, equal_mae, equal_r2),
    (bus_train_score, bus_mse, bus_mae, bus_r2),
]

# Transpose the rows into one list per metric for plotting.
training_scores, mse_scores, mae_scores, r2_scores = (
    list(column) for column in zip(*_metrics_by_dataset)
)

# x positions of the bars.
y_select = np.arange(len(datasets))

In [10]:
# Bar chart of the best cross-validation score per dataset.
# The title is built from display_name so it names this notebook's model
# (it previously read "Decision tree").
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title(display_name + " training score")
plt.xlabel("Dataset")
plt.ylabel("Training score (r-squared)")
plt.bar(y_select, training_scores)

NameError: name 'plt' is not defined

In [11]:
# Bar chart of the test-set mean-squared error per dataset.
# The title is built from display_name so it names this notebook's model
# (it previously read "Decision tree").
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title(display_name + " testing mean-squared error")
plt.xlabel("Dataset")
plt.ylabel("Mean-squared error")
plt.bar(y_select, mse_scores)

NameError: name 'plt' is not defined

In [12]:
# Bar chart of the test-set mean-absolute error per dataset.
# The title is built from display_name so it names this notebook's model
# (it previously read "Decision tree").
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title(display_name + " testing mean-absolute error")
plt.xlabel("Dataset")
plt.ylabel("Mean absolute error")
plt.bar(y_select, mae_scores)

NameError: name 'plt' is not defined

In [13]:
# Bar chart of the test-set R-squared score per dataset.
# The title is built from display_name so it names this notebook's model
# (it previously read "Decision tree").
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title(display_name + " testing r-squared scores")
plt.xlabel("Dataset")
plt.ylabel("R-Squared Score")
plt.bar(y_select, r2_scores)

NameError: name 'plt' is not defined