In [3]:
import numpy as np
from numpy import arange
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

In [4]:
from sklearn.linear_model import ElasticNet

In [5]:
model_name = "elastic_net"
display_name = "Elastic Net"

In [6]:
y_test = pd.read_csv('y_test.csv')
file = open("Selection Methods","rb")
sel_methods = np.load(file)

In [7]:
def get_best_model(method, args, _cv, is_sample):
    sample_string = ''
    if (is_sample):
        sample_string = '_sample'
    y_train = pd.read_csv('y_train'+ sample_string + '.csv')    
    method_string = method
    if (method == ''):
        method_string = "no feature selection"
    print('\n\nRunning cross fold validation for',display_name,'with',method_string,'dataset')
    x_train = pd.read_csv('x_train' + method + sample_string + '.csv').values
    x_test = pd.read_csv('x_test' + method + '.csv').values
    pipe = Pipeline([('clf', ElasticNet())])
    grid = GridSearchCV(pipe, args, cv = _cv, verbose = 1, scoring = 'r2', n_jobs=6)
    grid.fit(x_train, y_train.values.ravel())
    print('\nBest score for',display_name,'with',method_string,"dataset:",grid.best_score_)
    best_args = grid.best_estimator_.get_params()
    print('\nBest hyperparameters for',display_name,'with',method_string,'dataset:')
    for arg in best_args:
        print(arg,":",best_args[arg])
    y_test_predict = grid.predict(x_test)
    mse = mean_squared_error(y_test,y_test_predict)
    mae = mean_absolute_error(y_test,y_test_predict)
    r2 = r2_score(y_test,y_test_predict)
    print('\nPrediction scores for',display_name,'using',method_string,':')
    print('Mean Squared error:',mse)
    print('Mean Absolute error:',mae)
    print('R^2:',r2)
    return grid.best_estimator_,grid.best_score_,mse,mae,r2

In [None]:
def save_model(model,method):
    with open('CrimeGUI/Models/'+model_name+method,'wb') as file:
        pickle.dump(model,file)

In [7]:
print(sel_methods)

['_f_regression' '_chi2' '_adaboost' '_equal_crime_and_business'
 '_all_business']


<h3>F-Regression feature selected dataset<h3>

In [12]:
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9),
        'clf__random_state':([7]),
        'clf__tol':(0.001, 0.005,0.01,0.02,0.04,0.06,0.08,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)}

In [13]:
f_reg_model, f_reg_train_score, f_reg_mse, f_reg_mae, f_reg_r2 = get_best_model(sel_methods[0], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _f_regression dataset
Fitting 10 folds for each of 97920 candidates, totalling 979200 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=6)]: Done 1132 tasks      | elapsed:    7.0s
[Parallel(n_jobs=6)]: Done 3132 tasks      | elapsed:   16.5s
[Parallel(n_jobs=6)]: Done 5932 tasks      | elapsed:   29.4s
[Parallel(n_jobs=6)]: Done 9532 tasks      | elapsed:   46.2s
[Parallel(n_jobs=6)]: Done 13932 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done 19132 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 25132 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 31932 tasks      | elapsed:  2.6min
[Parallel(n_jobs=6)]: Done 39532 tasks      | elapsed:  3.1min
[Parallel(n_jobs=6)]: Done 47932 tasks      | elapsed:  3.7min
[Parallel(n_jobs=6)]: Done 57132 tasks      | elapsed:  4.4min
[Parallel(n_jobs=6)]: Done 67132 tasks      | elapsed:  5.3min
[Parallel(n_jobs=6)]: Done 77932 tasks      | elapsed:  6.2min
[Parallel(n_jobs=6)]: Done 89532 tasks      | ela


Best score for Elastic Net with _f_regression dataset: 0.5614682772686381

Best hyperparameters for Elastic Net with _f_regression dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.1, fit_intercept=False, l1_ratio=0.9, normalize=True,
           positive=True, random_state=7, selection='random', tol=0.9))]
verbose : False
clf : ElasticNet(alpha=0.1, fit_intercept=False, l1_ratio=0.9, normalize=True,
           positive=True, random_state=7, selection='random', tol=0.9)
clf__alpha : 0.1
clf__copy_X : True
clf__fit_intercept : False
clf__l1_ratio : 0.9
clf__max_iter : 1000
clf__normalize : True
clf__positive : True
clf__precompute : False
clf__random_state : 7
clf__selection : random
clf__tol : 0.9
clf__warm_start : False

Prediction scores for Elastic Net using _f_regression :
Mean Squared error: 8.488073356509641
Mean Absolute error: 1.540361034917549
R^2: 0.5645410092297589


In [25]:
#poor accuracy, extended parameters
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.00001, 0.0001, 0.001, 0.001, 0.01, 0.0, 1.0, 10.0, 100.0),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0, 5, 0.2)),
        'clf__random_state':([7]),
        'clf__tol':(0.1, 0.3, 0.5, 0.7, 1)}

In [26]:
f_reg_model, f_reg_train_score, f_reg_mse, f_reg_mae, f_reg_r2 = get_best_model(sel_methods[0], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _f_regression dataset
Fitting 10 folds for each of 36000 candidates, totalling 360000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 tasks      | elapsed:    2.1s
[Parallel(n_jobs=6)]: Done 389 tasks      | elapsed:   15.4s
[Parallel(n_jobs=6)]: Done 1356 tasks      | elapsed:   22.2s
[Parallel(n_jobs=6)]: Done 2756 tasks      | elapsed:   31.0s
[Parallel(n_jobs=6)]: Done 4454 tasks      | elapsed:   47.7s
[Parallel(n_jobs=6)]: Done 6282 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done 8051 tasks      | elapsed:  4.2min
[Parallel(n_jobs=6)]: Done 9715 tasks      | elapsed:  7.3min
[Parallel(n_jobs=6)]: Done 15618 tasks      | elapsed:  8.1min
[Parallel(n_jobs=6)]: Done 21434 tasks      | elapsed:  8.8min
[Parallel(n_jobs=6)]: Done 25276 tasks      | elapsed:  9.6min
[Parallel(n_jobs=6)]: Done 28278 tasks      | elapsed: 13.2min
[Parallel(n_jobs=6)]: Done 33970 tasks      | elapsed: 16.3min
[Parallel(n_jobs=6)]: Done 42344 tasks      | elapsed: 17.2min
[Parallel(n_jobs=6)]: Done 45252 tasks      | elapsed


Best score for Elastic Net with _f_regression dataset: 0.7440300478556641

Best hyperparameters for Elastic Net with _f_regression dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.0001, l1_ratio=0.0, random_state=7, tol=0.1))]
verbose : False
clf : ElasticNet(alpha=0.0001, l1_ratio=0.0, random_state=7, tol=0.1)
clf__alpha : 0.0001
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 0.0
clf__max_iter : 1000
clf__normalize : False
clf__positive : False
clf__precompute : False
clf__random_state : 7
clf__selection : cyclic
clf__tol : 0.1
clf__warm_start : False

Prediction scores for Elastic Net using _f_regression :
Mean Squared error: 4.841407678763893
Mean Absolute error: 1.3794726052389663
R^2: 0.7516239064916915


  model = cd_fast.enet_coordinate_descent(


In [9]:
#fine tuning of parameters parameters
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.00005, 0.0001, 0.00015, 0.0002, 0.00025),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0, 0.2, 0.05)),
        'clf__random_state':([7]),
        'clf__tol':(0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.14)}

In [10]:
f_reg_model, f_reg_train_score, f_reg_mse, f_reg_mae, f_reg_r2 = get_best_model(sel_methods[0], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _f_regression dataset
Fitting 10 folds for each of 7040 candidates, totalling 70400 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    1.9s
[Parallel(n_jobs=6)]: Done 290 tasks      | elapsed:   30.2s
[Parallel(n_jobs=6)]: Done 612 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 1586 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 3386 tasks      | elapsed:  2.2min
[Parallel(n_jobs=6)]: Done 5022 tasks      | elapsed:  3.3min
[Parallel(n_jobs=6)]: Done 7400 tasks      | elapsed:  4.5min
[Parallel(n_jobs=6)]: Done 9440 tasks      | elapsed:  5.7min
[Parallel(n_jobs=6)]: Done 12276 tasks      | elapsed:  7.1min
[Parallel(n_jobs=6)]: Done 14888 tasks      | elapsed:  9.1min
[Parallel(n_jobs=6)]: Done 18416 tasks      | elapsed: 10.5min
[Parallel(n_jobs=6)]: Done 21870 tasks      | elapsed: 12.5min
[Parallel(n_jobs=6)]: Done 25694 tasks      | elapsed: 14.2min
[Parallel(n_jobs=6)]: Done 30818 tasks      | elapsed: 17.0min
[Parallel(n_jobs=6)]: Done 35538 tasks      | elapsed:


Best score for Elastic Net with _f_regression dataset: 0.7440699158288866

Best hyperparameters for Elastic Net with _f_regression dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.00025, l1_ratio=0.0, random_state=7, tol=0.13))]
verbose : False
clf : ElasticNet(alpha=0.00025, l1_ratio=0.0, random_state=7, tol=0.13)
clf__alpha : 0.00025
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 0.0
clf__max_iter : 1000
clf__normalize : False
clf__positive : False
clf__precompute : False
clf__random_state : 7
clf__selection : cyclic
clf__tol : 0.13
clf__warm_start : False

Prediction scores for Elastic Net using _f_regression :
Mean Squared error: 4.841623587093962
Mean Absolute error: 1.3804843913274787
R^2: 0.7516128298645747


  model = cd_fast.enet_coordinate_descent(


In [None]:
# minor decrease in test accuracy, model saved to prevent overfitting
save_model(f_reg_model,sel_methods[0])

<h3>Chi-Squared feature selected dataset<h3>

In [14]:
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9),
        'clf__random_state':([7]),
        'clf__tol':(0.001, 0.005,0.01,0.02,0.04,0.06,0.08,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)}

In [15]:
chi2_model, chi2_train_score, chi2_mse, chi2_mae, chi2_r2 = get_best_model(sel_methods[1], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _chi2 dataset
Fitting 10 folds for each of 97920 candidates, totalling 979200 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 100 tasks      | elapsed:    0.7s
[Parallel(n_jobs=6)]: Done 1300 tasks      | elapsed:    6.9s
[Parallel(n_jobs=6)]: Done 3300 tasks      | elapsed:   16.5s
[Parallel(n_jobs=6)]: Done 6100 tasks      | elapsed:   29.8s
[Parallel(n_jobs=6)]: Done 9700 tasks      | elapsed:   47.1s
[Parallel(n_jobs=6)]: Done 14100 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done 19300 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done 25300 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 32100 tasks      | elapsed:  2.6min
[Parallel(n_jobs=6)]: Done 39700 tasks      | elapsed:  3.2min
[Parallel(n_jobs=6)]: Done 48100 tasks      | elapsed:  3.7min
[Parallel(n_jobs=6)]: Done 57300 tasks      | elapsed:  4.4min
[Parallel(n_jobs=6)]: Done 67300 tasks      | elapsed:  5.2min
[Parallel(n_jobs=6)]: Done 78100 tasks      | elapsed:  6.0min
[Parallel(n_jobs=6)]: Done 89700 tasks      | ela


Best score for Elastic Net with _chi2 dataset: 0.5162854607402666

Best hyperparameters for Elastic Net with _chi2 dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.1, fit_intercept=False, l1_ratio=0.9, normalize=True,
           positive=True, random_state=7, tol=0.6))]
verbose : False
clf : ElasticNet(alpha=0.1, fit_intercept=False, l1_ratio=0.9, normalize=True,
           positive=True, random_state=7, tol=0.6)
clf__alpha : 0.1
clf__copy_X : True
clf__fit_intercept : False
clf__l1_ratio : 0.9
clf__max_iter : 1000
clf__normalize : True
clf__positive : True
clf__precompute : False
clf__random_state : 7
clf__selection : cyclic
clf__tol : 0.6
clf__warm_start : False

Prediction scores for Elastic Net using _chi2 :
Mean Squared error: 9.34086411453992
Mean Absolute error: 1.6182698690898771
R^2: 0.5207907508103667


In [27]:
#poor accuracy, extended parameters
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.00001, 0.0001, 0.001, 0.001, 0.01, 0.0, 1.0, 10.0, 100.0),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0, 5, 0.2)),
        'clf__random_state':([7]),
        'clf__tol':(0.1, 0.3, 0.5, 0.7, 1)}

In [28]:
chi2_model, chi2_train_score, chi2_mse, chi2_mae, chi2_r2 = get_best_model(sel_methods[1], args, num_folds, is_sample)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.




Running cross fold validation for Elastic Net with _chi2 dataset
Fitting 10 folds for each of 36000 candidates, totalling 360000 fits


[Parallel(n_jobs=6)]: Done 100 tasks      | elapsed:    0.7s
[Parallel(n_jobs=6)]: Done 706 tasks      | elapsed:   12.9s
[Parallel(n_jobs=6)]: Done 1710 tasks      | elapsed:   18.7s
[Parallel(n_jobs=6)]: Done 3110 tasks      | elapsed:   26.0s
[Parallel(n_jobs=6)]: Done 4984 tasks      | elapsed:   50.5s
[Parallel(n_jobs=6)]: Done 6606 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done 8084 tasks      | elapsed:  3.9min
[Parallel(n_jobs=6)]: Done 9443 tasks      | elapsed:  6.3min
[Parallel(n_jobs=6)]: Done 12174 tasks      | elapsed:  7.3min
[Parallel(n_jobs=6)]: Done 15974 tasks      | elapsed:  7.7min
[Parallel(n_jobs=6)]: Done 20174 tasks      | elapsed:  8.1min
[Parallel(n_jobs=6)]: Done 24528 tasks      | elapsed:  8.7min
[Parallel(n_jobs=6)]: Done 28184 tasks      | elapsed: 12.2min
[Parallel(n_jobs=6)]: Done 32872 tasks      | elapsed: 15.4min
[Parallel(n_jobs=6)]: Done 42748 tasks      | elapsed: 16.6min
[Parallel(n_jobs=6)]: Done 46886 tasks      | elapsed: 20.6min
[P


Best score for Elastic Net with _chi2 dataset: 0.7277697244837357

Best hyperparameters for Elastic Net with _chi2 dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.0, l1_ratio=0.0, positive=True, random_state=7,
           selection='random', tol=0.1))]
verbose : False
clf : ElasticNet(alpha=0.0, l1_ratio=0.0, positive=True, random_state=7,
           selection='random', tol=0.1)
clf__alpha : 0.0
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 0.0
clf__max_iter : 1000
clf__normalize : False
clf__positive : True
clf__precompute : False
clf__random_state : 7
clf__selection : random
clf__tol : 0.1
clf__warm_start : False

Prediction scores for Elastic Net using _chi2 :
Mean Squared error: 5.254872539600692
Mean Absolute error: 1.4527229130594046
R^2: 0.7304121445927596


In [12]:
#fine tuning of parameters parameters
is_sample = False
num_folds = 10
args = {'clf__alpha':(0,0.000001,0.000002,0.000003,0.000004,0.000005),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(0,0.000001,0.000002,0.000003,0.000004,0.000005),
        'clf__random_state':([7]),
        'clf__tol':(0.07,0.08,0.09,0.1,0.11,0.12)}

In [13]:
chi2_model, chi2_train_score, chi2_mse, chi2_mae, chi2_r2 = get_best_model(sel_methods[1], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _chi2 dataset
Fitting 10 folds for each of 6912 candidates, totalling 69120 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 tasks      | elapsed:    1.5s
[Parallel(n_jobs=6)]: Done 298 tasks      | elapsed:   35.4s
[Parallel(n_jobs=6)]: Done 744 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done 1379 tasks      | elapsed:  3.1min
[Parallel(n_jobs=6)]: Done 2138 tasks      | elapsed:  4.9min
[Parallel(n_jobs=6)]: Done 3096 tasks      | elapsed:  7.1min
[Parallel(n_jobs=6)]: Done 4370 tasks      | elapsed:  9.1min
[Parallel(n_jobs=6)]: Done 5924 tasks      | elapsed: 11.6min
[Parallel(n_jobs=6)]: Done 7398 tasks      | elapsed: 15.2min
[Parallel(n_jobs=6)]: Done 9080 tasks      | elapsed: 19.0min
[Parallel(n_jobs=6)]: Done 11162 tasks      | elapsed: 22.2min
[Parallel(n_jobs=6)]: Done 14604 tasks      | elapsed: 25.0min
[Parallel(n_jobs=6)]: Done 18500 tasks      | elapsed: 27.5min
[Parallel(n_jobs=6)]: Done 23270 tasks      | elapsed: 30.0min
[Parallel(n_jobs=6)]: Done 28116 tasks      | elapsed: 3


Best score for Elastic Net with _chi2 dataset: 0.7277697244837357

Best hyperparameters for Elastic Net with _chi2 dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0, l1_ratio=0, positive=True, random_state=7,
           selection='random', tol=0.07))]
verbose : False
clf : ElasticNet(alpha=0, l1_ratio=0, positive=True, random_state=7,
           selection='random', tol=0.07)
clf__alpha : 0
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 0
clf__max_iter : 1000
clf__normalize : False
clf__positive : True
clf__precompute : False
clf__random_state : 7
clf__selection : random
clf__tol : 0.07
clf__warm_start : False

Prediction scores for Elastic Net using _chi2 :
Mean Squared error: 5.254872539600692
Mean Absolute error: 1.4527229130594046
R^2: 0.7304121445927596


In [15]:
# hyperparameters shifted slightly but accuracies unchanged, model saved
save_model(chi2_model,sel_methods[1])

<h3>AdaBoost feature selected dataset<h3>

In [29]:
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.00001, 0.0001, 0.001, 0.001, 0.01, 0.0, 1.0, 10.0, 100.0),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0, 5, 0.2)),
        'clf__random_state':([7]),
        'clf__tol':(0.1, 0.3, 0.5, 0.7, 1)}

In [30]:
ada_model, ada_train_score, ada_mse, ada_mae, ada_r2 = get_best_model(sel_methods[2], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _adaboost dataset
Fitting 10 folds for each of 36000 candidates, totalling 360000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 108 tasks      | elapsed:    1.4s
[Parallel(n_jobs=6)]: Done 706 tasks      | elapsed:   14.1s
[Parallel(n_jobs=6)]: Done 1710 tasks      | elapsed:   20.2s
[Parallel(n_jobs=6)]: Done 3110 tasks      | elapsed:   28.5s
[Parallel(n_jobs=6)]: Done 4984 tasks      | elapsed:   53.4s
[Parallel(n_jobs=6)]: Done 6576 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 8108 tasks      | elapsed:  4.2min
[Parallel(n_jobs=6)]: Done 9568 tasks      | elapsed:  6.6min
[Parallel(n_jobs=6)]: Done 15216 tasks      | elapsed:  7.7min
[Parallel(n_jobs=6)]: Done 21494 tasks      | elapsed:  8.4min
[Parallel(n_jobs=6)]: Done 25642 tasks      | elapsed:  9.2min
[Parallel(n_jobs=6)]: Done 28449 tasks      | elapsed: 12.7min
[Parallel(n_jobs=6)]: Done 33520 tasks      | elapsed: 15.6min
[Parallel(n_jobs=6)]: Done 42200 tasks      | elapsed: 16.4min
[Parallel(n_jobs=6)]: Done 45314 tasks      | elapsed


Best score for Elastic Net with _adaboost dataset: 0.741238992188728

Best hyperparameters for Elastic Net with _adaboost dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.0001, l1_ratio=1.6, random_state=7, tol=0.1))]
verbose : False
clf : ElasticNet(alpha=0.0001, l1_ratio=1.6, random_state=7, tol=0.1)
clf__alpha : 0.0001
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 1.6
clf__max_iter : 1000
clf__normalize : False
clf__positive : False
clf__precompute : False
clf__random_state : 7
clf__selection : cyclic
clf__tol : 0.1
clf__warm_start : False

Prediction scores for Elastic Net using _adaboost :
Mean Squared error: 4.970311483209978
Mean Absolute error: 1.3915036639775946
R^2: 0.7450108250263331


In [17]:
# further hyperparameter exploration
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.00001, 0.0001, 0.001, 0.001, 0.01),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0, 3, 0.2)),
        'clf__random_state':([7]),
        'clf__tol':(0.8,0.9,1,1.1,1.2,1.3,1.5,2,5,10)}

In [18]:
ada_model, ada_train_score, ada_mse, ada_mae, ada_r2 = get_best_model(sel_methods[2], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _adaboost dataset
Fitting 10 folds for each of 24000 candidates, totalling 240000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 tasks      | elapsed:    1.5s
[Parallel(n_jobs=6)]: Done 1132 tasks      | elapsed:    7.2s
[Parallel(n_jobs=6)]: Done 3132 tasks      | elapsed:   16.8s
[Parallel(n_jobs=6)]: Done 5932 tasks      | elapsed:   31.0s
[Parallel(n_jobs=6)]: Done 9532 tasks      | elapsed:   48.3s
[Parallel(n_jobs=6)]: Done 13932 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done 19132 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 25132 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 31932 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done 39532 tasks      | elapsed:  3.1min
[Parallel(n_jobs=6)]: Done 47932 tasks      | elapsed:  3.6min
[Parallel(n_jobs=6)]: Done 54006 tasks      | elapsed:  5.7min
[Parallel(n_jobs=6)]: Done 56936 tasks      | elapsed: 11.1min
[Parallel(n_jobs=6)]: Done 60148 tasks      | elapsed: 15.3min
[Parallel(n_jobs=6)]: Done 71748 tasks      | ela


Best score for Elastic Net with _adaboost dataset: 0.7386751807484371

Best hyperparameters for Elastic Net with _adaboost dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.0001, l1_ratio=0.8, normalize=True, random_state=7, tol=0.8))]
verbose : False
clf : ElasticNet(alpha=0.0001, l1_ratio=0.8, normalize=True, random_state=7, tol=0.8)
clf__alpha : 0.0001
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 0.8
clf__max_iter : 1000
clf__normalize : True
clf__positive : False
clf__precompute : False
clf__random_state : 7
clf__selection : cyclic
clf__tol : 0.8
clf__warm_start : False

Prediction scores for Elastic Net using _adaboost :
Mean Squared error: 5.050528056370888
Mean Absolute error: 1.4084616359489897
R^2: 0.7408955179920332


In [63]:
save_model(ada_model,sel_methods[2])

<h3>Equal business and crime feature selected dataset<h3>

In [31]:
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.00001, 0.0001, 0.001, 0.001, 0.01, 0.0, 1.0, 10.0, 100.0),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0, 5, 0.2)),
        'clf__random_state':([7]),
        'clf__tol':(0.1, 0.3, 0.5, 0.7, 1)}

In [32]:
equal_model, equal_train_score, equal_mse, equal_mae, equal_r2 = get_best_model(sel_methods[3], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _equal_crime_and_business dataset
Fitting 10 folds for each of 36000 candidates, totalling 360000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 107 tasks      | elapsed:    1.0s
[Parallel(n_jobs=6)]: Done 742 tasks      | elapsed:   13.4s
[Parallel(n_jobs=6)]: Done 1746 tasks      | elapsed:   19.2s
[Parallel(n_jobs=6)]: Done 3146 tasks      | elapsed:   26.7s
[Parallel(n_jobs=6)]: Done 4846 tasks      | elapsed:   43.1s
[Parallel(n_jobs=6)]: Done 6661 tasks      | elapsed:  1.7min
[Parallel(n_jobs=6)]: Done 8210 tasks      | elapsed:  4.0min
[Parallel(n_jobs=6)]: Done 9824 tasks      | elapsed:  6.7min
[Parallel(n_jobs=6)]: Done 15570 tasks      | elapsed:  7.3min
[Parallel(n_jobs=6)]: Done 21656 tasks      | elapsed:  8.0min
[Parallel(n_jobs=6)]: Done 25804 tasks      | elapsed:  8.9min
[Parallel(n_jobs=6)]: Done 28902 tasks      | elapsed: 12.7min
[Parallel(n_jobs=6)]: Done 35446 tasks      | elapsed: 15.0min
[Parallel(n_jobs=6)]: Done 42992 tasks      | elapsed: 16.5min
[Parallel(n_jobs=6)]: Done 47106 tasks      | elapsed


Best score for Elastic Net with _equal_crime_and_business dataset: 0.7285933940724345

Best hyperparameters for Elastic Net with _equal_crime_and_business dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.0001, l1_ratio=0.2, random_state=7, tol=0.1))]
verbose : False
clf : ElasticNet(alpha=0.0001, l1_ratio=0.2, random_state=7, tol=0.1)
clf__alpha : 0.0001
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 0.2
clf__max_iter : 1000
clf__normalize : False
clf__positive : False
clf__precompute : False
clf__random_state : 7
clf__selection : cyclic
clf__tol : 0.1
clf__warm_start : False

Prediction scores for Elastic Net using _equal_crime_and_business :
Mean Squared error: 5.134702114199955
Mean Absolute error: 1.4190462571665026
R^2: 0.7365771822835924


In [19]:
# fine tunining
is_sample = False
num_folds = 10
args = {'clf__alpha':(arange(0.00007,0.00013,0.00001),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0.15, 0.25, 0.01)),
        'clf__random_state':([7]),
        'clf__tol':(arange(0.05,0.15, 0.01))}

SyntaxError: invalid syntax (<ipython-input-19-db8abe11bf4f>, line 5)

In [None]:
equal_model, equal_train_score, equal_mse, equal_mae, equal_r2 = get_best_model(sel_methods[3], args, num_folds, is_sample)

In [64]:
save_model(equal_model,sel_methods[3])

<h3>Business only feature selected dataset<h3>

In [37]:
# Business ony produces error on default params therefore reduced set used
is_sample = False
num_folds = 10
args = {'clf__alpha':(0.001, 0.01, 0.0, 1.0),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0.1, 1, 0.1)),
        'clf__random_state':([7]),
        'clf__tol':(0.1, 0.5, 1)}

In [38]:
bus_model, bus_train_score, bus_mse, bus_mae, bus_r2 = get_best_model(sel_methods[4], args, num_folds, is_sample)



Running cross fold validation for Elastic Net with _all_business dataset
Fitting 10 folds for each of 3456 candidates, totalling 34560 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 tasks      | elapsed:    1.5s
[Parallel(n_jobs=6)]: Done 1132 tasks      | elapsed:    7.1s
[Parallel(n_jobs=6)]: Done 3132 tasks      | elapsed:   17.0s
[Parallel(n_jobs=6)]: Done 5932 tasks      | elapsed:   31.5s
[Parallel(n_jobs=6)]: Done 9532 tasks      | elapsed:   49.3s
[Parallel(n_jobs=6)]: Done 13932 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 17846 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 19112 tasks      | elapsed:  3.1min
[Parallel(n_jobs=6)]: Done 20922 tasks      | elapsed:  4.5min
[Parallel(n_jobs=6)]: Done 22745 tasks      | elapsed:  6.0min
[Parallel(n_jobs=6)]: Done 24703 tasks      | elapsed:  7.7min
[Parallel(n_jobs=6)]: Done 28182 tasks      | elapsed:  8.8min
[Parallel(n_jobs=6)]: Done 33622 tasks      | elapsed:  9.3min
[Parallel(n_jobs=6)]: Done 34560 out of 34560 | elapsed:  9.3min finished
  self._final_estimator.fit(Xt, y, **f


Best score for Elastic Net with _all_business dataset: 0.44996200683603715

Best hyperparameters for Elastic Net with _all_business dataset:
memory : None
steps : [('clf', ElasticNet(alpha=0.0, l1_ratio=0.1, normalize=True, random_state=7,
           selection='random', tol=0.1))]
verbose : False
clf : ElasticNet(alpha=0.0, l1_ratio=0.1, normalize=True, random_state=7,
           selection='random', tol=0.1)
clf__alpha : 0.0
clf__copy_X : True
clf__fit_intercept : True
clf__l1_ratio : 0.1
clf__max_iter : 1000
clf__normalize : True
clf__positive : False
clf__precompute : False
clf__random_state : 7
clf__selection : random
clf__tol : 0.1
clf__warm_start : False

Prediction scores for Elastic Net using _all_business :
Mean Squared error: 10.68578982118223
Mean Absolute error: 1.8508237579583582
R^2: 0.45179276195271134


  model = cd_fast.enet_coordinate_descent(


In [None]:
# further hyperparameter exploration
is_sample = False
num_folds = 10
args = {'clf__alpha':(0,0.0001,0.00001),
        'clf__copy_X':(True,False),
        'clf__fit_intercept':(True,False),
        'clf__normalize':(True,False),
        'clf__positive':(True,False),
        'clf__selection':('cyclic','random'),
        'clf__l1_ratio':(arange(0.01,0.1, 0.01)),
        'clf__random_state':([7]),
        'clf__tol':(arange(0.01,0.1, 0.01))}

In [None]:
bus_model, bus_train_score, bus_mse, bus_mae, bus_r2 = get_best_model(sel_methods[4], args, num_folds, is_sample)

In [65]:
save_model(bus_model,sel_methods[4])

<h3>Visualise scores<h3>

In [70]:
training_scores = [f_reg_train_score, chi2_train_score, ada_train_score, equal_train_score, bus_train_score]
mse_scores = [f_reg_mse, chi2_mse, ada_mse, equal_mse, bus_mse]
mae_scores = [f_reg_mae, chi2_mae, ada_mae, equal_mae, bus_mae]
r2_scores = [f_reg_r2, chi2_r2, ada_r2, equal_r2, bus_r2]
datasets = ["F-Regression","Chi-Squared","AdaBoost","Equal Business\nand Crime","All Business"]
y_select = np.arange(len(datasets))

In [10]:
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title("Decision tree training score")
plt.xlabel("Dataset")
plt.ylabel("Training score (r-squared)")
plt.bar(y_select, training_scores)

NameError: name 'plt' is not defined

In [11]:
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title("Decision tree testing mean-squared error")
plt.xlabel("Dataset")
plt.ylabel("Mean-squared error")
plt.bar(y_select, mse_scores)

NameError: name 'plt' is not defined

In [12]:
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title("Decision tree testing mean-absolute error")
plt.xlabel("Dataset")
plt.ylabel("Mean absolute error")
plt.bar(y_select, mae_scores)

NameError: name 'plt' is not defined

In [13]:
plt.figure(figsize = (10,10))
plt.xticks(y_select,datasets)
plt.title("Decision tree testing r-squared scores")
plt.xlabel("Dataset")
plt.ylabel("R-Squared Score")
plt.bar(y_select, r2_scores)

NameError: name 'plt' is not defined