In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [6]:
# Test-set targets; support_vec reads this notebook-level frame when scoring predictions.
y_test = pd.read_csv('y_test.csv')
# Array of feature-selection suffixes (later cells index into it).
# The context manager closes the handle once the array has been read,
# instead of leaving the file open for the life of the kernel.
with open("Selection Methods", "rb") as file:
    sel_methods = np.load(file)

In [7]:
def support_vec(method, args, _cv, is_sample, n_jobs=6):
    """Grid-search an SVR on one feature-selected dataset and score it on the test set.

    Training features are read from ``x_train<method>[_sample].csv``, training
    targets from ``y_train[_sample].csv`` and test features from
    ``x_test<method>.csv``.  Test targets come from the notebook-level
    ``y_test`` frame, which must be loaded before this function is called.

    Parameters
    ----------
    method : str
        File-name suffix identifying the feature-selection variant
        (e.g. ``'_chi2'``); ``''`` is reported as "no feature selection".
    args
        Parameter grid handed to ``GridSearchCV``.  Keys carry the ``clf__``
        prefix because the SVR is the ``'clf'`` step of a pipeline.
    _cv
        Passed to ``GridSearchCV`` as ``cv`` (the notebook passes a fold count).
    is_sample : bool
        When truthy, the ``_sample`` training files are used.  The test
        features are always read without the sample suffix.
    n_jobs : int, optional
        Number of parallel workers for the grid search.  Defaults to 6, the
        value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(best_args, best_cv_score, mse, mae, r2)``.  ``best_args`` is
        ``get_params()`` of the best pipeline, ``best_cv_score`` is
        ``GridSearchCV.best_score_`` (scored with R^2) and the last three
        values are computed from predictions on the test features.
    """
    sample_string = '_sample' if is_sample else ''
    y_train = pd.read_csv('y_train' + sample_string + '.csv')
    method_string = method if method != '' else "no feature selection"
    print('\n\nRunning cross fold validation for SVM with', method_string)

    x_train = pd.read_csv('x_train' + method + sample_string + '.csv').values
    x_test = pd.read_csv('x_test' + method + '.csv').values

    # NOTE(review): the pipeline has no scaling step; presumably the CSVs are
    # already scaled -- confirm, since SVR is sensitive to feature scale.
    pipe = Pipeline([('clf', svm.SVR())])
    svm_grid = GridSearchCV(pipe, args, cv=_cv, verbose=1, scoring='r2', n_jobs=n_jobs)
    svm_grid.fit(x_train, y_train.values.ravel())
    print('\nBest score for SVM with', method_string, ":", svm_grid.best_score_)

    best_args = svm_grid.best_estimator_.get_params()
    # This heading used to repeat "Best score" although the parameter listing follows.
    print('\nBest parameters for SVM with', method_string, ':')
    for name, value in best_args.items():
        print(name, ":", value)

    # y_test is the notebook-level frame of test targets.
    y_test_predict = svm_grid.predict(x_test)
    mse = mean_squared_error(y_test, y_test_predict)
    mae = mean_absolute_error(y_test, y_test_predict)
    r2 = r2_score(y_test, y_test_predict)
    print('\nPrediction scores for SVM using', method_string, ':')
    print('Mean Squared error:', mse)
    print('Mean Absolute error:', mae)
    print('R^2:', r2)
    return best_args, svm_grid.best_score_, mse, mae, r2

In [8]:
# Show the available feature-selection suffixes; later cells pass sel_methods[i] to support_vec
print(sel_methods)

['_f_regression' '_chi2' '_adaboost' '_equal_crime_and_business'
 '_all_business']


<h3>F-Regression feature selected dataset</h3>

In [9]:
# exploratory hyperparameters set

In [10]:
# Coarse first-pass grid over C, polynomial degree, gamma and kernel.
# NOTE(review): 14 x 5 x 12 x 4 = 3360 candidates. scikit-learn documents `degree`
# as used only by the 'poly' kernel, so many of these combinations are presumably
# redundant -- consider a per-kernel list of grids.
args = {
    'clf__C': (0.01, 0.5, 1, 1.5, 2, 2.5, 3, 4, 5, 6, 7, 8, 9, 10),
    'clf__degree': tuple(range(1, 6)),
    'clf__gamma': ('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
}

In [11]:
# Exploratory run: sampled training data, 5 folds, first entry of sel_methods
is_sample, num_folds = True, 5
support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _f_regression
Fitting 5 folds for each of 3360 candidates, totalling 16800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   20.5s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  6.1min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  9.7min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 14.0min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 19.0min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 24.7min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 31.4min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 38.7min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 46.7min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 55.8min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 66.1min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed: 77.6min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed: 89.3mi


Best score for SVM with _f_regression : 0.7465188614504423

Best score for SVM with _f_regression :
memory : None
steps : [('clf', SVR(C=10, degree=1, gamma=0.3))]
verbose : False
clf : SVR(C=10, degree=1, gamma=0.3)
clf__C : 10
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : 0.3
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _f_regression :
Mean Squared error: 4.997956016935091
Mean Absolute error: 1.3366867110288125
R^2: 0.7435925926135536


({'memory': None,
  'steps': [('clf', SVR(C=10, degree=1, gamma=0.3))],
  'verbose': False,
  'clf': SVR(C=10, degree=1, gamma=0.3),
  'clf__C': 10,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 0.3,
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.7465188614504423,
 4.997956016935091,
 1.3366867110288125,
 0.7435925926135536)

In [None]:
# Optimal C determined as 10 (the last value of the range supplied)
# Larger values of C need to be explored to make sure the optimum does not lie beyond that range
# Remaining hyperparameters fixed at their best values for quicker results

In [None]:
# Extend C upwards in steps of 10; degree, gamma and kernel are each pinned to one value
args = {
    'clf__C': tuple(range(10, 101, 10)),
    'clf__degree': [1],
    'clf__gamma': [0.3],
    'clf__kernel': ['rbf'],
}

In [None]:
# Same sampled 5-fold setup, now with whichever grid `args` currently holds
is_sample, num_folds = True, 5
support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Best C found to be 20. It stands to reason that the best C will lie between 11 and 29 once all
# hyperparameters are searched again. A new C range replaces the previous one in args and cross-validation is run again

In [None]:
# Sampled data, 5 folds; C narrowed to the integers 11..29 with the full degree/gamma/kernel grid
is_sample, num_folds = True, 5
args = {
    'clf__C': tuple(range(11, 30)),
    'clf__degree': tuple(range(1, 6)),
    'clf__gamma': ('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
}

In [None]:
# Re-run the search for the first selection method with the current grid
support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Final tuning on the full training set with 10-fold cross-validation.
# Degree is fixed at 1 and the kernel at rbf because these performed best in all SVM sample testing.
# Gamma is explored to one more decimal place; C varies slightly in case the new gamma shifts it.
is_sample, num_folds = False, 10
args = {
    'clf__C': tuple(range(22, 27)),
    'clf__degree': [1],
    'clf__gamma': ('auto', 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25),
    'clf__kernel': ['rbf'],
}

In [None]:
# Keep the final results for the first selection method:
# best pipeline params, best CV score, then test-set MSE, MAE and R^2
(f_reg_best_args,
 f_reg_train_score,
 f_reg_mse,
 f_reg_mae,
 f_reg_r2) = support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)

<h3>Chi-Squared feature selected dataset</h3>

In [12]:
# Exploratory settings: sampled training data, 5 folds, C over 11..29 and all four kernels
is_sample, num_folds = True, 5
args = {
    'clf__C': tuple(range(11, 30)),
    'clf__degree': tuple(range(1, 6)),
    'clf__gamma': ('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
}

In [13]:
# Exploratory search for the second selection method
support_vec(method=sel_methods[1], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _chi2
Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   21.5s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  4.2min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  7.3min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed: 11.6min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 16.9min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 22.9min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 29.6min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 37.3min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 45.8min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 55.6min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 66.1min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 77.4min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed: 89.4min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed: 102.5m


Best score for SVM with _chi2 : 0.73684493877563

Best score for SVM with _chi2 :
memory : None
steps : [('clf', SVR(C=16, degree=1, gamma=0.3))]
verbose : False
clf : SVR(C=16, degree=1, gamma=0.3)
clf__C : 16
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : 0.3
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _chi2 :
Mean Squared error: 5.329514995094013
Mean Absolute error: 1.3735301409165281
R^2: 0.7265828034722794


({'memory': None,
  'steps': [('clf', SVR(C=16, degree=1, gamma=0.3))],
  'verbose': False,
  'clf': SVR(C=16, degree=1, gamma=0.3),
  'clf__C': 16,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 0.3,
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.73684493877563,
 5.329514995094013,
 1.3735301409165281,
 0.7265828034722794)

In [None]:
# Degree is fixed at 1 and the kernel at rbf because these performed best in all SVM sample testing.
# Best gamma was 'auto', so every gamma tuning value is kept; a moderate range of C is used in case gamma affects it.
# 10-fold cross-validation on the full training set is run to confirm.
# NOTE(review): the sample-run output saved in this notebook reports C=16 and gamma=0.3,
# which matches neither the comment above nor the C range below -- confirm which run these values came from.
is_sample, num_folds = False, 10
args = {
    'clf__C': tuple(range(21, 26)),
    'clf__degree': [1],
    'clf__gamma': ('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    'clf__kernel': ['rbf'],
}

In [None]:
# Keep the final results for the second selection method:
# best pipeline params, best CV score, then test-set MSE, MAE and R^2
(chi2_best_args,
 chi2_train_score,
 chi2_mse,
 chi2_mae,
 chi2_r2) = support_vec(method=sel_methods[1], args=args, _cv=num_folds, is_sample=is_sample)

<h3>AdaBoost feature selected dataset</h3>

In [16]:
# Exploratory settings: sampled training data, 5 folds, C over 11..29 and all four kernels
is_sample, num_folds = True, 5
args = {
    'clf__C': tuple(range(11, 30)),
    'clf__degree': tuple(range(1, 6)),
    'clf__gamma': ('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
}

In [None]:
# Exploratory search for the third selection method
support_vec(method=sel_methods[2], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _adaboost
Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   21.6s


In [None]:
# Final tuning on the full training set with 10-fold cross-validation.
# Degree is fixed at 1 and the kernel at rbf because these performed best in all SVM sample testing.
# Gamma is explored to one more decimal place; C varies slightly in case the new gamma shifts it.
is_sample, num_folds = False, 10
args = {
    'clf__C': tuple(range(12, 17)),
    'clf__degree': [1],
    'clf__gamma': ('auto', 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25),
    'clf__kernel': ['rbf'],
}

In [None]:
# Keep the final results for the third selection method:
# best pipeline params, best CV score, then test-set MSE, MAE and R^2
(ada_best_args,
 ada_train_score,
 ada_mse,
 ada_mae,
 ada_r2) = support_vec(method=sel_methods[2], args=args, _cv=num_folds, is_sample=is_sample)

<h3>Equal business and crime feature selected dataset</h3>

In [None]:
# Exploratory settings: sampled training data, 5 folds, C over 11..29 and all four kernels
is_sample, num_folds = True, 5
args = {
    'clf__C': tuple(range(11, 30)),
    'clf__degree': tuple(range(1, 6)),
    'clf__gamma': ('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
}

In [None]:
# Exploratory search for the fourth selection method
support_vec(method=sel_methods[3], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Final tuning on the full training set with 10-fold cross-validation.
# Degree is fixed at 1 and the kernel at rbf because these performed best in all SVM sample testing.
# Gamma is explored to one more decimal place; C varies slightly in case the new gamma shifts it.
is_sample, num_folds = False, 10
args = {
    'clf__C': tuple(range(10, 15)),
    'clf__degree': [1],
    'clf__gamma': ('auto', 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35),
    'clf__kernel': ['rbf'],
}

In [None]:
# Keep the final results for the fourth selection method:
# best pipeline params, best CV score, then test-set MSE, MAE and R^2
(equal_best_args,
 equal_train_score,
 equal_mse,
 equal_mae,
 equal_r2) = support_vec(method=sel_methods[3], args=args, _cv=num_folds, is_sample=is_sample)

<h3>All business feature selected dataset</h3>

In [None]:
# Exploratory settings: sampled training data, 5 folds, C over 11..29 and all four kernels
is_sample, num_folds = True, 5
args = {
    'clf__C': tuple(range(11, 30)),
    'clf__degree': tuple(range(1, 6)),
    'clf__gamma': ('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
}

In [None]:
# Exploratory search for the fifth selection method
support_vec(method=sel_methods[4], args=args, _cv=num_folds, is_sample=is_sample)

In [None]:
# Final tuning on the full training set with 10-fold cross-validation.
# Degree is fixed at 1 and the kernel at rbf because these performed best in all SVM sample testing.
# Gamma is explored beyond 1 because the largest gamma offered was selected; C varies slightly in case gamma shifts it.
is_sample, num_folds = False, 10
args = {
    'clf__C': tuple(range(25, 34)),
    'clf__degree': [1],
    'clf__gamma': ('auto', 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5),
    'clf__kernel': ['rbf'],
}

In [None]:
# Keep the final results for the fifth selection method:
# best pipeline params, best CV score, then test-set MSE, MAE and R^2
(bus_best_args,
 bus_train_score,
 bus_mse,
 bus_mae,
 bus_r2) = support_vec(method=sel_methods[4], args=args, _cv=num_folds, is_sample=is_sample)