In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [112]:
# Test-set targets; read by support_vec when scoring predictions.
y_test = pd.read_csv('y_test.csv')
# Load the array of feature-selection suffixes. The context manager closes the
# file handle once the array has been read (it was previously left open).
with open("Selection Methods", "rb") as file:
    sel_methods = np.load(file)

In [113]:
def support_vec(method, args, _cv, is_sample, n_jobs=6):
    """Grid-search an SVR on one feature-selected dataset and score it on the test set.

    Reads 'x_train<method>[_sample].csv', 'y_train[_sample].csv' and
    'x_test<method>.csv' from the working directory, runs a cross-validated
    grid search (scored by R^2) over `args`, prints the results, and evaluates
    the refitted best model against the notebook-level `y_test`.

    Parameters
    ----------
    method : str
        File-name suffix identifying the feature-selection method
        (e.g. '_chi2'); '' is reported as "no feature selection".
    args : dict
        Parameter grid for GridSearchCV; keys address the pipeline step
        'clf' (e.g. 'clf__C').
    _cv : int or cross-validation splitter
        Passed straight to GridSearchCV's `cv`.
    is_sample : bool
        When true, the '_sample' training files are used instead of the
        full training files. The test files are the same either way.
    n_jobs : int, optional
        Number of parallel workers for the grid search (default 6, the
        value that was previously hard-coded).

    Returns
    -------
    tuple
        (best_args, best_cv_score, mse, mae, r2), where best_args is the full
        `get_params()` dict of the best pipeline and the last three values
        are test-set metrics.
    """
    sample_string = '_sample' if is_sample else ''
    y_train = pd.read_csv('y_train' + sample_string + '.csv')
    method_string = method if method != '' else "no feature selection"
    print('\n\nRunning cross fold validation for SVM with', method_string)

    x_train = pd.read_csv('x_train' + method + sample_string + '.csv').values
    x_test = pd.read_csv('x_test' + method + '.csv').values

    pipe = Pipeline([('clf', svm.SVR())])
    svm_grid = GridSearchCV(pipe, args, cv=_cv, verbose=1, scoring='r2', n_jobs=n_jobs)
    # ravel() flattens the single-column target frame into the 1-D array SVR expects.
    svm_grid.fit(x_train, y_train.values.ravel())
    print('\nBest score for SVM with', method_string, ":", svm_grid.best_score_)

    best_args = svm_grid.best_estimator_.get_params()
    # This header introduces the parameter listing; it used to repeat "Best score".
    print('\nBest parameters for SVM with', method_string, ':')
    for name, value in best_args.items():
        print(name, ":", value)

    # y_test is the module-level frame loaded earlier in the notebook.
    y_test_predict = svm_grid.predict(x_test)
    mse = mean_squared_error(y_test, y_test_predict)
    mae = mean_absolute_error(y_test, y_test_predict)
    r2 = r2_score(y_test, y_test_predict)
    print('\nPrediction scores for SVM using', method_string, ':')
    print('Mean Squared error:', mse)
    print('Mean Absolute error:', mae)
    print('R^2:', r2)
    return best_args, svm_grid.best_score_, mse, mae, r2

In [114]:
# Show the feature-selection suffixes loaded into sel_methods; they are indexed by position in the cells below.
print(sel_methods)

['_f_regression' '_chi2' '_adaboost' '_equal_crime_and_business'
 '_all_business']


<h3>F-Regression feature selected dataset</h3>

In [115]:
# Exploratory hyperparameter grid

In [116]:
# Broad first-pass grid for the SVR step ('clf') of the pipeline.
args = dict(
    clf__C=(0.01, 0.5, 1, 1.5, 2, 2.5, 3, 4, 5, 6, 7, 8, 9, 10),
    clf__degree=(1, 2, 3, 4, 5),
    clf__gamma=('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    clf__kernel=('linear', 'poly', 'rbf', 'sigmoid'),
)

In [117]:
# Exploratory run on the sampled training files with 5-fold cross-validation.
is_sample, num_folds = True, 5
support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _f_regression
Fitting 5 folds for each of 3360 candidates, totalling 16800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   21.6s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  6.2min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  9.8min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 14.1min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 19.2min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 25.0min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 31.3min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 38.3min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 46.4min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 55.6min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 65.3min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed: 76.1min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed: 88.1mi


Best score for SVM with _f_regression : 0.7465188614504423

Best score for SVM with _f_regression :
memory : None
steps : [('clf', SVR(C=10, degree=1, gamma=0.3))]
verbose : False
clf : SVR(C=10, degree=1, gamma=0.3)
clf__C : 10
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : 0.3
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _f_regression :
Mean Squared error: 4.997956016935091
Mean Absolute error: 1.3366867110288125
R^2: 0.7435925926135536


({'memory': None,
  'steps': [('clf', SVR(C=10, degree=1, gamma=0.3))],
  'verbose': False,
  'clf': SVR(C=10, degree=1, gamma=0.3),
  'clf__C': 10,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 0.3,
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.7465188614504423,
 4.997956016935091,
 1.3366867110288125,
 0.7435925926135536)

In [94]:
# Optimal C was 10, the last value of the supplied range,
# so larger values of C need exploring to make sure the optimum does not lie beyond it.
# The remaining hyperparameters are fixed at their best values for quicker results.

In [126]:
# Sweep C from 10 to 100 in steps of 10; the other hyperparameters are held at single values.
args = dict(
    clf__C=tuple(range(10, 101, 10)),
    clf__degree=[1],
    clf__gamma=[0.3],
    clf__kernel=['rbf'],
)

In [127]:
# Re-run on the sampled training files with 5-fold cross-validation, using the current args grid.
is_sample, num_folds = True, 5
support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _f_regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   23.7s
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:   29.8s finished



Best score for SVM with _f_regression : 0.746527900097748

Best score for SVM with _f_regression :
memory : None
steps : [('clf', SVR(C=20, degree=1, gamma=0.3))]
verbose : False
clf : SVR(C=20, degree=1, gamma=0.3)
clf__C : 20
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : 0.3
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _f_regression :
Mean Squared error: 5.001849246603076
Mean Absolute error: 1.337356609309337
R^2: 0.7433928603785869


({'memory': None,
  'steps': [('clf', SVR(C=20, degree=1, gamma=0.3))],
  'verbose': False,
  'clf': SVR(C=20, degree=1, gamma=0.3),
  'clf__C': 20,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 0.3,
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.746527900097748,
 5.001849246603076,
 1.337356609309337,
 0.7433928603785869)

In [97]:
# Best C found to be 20, so the best C should lie between 11 and 29 once all
# hyperparameters are searched again. The new C range is defined in the args below and cross-validation is run again.

In [128]:
# Sampled training files, 5 folds, C from 11 to 29 with the full degree/gamma/kernel grid.
is_sample, num_folds = True, 5
args = dict(
    clf__C=tuple(range(11, 30)),
    clf__degree=(1, 2, 3, 4, 5),
    clf__gamma=('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    clf__kernel=('linear', 'poly', 'rbf', 'sigmoid'),
)

In [129]:
# Grid search on the first selection method using the args/num_folds/is_sample currently in scope.
support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _f_regression
Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   18.7s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.8min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  7.0min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed: 11.1min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 16.6min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 22.7min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 29.5min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 37.7min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 46.6min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 55.3min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 64.9min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 75.4min


KeyboardInterrupt: 

In [None]:
# Final hyperparameter search: full training dataset with 10-fold cross-validation.
# Degree is pinned to 1 and the kernel to rbf, as these performed best in all SVM sample testing.
# Gamma is explored to a second decimal place, with a small C window in case gamma shifts it.
# NOTE(review): the completed sample runs shown above report gamma=0.3 with C=10 and C=20,
# none of which fall inside this grid, and the wider re-run was interrupted - confirm these ranges.
is_sample, num_folds = False, 10
args = dict(
    clf__C=tuple(range(22, 27)),
    clf__degree=[1],
    clf__gamma=('auto', 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25),
    clf__kernel=['rbf'],
)

In [None]:
# Keep the F-regression results (best params, CV score, test MSE/MAE/R^2) under f_reg_* names.
(f_reg_best_args,
 f_reg_train_score,
 f_reg_mse,
 f_reg_mae,
 f_reg_r2) = support_vec(method=sel_methods[0], args=args, _cv=num_folds, is_sample=is_sample)

<h3>Chi-Squared feature selected dataset</h3>

In [118]:
# Sampled training files, 5 folds, C from 11 to 29 with the full degree/gamma/kernel grid.
is_sample, num_folds = True, 5
args = dict(
    clf__C=tuple(range(11, 30)),
    clf__degree=(1, 2, 3, 4, 5),
    clf__gamma=('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    clf__kernel=('linear', 'poly', 'rbf', 'sigmoid'),
)

In [119]:
# Grid search on the second selection method using the args/num_folds/is_sample currently in scope.
support_vec(method=sel_methods[1], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _chi2
Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   18.8s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  6.4min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed: 10.0min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 14.5min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 19.8min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 25.9min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 32.8min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 40.5min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 49.2min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 58.6min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 68.8min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed: 79.9min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed: 91.8mi


Best score for SVM with _chi2 : 0.745518865197516

Best score for SVM with _chi2 :
memory : None
steps : [('clf', SVR(C=23, degree=1, gamma='auto'))]
verbose : False
clf : SVR(C=23, degree=1, gamma='auto')
clf__C : 23
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : auto
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _chi2 :
Mean Squared error: 5.036382799671845
Mean Absolute error: 1.342463797953253
R^2: 0.74162120436957


({'memory': None,
  'steps': [('clf', SVR(C=23, degree=1, gamma='auto'))],
  'verbose': False,
  'clf': SVR(C=23, degree=1, gamma='auto'),
  'clf__C': 23,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 'auto',
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.745518865197516,
 5.036382799671845,
 1.342463797953253,
 0.74162120436957)

In [None]:
# Degree is pinned to 1 and the kernel to rbf, as these performed best in all SVM sample testing.
# The best gamma was 'auto', so the whole gamma grid is kept; a moderate C range is used in case gamma affects it.
# 10-fold cross-validation on the full training dataset is run to confirm.
is_sample, num_folds = False, 10
args = dict(
    clf__C=tuple(range(21, 26)),
    clf__degree=[1],
    clf__gamma=('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    clf__kernel=['rbf'],
)

In [None]:
# Keep the chi-squared results (best params, CV score, test MSE/MAE/R^2) under chi2_* names.
(chi2_best_args,
 chi2_train_score,
 chi2_mse,
 chi2_mae,
 chi2_r2) = support_vec(method=sel_methods[1], args=args, _cv=num_folds, is_sample=is_sample)

<h3>AdaBoost feature selected dataset</h3>

In [120]:
# Sampled training files, 5 folds, C from 11 to 29 with the full degree/gamma/kernel grid.
is_sample, num_folds = True, 5
args = dict(
    clf__C=tuple(range(11, 30)),
    clf__degree=(1, 2, 3, 4, 5),
    clf__gamma=('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    clf__kernel=('linear', 'poly', 'rbf', 'sigmoid'),
)

In [121]:
# Grid search on the third selection method using the args/num_folds/is_sample currently in scope.
support_vec(method=sel_methods[2], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _adaboost
Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   18.6s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  6.3min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed: 10.0min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 14.4min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 19.7min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 25.7min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 32.5min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 40.1min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 48.6min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 58.1min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 68.0min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed: 78.6min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed: 90.2mi


Best score for SVM with _adaboost : 0.7437728466614654

Best score for SVM with _adaboost :
memory : None
steps : [('clf', SVR(C=14, degree=1, gamma=0.2))]
verbose : False
clf : SVR(C=14, degree=1, gamma=0.2)
clf__C : 14
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : 0.2
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _adaboost :
Mean Squared error: 5.047681002527233
Mean Absolute error: 1.342480649989959
R^2: 0.7410415788401614


({'memory': None,
  'steps': [('clf', SVR(C=14, degree=1, gamma=0.2))],
  'verbose': False,
  'clf': SVR(C=14, degree=1, gamma=0.2),
  'clf__C': 14,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 0.2,
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.7437728466614654,
 5.047681002527233,
 1.342480649989959,
 0.7410415788401614)

In [None]:
# Final hyperparameter search: full training dataset with 10-fold cross-validation.
# Degree is pinned to 1 and the kernel to rbf, as these performed best in all SVM sample testing.
# Gamma is explored to a second decimal place, with a small C window in case gamma shifts it.
is_sample, num_folds = False, 10
args = dict(
    clf__C=tuple(range(12, 17)),
    clf__degree=[1],
    clf__gamma=('auto', 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25),
    clf__kernel=['rbf'],
)

In [None]:
# Keep the results for the third selection method (best params, CV score, test MSE/MAE/R^2).
# NOTE(review): these are bound to extra_* names although this section is the AdaBoost
# dataset - confirm that any later cells expect these names before renaming.
(extra_best_args,
 extra_train_score,
 extra_mse,
 extra_mae,
 extra_r2) = support_vec(method=sel_methods[2], args=args, _cv=num_folds, is_sample=is_sample)

<h3>Equal business and crime feature selected dataset</h3>

In [122]:
# Sampled training files, 5 folds, C from 11 to 29 with the full degree/gamma/kernel grid.
is_sample, num_folds = True, 5
args = dict(
    clf__C=tuple(range(11, 30)),
    clf__degree=(1, 2, 3, 4, 5),
    clf__gamma=('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    clf__kernel=('linear', 'poly', 'rbf', 'sigmoid'),
)

In [123]:
# Grid search on the fourth selection method using the args/num_folds/is_sample currently in scope.
support_vec(method=sel_methods[3], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _equal_crime_and_business
Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   18.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  6.2min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  9.9min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 14.2min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 19.4min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 25.4min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 32.2min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 39.8min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 48.2min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 57.5min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 67.5min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed: 78.4min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed: 90.2mi


Best score for SVM with _equal_crime_and_business : 0.7408737223136326

Best score for SVM with _equal_crime_and_business :
memory : None
steps : [('clf', SVR(C=18, degree=1, gamma=1))]
verbose : False
clf : SVR(C=18, degree=1, gamma=1)
clf__C : 18
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : 1
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _equal_crime_and_business :
Mean Squared error: 5.309108875068977
Mean Absolute error: 1.367210699212776
R^2: 0.7276296875010118


({'memory': None,
  'steps': [('clf', SVR(C=18, degree=1, gamma=1))],
  'verbose': False,
  'clf': SVR(C=18, degree=1, gamma=1),
  'clf__C': 18,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 1,
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.7408737223136326,
 5.309108875068977,
 1.367210699212776,
 0.7276296875010118)

In [None]:
# Final hyperparameter search: full training dataset with 10-fold cross-validation.
# Degree is pinned to 1 and the kernel to rbf, as these performed best in all SVM sample testing.
# The sample search for this dataset selected C=18 and gamma=1, the largest gamma offered,
# so gamma is explored upward from 1 and C is varied slightly around 18 in case gamma changes it.
# (The previous grid, C 10-14 and gamma 0.25-0.35, contained neither of the selected values.)
is_sample = False
num_folds = 10
args = {'clf__C':(16,17,18,19,20),
           'clf__degree':([1]),
           'clf__gamma':('auto',1,1.5,2,2.5,3,3.5,4,4.5,5),
           'clf__kernel':(['rbf'])}

In [None]:
# Keep the equal crime/business results (best params, CV score, test MSE/MAE/R^2) under equal_* names.
(equal_best_args,
 equal_train_score,
 equal_mse,
 equal_mae,
 equal_r2) = support_vec(method=sel_methods[3], args=args, _cv=num_folds, is_sample=is_sample)

<h3>All business feature selected dataset</h3>

In [124]:
# Sampled training files, 5 folds, C from 11 to 29 with the full degree/gamma/kernel grid.
is_sample, num_folds = True, 5
args = dict(
    clf__C=tuple(range(11, 30)),
    clf__degree=(1, 2, 3, 4, 5),
    clf__gamma=('auto', 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    clf__kernel=('linear', 'poly', 'rbf', 'sigmoid'),
)

In [125]:
# Grid search on the fifth selection method using the args/num_folds/is_sample currently in scope.
support_vec(method=sel_methods[4], args=args, _cv=num_folds, is_sample=is_sample)



Running cross fold validation for SVM with _all_business
Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   20.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.7min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  6.7min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed: 10.6min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 15.2min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 20.9min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 27.3min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 34.6min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 42.8min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed: 52.2min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed: 62.1min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed: 73.1min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed: 84.8min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed: 97.5mi


Best score for SVM with _all_business : 0.5276407815228918

Best score for SVM with _all_business :
memory : None
steps : [('clf', SVR(C=29, degree=1, gamma=1))]
verbose : False
clf : SVR(C=29, degree=1, gamma=1)
clf__C : 29
clf__cache_size : 200
clf__coef0 : 0.0
clf__degree : 1
clf__epsilon : 0.1
clf__gamma : 1
clf__kernel : rbf
clf__max_iter : -1
clf__shrinking : True
clf__tol : 0.001
clf__verbose : False

Prediction scores for SVM using _all_business :
Mean Squared error: 9.187937634552277
Mean Absolute error: 1.6419215737017538
R^2: 0.5286362544765659


({'memory': None,
  'steps': [('clf', SVR(C=29, degree=1, gamma=1))],
  'verbose': False,
  'clf': SVR(C=29, degree=1, gamma=1),
  'clf__C': 29,
  'clf__cache_size': 200,
  'clf__coef0': 0.0,
  'clf__degree': 1,
  'clf__epsilon': 0.1,
  'clf__gamma': 1,
  'clf__kernel': 'rbf',
  'clf__max_iter': -1,
  'clf__shrinking': True,
  'clf__tol': 0.001,
  'clf__verbose': False},
 0.5276407815228918,
 9.187937634552277,
 1.6419215737017538,
 0.5286362544765659)

In [None]:
# Final hyperparameter search: full training dataset with 10-fold cross-validation.
# Degree is pinned to 1 and the kernel to rbf, as these performed best in all SVM sample testing.
# The search picked the largest gamma offered, so gamma is explored further upward;
# C is varied slightly in case gamma changes it.
is_sample, num_folds = False, 10
args = dict(
    clf__C=tuple(range(25, 34)),
    clf__degree=[1],
    clf__gamma=('auto', 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5),
    clf__kernel=['rbf'],
)

In [None]:
# Keep the all-business results (best params, CV score, test MSE/MAE/R^2) under bus_* names.
(bus_best_args,
 bus_train_score,
 bus_mse,
 bus_mae,
 bus_r2) = support_vec(method=sel_methods[4], args=args, _cv=num_folds, is_sample=is_sample)