In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

In [45]:
# Ground-truth targets for the held-out test set; get_best_model reads this
# module-level frame when computing its test metrics.
y_test = pd.read_csv('y_test.csv')

# Feature-selection method entries; later cells index into this array and the
# chosen entry is spliced into the x_train/x_test CSV file names.
# The context manager closes the handle once the array has been read
# (the original left the file open for the lifetime of the kernel).
with open("Selection Methods", "rb") as file:
    sel_methods = np.load(file)

# Identifier and human-readable label for this model; display_name is used in
# the progress/score messages printed by get_best_model.
model_name = "decision_tree"
display_name = "Decision Tree"

In [46]:
def get_best_model(method, args, _cv, is_sample, n_jobs=6):
    """Grid-search a DecisionTreeRegressor on one dataset and score it on the test set.

    Reads the module-level ``display_name`` (for messages) and ``y_test``
    (ground truth for the test metrics).

    Parameters
    ----------
    method : str
        Suffix spliced into the CSV file names (``'x_train' + method + ...``).
        An empty string is reported as "no feature selection".
    args : dict
        Parameter grid handed to ``GridSearchCV``. Keys need the ``clf__``
        prefix because the regressor is the pipeline step named ``'clf'``.
    _cv
        Value forwarded as ``cv`` to ``GridSearchCV``.
    is_sample : bool
        When truthy, the ``'_sample'`` variants of the training files are
        read. The test file name never carries that suffix.
    n_jobs : int, optional
        Number of parallel workers forwarded to ``GridSearchCV``. Defaults to
        6, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(best_estimator, best_cv_score, mse, mae, r2)`` where
        ``best_cv_score`` is ``grid.best_score_`` (scored with ``'r2'``) and
        the last three are computed on the test set.
    """
    sample_string = '_sample' if is_sample else ''
    method_string = method if method != '' else "no feature selection"

    y_train = pd.read_csv('y_train' + sample_string + '.csv')
    print('\n\nRunning cross fold validation for',display_name,'with',method_string,'dataset')
    x_train = pd.read_csv('x_train' + method + sample_string + '.csv').values
    # Test features are always the full (non-sample) file for this method.
    x_test = pd.read_csv('x_test' + method + '.csv').values

    pipe = Pipeline([('clf', DecisionTreeRegressor())])
    grid = GridSearchCV(pipe, args, cv = _cv, verbose = 1, scoring = 'r2', n_jobs=n_jobs)
    # ravel() flattens the single-column target frame to the 1-D shape fit expects.
    grid.fit(x_train, y_train.values.ravel())

    print('\nBest score for',display_name,'with',method_string,"dataset:",grid.best_score_)
    # get_params() lists every parameter of the winning pipeline, not only the tuned ones.
    best_args = grid.best_estimator_.get_params()
    print('\nBest hyperparameters for',display_name,'with',method_string,'dataset:')
    for arg in best_args:
        print(arg,":",best_args[arg])

    # Score the best estimator on the test set.
    y_test_predict = grid.predict(x_test)
    mse = mean_squared_error(y_test,y_test_predict)
    mae = mean_absolute_error(y_test,y_test_predict)
    r2 = r2_score(y_test,y_test_predict)
    print('\nPrediction scores for',display_name,'using',method_string,':')
    print('Mean Squared error:',mse)
    print('Mean Absolute error:',mae)
    print('R^2:',r2)
    return grid.best_estimator_,grid.best_score_,mse,mae,r2

In [None]:
# Show the loaded selection-method entries that the cells below index into.
print(sel_methods)

<h3>F-Regression feature selected dataset</h3>

In [49]:
# Train on the '_sample' training files and cross-validate with 5 folds.
is_sample = True
num_folds = 5

# Hyperparameter grid for the pipeline step named 'clf' (hence the 'clf__' prefix).
# 4 * 2 * 21 * 23 * 2 * 1 * 11 = 85,008 candidate combinations per search.
# NOTE(review): the 'mse'/'mae' criterion names are version-dependent in
# scikit-learn -- confirm against the installed release before upgrading.
args = {
    'clf__criterion': ('mse', 'friedman_mse', 'mae', 'poisson'),
    'clf__splitter': ('best', 'random'),
    'clf__min_samples_split': (2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                               55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__min_samples_leaf': (1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                              55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__max_features': ('sqrt', 'log2'),
    # Single fixed seed so every candidate is fitted with the same random state.
    'clf__random_state': [7],
    'clf__min_impurity_decrease': (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
}

In [None]:
# Grid-search on the dataset chosen by sel_methods[0]. Note that
# f_reg_train_score receives grid.best_score_, i.e. the mean cross-validated
# R^2, while the remaining three values are test-set metrics.
(
    f_reg_model,
    f_reg_train_score,
    f_reg_mse,
    f_reg_mae,
    f_reg_r2,
) = get_best_model(sel_methods[0], args, num_folds, is_sample)

<h3>Chi-Squared feature selected dataset</h3>

In [None]:
# Train on the '_sample' training files and cross-validate with 5 folds.
is_sample = True
num_folds = 5

# Hyperparameter grid for the pipeline step named 'clf' (hence the 'clf__' prefix).
# 4 * 2 * 21 * 23 * 2 * 1 * 11 = 85,008 candidate combinations per search.
# NOTE(review): the 'mse'/'mae' criterion names are version-dependent in
# scikit-learn -- confirm against the installed release before upgrading.
args = {
    'clf__criterion': ('mse', 'friedman_mse', 'mae', 'poisson'),
    'clf__splitter': ('best', 'random'),
    'clf__min_samples_split': (2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                               55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__min_samples_leaf': (1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                              55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__max_features': ('sqrt', 'log2'),
    # Single fixed seed so every candidate is fitted with the same random state.
    'clf__random_state': [7],
    'clf__min_impurity_decrease': (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
}

In [None]:
# Grid-search on the dataset chosen by sel_methods[1]. Note that
# chi2_train_score receives grid.best_score_, i.e. the mean cross-validated
# R^2, while the remaining three values are test-set metrics.
(
    chi2_model,
    chi2_train_score,
    chi2_mse,
    chi2_mae,
    chi2_r2,
) = get_best_model(sel_methods[1], args, num_folds, is_sample)

<h3>AdaBoost feature selected dataset</h3>

In [None]:
# Train on the '_sample' training files and cross-validate with 5 folds.
is_sample = True
num_folds = 5

# Hyperparameter grid for the pipeline step named 'clf' (hence the 'clf__' prefix).
# 4 * 2 * 21 * 23 * 2 * 1 * 11 = 85,008 candidate combinations per search.
# NOTE(review): the 'mse'/'mae' criterion names are version-dependent in
# scikit-learn -- confirm against the installed release before upgrading.
args = {
    'clf__criterion': ('mse', 'friedman_mse', 'mae', 'poisson'),
    'clf__splitter': ('best', 'random'),
    'clf__min_samples_split': (2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                               55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__min_samples_leaf': (1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                              55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__max_features': ('sqrt', 'log2'),
    # Single fixed seed so every candidate is fitted with the same random state.
    'clf__random_state': [7],
    'clf__min_impurity_decrease': (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
}

In [50]:
# Grid-search on the dataset chosen by sel_methods[2]. Note that
# ada_train_score receives grid.best_score_, i.e. the mean cross-validated
# R^2, while the remaining three values are test-set metrics.
# NOTE(review): the recorded output for this cell reports the `_mutual_info`
# dataset although the section heading says AdaBoost -- confirm the order of
# sel_methods.
(
    ada_model,
    ada_train_score,
    ada_mse,
    ada_mae,
    ada_r2,
) = get_best_model(sel_methods[2], args, num_folds, is_sample)



Running cross fold validation for decision tree with _mutual_info dataset
Fitting 5 folds for each of 85008 candidates, totalling 425040 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 100 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 2420 tasks      | elapsed:    3.7s
[Parallel(n_jobs=6)]: Done 9780 tasks      | elapsed:   11.1s
[Parallel(n_jobs=6)]: Done 32180 tasks      | elapsed:   28.1s
[Parallel(n_jobs=6)]: Done 60980 tasks      | elapsed:   50.6s
[Parallel(n_jobs=6)]: Done 96180 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done 137780 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 185780 tasks      | elapsed:  2.9min
[Parallel(n_jobs=6)]: Done 213804 tasks      | elapsed:  5.9min
[Parallel(n_jobs=6)]: Done 214754 tasks      | elapsed:  7.6min
[Parallel(n_jobs=6)]: Done 215804 tasks      | elapsed:  9.4min
[Parallel(n_jobs=6)]: Done 216954 tasks      | elapsed: 11.3min
[Parallel(n_jobs=6)]: Done 218204 tasks      | elapsed: 12.9min
[Parallel(n_jobs=6)]: Done 219554 tasks      | elapsed: 14.5min
[Parallel(n_jobs=6)]: Done 221004 tasks


Best score for decision tree with _mutual_info dataset: 0.7274892860218688

Best hyperparameters for decision tree with _mutual_info dataset:
memory : None
steps : [('clf', DecisionTreeRegressor(criterion='friedman_mse', max_features='sqrt',
                      min_impurity_decrease=0.1, min_samples_leaf=35,
                      min_samples_split=95, random_state=7))]
verbose : False
clf : DecisionTreeRegressor(criterion='friedman_mse', max_features='sqrt',
                      min_impurity_decrease=0.1, min_samples_leaf=35,
                      min_samples_split=95, random_state=7)
clf__ccp_alpha : 0.0
clf__criterion : friedman_mse
clf__max_depth : None
clf__max_features : sqrt
clf__max_leaf_nodes : None
clf__min_impurity_decrease : 0.1
clf__min_impurity_split : None
clf__min_samples_leaf : 35
clf__min_samples_split : 95
clf__min_weight_fraction_leaf : 0.0
clf__presort : deprecated
clf__random_state : 7
clf__splitter : best

Prediction scores for Decision Tree using _mutual_info

<h3>Equal business and crime feature selected dataset</h3>

In [None]:
# Train on the '_sample' training files and cross-validate with 5 folds.
is_sample = True
num_folds = 5

# Hyperparameter grid for the pipeline step named 'clf' (hence the 'clf__' prefix).
# 4 * 2 * 21 * 23 * 2 * 1 * 11 = 85,008 candidate combinations per search.
# NOTE(review): the 'mse'/'mae' criterion names are version-dependent in
# scikit-learn -- confirm against the installed release before upgrading.
args = {
    'clf__criterion': ('mse', 'friedman_mse', 'mae', 'poisson'),
    'clf__splitter': ('best', 'random'),
    'clf__min_samples_split': (2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                               55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__min_samples_leaf': (1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                              55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__max_features': ('sqrt', 'log2'),
    # Single fixed seed so every candidate is fitted with the same random state.
    'clf__random_state': [7],
    'clf__min_impurity_decrease': (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
}

In [None]:
# Grid-search on the dataset chosen by sel_methods[3]. Note that
# equal_train_score receives grid.best_score_, i.e. the mean cross-validated
# R^2, while the remaining three values are test-set metrics.
(
    equal_model,
    equal_train_score,
    equal_mse,
    equal_mae,
    equal_r2,
) = get_best_model(sel_methods[3], args, num_folds, is_sample)

exception calling callback for <Future at 0x173165fe490 state=finished returned list>
Traceback (most recent call last):
  File "c:\users\chris\appdata\local\programs\python\python39\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "c:\users\chris\appdata\local\programs\python\python39\lib\site-packages\joblib\parallel.py", line 366, in __call__
    self.parallel.dispatch_next()
  File "c:\users\chris\appdata\local\programs\python\python39\lib\site-packages\joblib\parallel.py", line 799, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "c:\users\chris\appdata\local\programs\python\python39\lib\site-packages\joblib\parallel.py", line 866, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\users\chris\appdata\local\programs\python\python39\lib\site-packages\joblib\parallel.py", line 784, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "c:\users\chris\appdata\

<h3>Business only feature selected dataset</h3>

In [None]:
# Train on the '_sample' training files and cross-validate with 5 folds.
is_sample = True
num_folds = 5

# Hyperparameter grid for the pipeline step named 'clf' (hence the 'clf__' prefix).
# 4 * 2 * 21 * 23 * 2 * 1 * 11 = 85,008 candidate combinations per search.
# NOTE(review): the 'mse'/'mae' criterion names are version-dependent in
# scikit-learn -- confirm against the installed release before upgrading.
args = {
    'clf__criterion': ('mse', 'friedman_mse', 'mae', 'poisson'),
    'clf__splitter': ('best', 'random'),
    'clf__min_samples_split': (2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                               55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__min_samples_leaf': (1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                              55, 60, 65, 70, 75, 80, 85, 90, 95, 100),
    'clf__max_features': ('sqrt', 'log2'),
    # Single fixed seed so every candidate is fitted with the same random state.
    'clf__random_state': [7],
    'clf__min_impurity_decrease': (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
}

In [None]:
# Grid-search on the dataset chosen by sel_methods[4]. Note that
# bus_train_score receives grid.best_score_, i.e. the mean cross-validated
# R^2, while the remaining three values are test-set metrics.
(
    bus_model,
    bus_train_score,
    bus_mse,
    bus_mae,
    bus_r2,
) = get_best_model(sel_methods[4], args, num_folds, is_sample)

In [None]:
#  Optional grid entry for cost-complexity pruning (not included in the grids above):
#  'clf__ccp_alpha':(0,0.01,0.02,0.05,0.1,0.2,0.3,0.4,0.5),