In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

In [17]:
# Regression targets for the train and test splits, shared by every cell below.
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# Load the collection of feature-selection suffixes that the loops below iterate over.
# The context manager closes the handle as soon as the data has been read, instead of
# leaving it open for the lifetime of the kernel.
# NOTE(review): assumes "Selection Methods" is a plain .npy array (loaded eagerly);
# a lazily-read .npz archive would need the handle kept open -- confirm.
with open("Selection Methods", "rb") as file:
    sel_methods = np.load(file)

In [18]:
def ada_boost(method):
    """Fit an AdaBoost regressor on one feature-selection variant and print its metrics.

    Reads ``x_train<method>.csv`` and ``x_test<method>.csv`` from the working
    directory, fits an ``AdaBoostRegressor`` (decision-tree base learner,
    500 estimators, ``random_state=13``) against the notebook-level ``y_train``,
    and prints mean squared error, mean absolute error and R^2 for both the
    train and test splits (test metrics use the notebook-level ``y_test``).

    Parameters
    ----------
    method : str
        Suffix inserted into the feature file names. The empty string selects
        ``x_train.csv`` / ``x_test.csv`` and is reported as
        "no feature selection".

    Returns
    -------
    None
        Results are printed, not returned.
    """
    method_string = method
    if (method == ''):
        method_string = "no feature selection"
    x_train = pd.read_csv('x_train' + method + '.csv').values
    x_test = pd.read_csv('x_test' + method + '.csv').values
    adaBoost = AdaBoostRegressor(
        DecisionTreeRegressor(),
        n_estimators=500,
        random_state=13)
    # ravel() flattens the single-column target frame into the 1-D vector fit() expects.
    adaBoost.fit(x_train, y_train.values.ravel())
    y_train_prediction = adaBoost.predict(x_train)
    y_test_prediction = adaBoost.predict(x_test)
    # Label the report with the model that is actually fitted here (AdaBoost).
    print("\n\nResults for AdaBoost Regressor on",method_string,"data")
    print("\nMean Squared error")
    print("Train:",mean_squared_error(y_train,y_train_prediction))
    print("Test:",mean_squared_error(y_test,y_test_prediction))
    print("\nMean Absolute error")
    print("Train:",mean_absolute_error(y_train,y_train_prediction))
    print("Test:",mean_absolute_error(y_test,y_test_prediction))
    print("\nR^2")
    print("Train:",r2_score(y_train,y_train_prediction))
    print("Test:",r2_score(y_test,y_test_prediction))

In [19]:
# AdaBoost performs worse than the Random Forest regressor, but again does best without feature selection.

In [20]:
# Fit and report an AdaBoost model once for each entry in sel_methods.
for method in sel_methods:
    ada_boost(method)



Results for Random Forest Regressor on no feature selection data

Mean Squared error
Train: 0.04885030065594285
Test: 4.829698790384481

Mean Absolute error
Train: 0.04436900367893235
Test: 1.3390484587929388

R^2
Train: 0.9975031558324688
Test: 0.7522246011962006


Results for Random Forest Regressor on _f_regression data

Mean Squared error
Train: 6.732291870471436
Test: 10.499761208734137

Mean Absolute error
Train: 2.029319623213205
Test: 2.6420795537332493

R^2
Train: 0.6558980504686271
Test: 0.4613364862383773


Results for Random Forest Regressor on _chi2 data

Mean Squared error
Train: 1.3490908718603294
Test: 6.4879449288919515

Mean Absolute error
Train: 0.7516810147456674
Test: 1.8414800235945201

R^2
Train: 0.9310450574583287
Test: 0.6671525053749123


Results for Random Forest Regressor on _mutual_info data

Mean Squared error
Train: 1.6241140595804355
Test: 5.430080715602524

Mean Absolute error
Train: 0.9223060147417261
Test: 1.5532288926069562

R^2
Train: 0.9169880294

In [21]:
def crossfold_validation(method):
    """Print 10-fold cross-validation scores for AdaBoost on one feature variant.

    Reads ``x_train<method>.csv`` from the working directory and scores an
    ``AdaBoostRegressor`` (decision-tree base learner, 500 estimators,
    ``random_state=13``) against the notebook-level ``y_train`` using
    ``cross_val_score`` with ``cv=10``. No ``scoring`` argument is passed, so
    the estimator's default score is reported.

    Parameters
    ----------
    method : str
        Suffix inserted into the feature file name. The empty string selects
        ``x_train.csv`` and is reported as "no feature selection".

    Returns
    -------
    None
        The array of per-fold scores is printed, not returned.
    """
    method_string = method
    if (method == ''):
        method_string = "no feature selection"
    # Only the training split is needed: cross-validation never touches the test set.
    x_train = pd.read_csv('x_train' + method + '.csv').values
    pipe = make_pipeline(AdaBoostRegressor(DecisionTreeRegressor(),
                                           n_estimators=500,
                                           random_state=13))
    # cross_val_score clones the estimator and fits the clone on every fold, so
    # fitting `pipe` on the full training set beforehand would only burn time.
    scores = cross_val_score(estimator = pipe, X = x_train, y = y_train.values.ravel(), cv = 10)
    print('\nCrossfold validation scores for adaboost using',method_string,'data:\n',scores)

In [22]:
# Run 10-fold cross-validation once for each entry in sel_methods.
for method in sel_methods:
    crossfold_validation(method)


Crossfold validation scores for adaboost using no feature selection data:
 [0.75868326 0.75280385 0.73665439 0.75179852 0.73953363 0.7336269
 0.71328283 0.73988349 0.74180575 0.73185222]

Crossfold validation scores for adaboost using _f_regression data:
 [0.45990836 0.40551138 0.61362183 0.48663225 0.47630154 0.53170096
 0.42258429 0.43192838 0.52793785 0.34308824]

Crossfold validation scores for adaboost using _chi2 data:
 [0.70278466 0.71016459 0.67573405 0.71140753 0.69636485 0.68554142
 0.66262191 0.67820139 0.69318952 0.68798441]

Crossfold validation scores for adaboost using _mutual_info data:
 [0.72021073 0.7062409  0.67587381 0.70626968 0.70451284 0.69306877
 0.66044799 0.68944198 0.69547119 0.69742664]

Crossfold validation scores for adaboost using _equal_crime_and_business data:
 [0.64960854 0.63902124 0.65763407 0.63942307 0.6996477  0.66521315
 0.63531116 0.62644559 0.65802243 0.64197403]

Crossfold validation scores for adaboost using _all_business data:
 [ 4.38458361