In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

In [26]:
# Regression targets for the train and test splits.
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# File-name suffixes identifying each feature-selection variant; they are
# iterated over further down to pick the matching x_train*/x_test* CSVs.
# The context manager closes the handle as soon as the array has been read
# (the original left it open for the lifetime of the kernel).
# NOTE(review): assumes "Selection Methods" is a plain .npy array, which
# np.load reads fully into memory -- confirm it is not an .npz archive.
with open("Selection Methods", "rb") as file:
    sel_methods = np.load(file)

In [27]:
def random_forest(method):
    """Fit a random forest on one feature-selection variant and print its metrics.

    Reads ``'x_train' + method + '.csv'`` and ``'x_test' + method + '.csv'``,
    fits a 500-tree ``RandomForestRegressor`` against the notebook-level
    ``y_train``, and prints mean squared error, mean absolute error and R^2
    for the train split and for the test split (scored against the
    notebook-level ``y_test``).

    Parameters
    ----------
    method : str
        Suffix inserted into the CSV file names. The empty string is reported
        as "no feature selection" in the printed header.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    method_string = "no feature selection" if method == '' else method

    x_train = pd.read_csv('x_train' + method + '.csv').values
    x_test = pd.read_csv('x_test' + method + '.csv').values

    # The split criterion is left at its default (squared error). The explicit
    # criterion='mse' spelling was deprecated in scikit-learn 1.0 and removed
    # in 1.2, so passing it breaks on current versions; the default yields the
    # same model on old and new releases alike.
    forest = RandomForestRegressor(n_estimators=500, random_state=13, n_jobs=1)
    forest.fit(x_train, y_train.values.ravel())

    y_train_prediction = forest.predict(x_train)
    y_test_prediction = forest.predict(x_test)

    print("\n\nResults for Random Forest Regressor on",method_string,"data")
    # Same three metrics, each reported for train then test.
    report = (
        ("\nMean Squared error", mean_squared_error),
        ("\nMean Absolute error", mean_absolute_error),
        ("\nR^2", r2_score),
    )
    for heading, metric in report:
        print(heading)
        print("Train:",metric(y_train,y_train_prediction))
        print("Test:",metric(y_test,y_test_prediction))

In [28]:
# The full data set appears to generate better results than any of the feature-selected sets

In [29]:
# Fit the random forest and print its metrics once per entry in sel_methods.
for method in sel_methods:
    random_forest(method)



Results for Random Forest Regressor on no feature selection data

Mean Squared error
Train: 0.670349552873071
Test: 4.687199571243687

Mean Absolute error
Train: 0.5104909474622876
Test: 1.370880189915308

R^2
Train: 0.9657369893567976
Test: 0.7595351607951005


Results for Random Forest Regressor on _f_regression data

Mean Squared error
Train: 0.8148255366428317
Test: 4.983933916019172

Mean Absolute error
Train: 0.5980170453336584
Test: 1.4310714436314256

R^2
Train: 0.9583525103959714
Test: 0.7443119607972231


Results for Random Forest Regressor on _chi2 data

Mean Squared error
Train: 0.7177195243833939
Test: 5.0480450027701105

Mean Absolute error
Train: 0.5382471357184387
Test: 1.3877181358940358

R^2
Train: 0.9633158080028753
Test: 0.7410229047345376


Results for Random Forest Regressor on _mutual_info data

Mean Squared error
Train: 0.7182549251866022
Test: 5.064574375511041

Mean Absolute error
Train: 0.5389378955809506
Test: 1.3890684843521097

R^2
Train: 0.9632884425137

In [30]:
# Validate with k-fold cross-validation (k=10)

In [31]:
def crossfold_validation(method):
    """Print 10-fold cross-validation scores for one feature-selection variant.

    Reads ``'x_train' + method + '.csv'`` and cross-validates a 500-tree
    ``RandomForestRegressor`` (wrapped in a single-step pipeline) against the
    notebook-level ``y_train`` with ``cv=10``. Only the training split is
    used; the test CSV is not needed and is no longer read.

    Parameters
    ----------
    method : str
        Suffix inserted into the CSV file name. The empty string is reported
        as "no feature selection" in the printed header.

    Returns
    -------
    None
        The array of per-fold scores is printed, not returned.
    """
    method_string = "no feature selection" if method == '' else method

    x_train = pd.read_csv('x_train' + method + '.csv').values
    targets = y_train.values.ravel()

    # criterion is left at its default (squared error): the 'mse' spelling was
    # deprecated in scikit-learn 1.0 and removed in 1.2.
    pipe = make_pipeline(RandomForestRegressor(n_estimators=500, random_state=13, n_jobs=1))

    # No pipe.fit() beforehand: cross_val_score clones and refits the
    # estimator for every fold, so a prior full fit only costs time and does
    # not influence the scores.
    scores = cross_val_score(estimator = pipe, X = x_train, y = targets, cv = 10)
    print('\nCrossfold validation scores for random forest using',method_string,'data:\n',scores)

In [32]:
# Run the 10-fold cross-validation once per entry in sel_methods.
for method in sel_methods:
    crossfold_validation(method)


Crossfold validation scores for random forest using no feature selection data:
 [0.76596307 0.74086619 0.74361372 0.75857373 0.75145467 0.7385967
 0.71617983 0.74483093 0.75417892 0.73856684]

Crossfold validation scores for random forest using _f_regression data:
 [0.74688858 0.73014377 0.7275608  0.74403924 0.73235992 0.73128291
 0.70124738 0.72891171 0.74440241 0.72475993]

Crossfold validation scores for random forest using _chi2 data:
 [0.74825182 0.74166207 0.72821358 0.74606755 0.73805507 0.73347264
 0.70732424 0.73479055 0.74650456 0.7376267 ]

Crossfold validation scores for random forest using _mutual_info data:
 [0.75231631 0.74709773 0.72972416 0.74502389 0.7386253  0.73271906
 0.70768336 0.73417898 0.73655907 0.73735986]

Crossfold validation scores for random forest using _equal_crime_and_business data:
 [0.74238381 0.74659778 0.71142167 0.7343563  0.73533209 0.72491051
 0.7033421  0.72407431 0.72245512 0.72652156]

Crossfold validation scores for random forest using _al