### Import data

In [2]:
import pandas as pd

features = pd.read_csv('databases/rf_Database_2_training.csv')
features.head(5)

Unnamed: 0,outperf vs index 1m,outperf vs index 3m,ST momentum,LT momentum,Index vs max,Index st momentum,ebit vs peak,net income vs peak,market cap vs peak,Margin vs peak,PE,xEbit,xSales,PE vs peak,xEbit vs peak,xSales vs peak,ND/market cap,outperf next month
0,0.002703,0.142054,-0.024754,-0.026957,-0.2,0.017742,0.0,0.0,-0.046248,0.0,8.164666,4.25831,0.201669,-0.222973,-0.241666,-0.077194,-0.420316,0
1,-0.034843,-0.099532,0.428953,0.278566,-0.2,0.017742,0.0,0.0,-0.035664,0.0,3.338716,3.232125,0.22863,-0.560311,-0.33966,-0.170085,-0.291897,1
2,-0.005101,-0.262568,-0.014076,-0.078449,-0.2,0.017742,-1.518357,-1.629189,-0.228261,-1.569135,-19.416503,-15.838593,0.439608,-2.239239,-2.485813,-0.2823,-0.248044,0
3,0.009696,-0.052876,0.075785,0.161043,0.0,0.017742,0.0,0.0,-0.027157,0.0,20.273615,13.099569,2.075199,-0.087932,-0.099589,-0.083936,-0.146034,0
4,0.011575,-0.134856,0.118982,0.216946,-0.2,0.017742,0.0,0.0,-0.09311,0.0,17.850217,10.38939,2.263279,-0.102874,-0.268257,-0.22304,-0.021275,1


### process database

In [3]:
#Process database
import numpy as np

#save labels
labels = np.array(features['outperf next month'])
#remove labels
features=features.drop('outperf next month', axis = 1)
#save features names
feature_list = list(features.columns)
#convert to numpy array
features = np.array(features)

### Split database in training and cross validation sets

In [29]:
from sklearn.model_selection import train_test_split
#stratify = labels allows for consistant class distribution between sets
X_train, X_cv, y_train, y_cv = train_test_split(features, labels, stratify = labels)

(2267, 17)



### Show distribution


In [18]:
train_zeros = (y_train == 0).sum()
train_ones = (y_train == 1).sum()
train_pct = train_ones / (train_zeros + train_ones)
print('Percentage of ones in train set is', train_pct )

cv_zeros = (y_cv == 0).sum()
cv_ones = (y_cv == 1).sum()
cv_pct = cv_ones / (cv_zeros + cv_ones)
print('Percentage of ones in cv set is', cv_pct )


Percentage of ones in train set is 0.36082928981032203
Percentage of ones in cv set is 0.3611111111111111


### Setup Grid search

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

import matplotlib.pyplot as plt
plt.style.use("ggplot")


clf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'min_samples_split': [2, 5, 10, 20, 40], 
    'n_estimators' : [50, 100, 200, 400],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 17]
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}



def grid_search_wrapper(refit_score='recall_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(X_cv)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the cv data.
    print('\nConfusion matrix of Random Forest optimized for {} on the cv data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_cv, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search



### Run grid search

In [39]:
grid_search_clf = grid_search_wrapper(refit_score='recall_score')


Best params for recall_score
{'max_depth': 25, 'max_features': 17, 'min_samples_split': 5, 'n_estimators': 50}

Confusion matrix of Random Forest optimized for recall_score on the cv data:
     pred_neg  pred_pos
neg       392        91
pos       205        68


### Display results

In [40]:
results = pd.DataFrame(grid_search_clf.cv_results_)
results.to_csv('tuning reports/rf_tuning.csv')
results = results.sort_values(by='mean_test_recall_score', ascending=False)
results[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_accuracy_score', 'param_max_depth', 'param_max_features', 'param_min_samples_split', 'param_n_estimators']].round(3).head()


Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators
304,0.377,0.237,0.581,25,17,5,50
240,0.373,0.23,0.582,25,3,2,50
281,0.381,0.23,0.586,25,10,2,100
284,0.373,0.229,0.582,25,10,5,50
260,0.371,0.227,0.58,25,5,2,50
