### Import data

In [1]:
import pandas as pd

features = pd.read_csv('databases/rf_Database_2_3m.csv')
features.head(5)

Unnamed: 0,outperf vs index 1m,outperf vs index 3m,ST momentum,LT momentum,Index vs max,Index st momentum,ebit vs peak,net income vs peak,market cap vs peak,Margin vs peak,PE,xEbit,xSales,PE vs peak,xEbit vs peak,xSales vs peak,ND/market cap,outperf next 3 months
0,0.007191,-0.083593,0.076685,-0.472196,-0.2,0.017742,0.0,0.0,-0.042669,-0.016447,11.096157,8.764507,0.390003,-0.195909,-0.040863,-0.047339,0.046168,0
1,-0.017525,0.071289,0.00915,-0.346791,-0.2,0.017742,-0.266321,-0.496339,-0.051437,-0.247309,17.685318,8.772884,0.592203,-0.051437,-0.043755,-0.218866,0.185074,0
2,-0.002132,0.001588,0.024445,0.272849,-0.2,0.017742,0.0,0.0,-0.035756,0.0,19.453475,17.37721,0.837097,-0.235442,-0.16842,-0.027351,0.318709,0
3,0.044802,0.041664,0.2048,0.556955,-0.2,0.017742,0.0,0.0,0.0,-0.045119,12.874137,10.429705,0.975562,0.0,-0.008045,-0.052801,0.498034,0
4,0.001149,0.041071,0.026255,0.251466,-0.2,0.017742,0.0,0.0,-0.032763,0.0,8.221638,4.630687,0.691503,-0.066863,-0.132443,-0.060985,0.141099,0


### process database

In [2]:
#Process database
import numpy as np

#save labels
labels = np.array(features['outperf next  3 months'])
#remove labels
features=features.drop('outperf next  3 months', axis = 1)
#save features names
feature_list = list(features.columns)
#convert to numpy array
features = np.array(features)

### Split database in training and cross validation sets

In [3]:
from sklearn.model_selection import train_test_split
#stratify = labels allows for consistant class distribution between sets
X_train, X_cv, y_train, y_cv = train_test_split(features, labels, stratify = labels)


### Show distribution


In [4]:
train_zeros = (y_train == 0).sum()
train_ones = (y_train == 1).sum()
train_pct = train_ones / (train_zeros + train_ones)
print('Percentage of ones in train set is', train_pct )

cv_zeros = (y_cv == 0).sum()
cv_ones = (y_cv == 1).sum()
cv_pct = cv_ones / (cv_zeros + cv_ones)
print('Percentage of ones in cv set is', cv_pct )


Percentage of ones in train set is 0.418554476806904
Percentage of ones in cv set is 0.4191810344827586


### Setup Grid search

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

import matplotlib.pyplot as plt
plt.style.use("ggplot")


clf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'min_samples_split': [2, 5, 10, 20, 40], 
    'n_estimators' : [50, 100, 200, 400],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 17]
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}



def grid_search_wrapper(refit_score='accuracy_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(X_cv)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the cv data.
    print('\nConfusion matrix of Random Forest optimized for {} on the cv data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_cv, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search



### Run grid search

In [8]:
grid_search_clf = grid_search_wrapper(refit_score='accuracy_score')


Best params for accuracy_score
{'max_depth': 25, 'max_features': 17, 'min_samples_split': 5, 'n_estimators': 400}

Confusion matrix of Random Forest optimized for accuracy_score on the cv data:
     pred_neg  pred_pos
neg       431       108
pos       152       237


### Display results

In [9]:
results = pd.DataFrame(grid_search_clf.cv_results_)
results.to_csv('tuning reports/rf_tuning.csv')
results = results.sort_values(by='mean_test_recall_score', ascending=False)
results[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_accuracy_score', 'param_max_depth', 'param_max_features', 'param_min_samples_split', 'param_n_estimators']].round(3).head()


Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators
302,0.672,0.587,0.707,25,17,2,200
284,0.669,0.586,0.705,25,10,5,50
303,0.672,0.583,0.706,25,17,2,400
307,0.68,0.583,0.711,25,17,5,400
306,0.669,0.579,0.704,25,17,5,200
