#### Training Sample: train.csv
#### Evaluation Sample: validation_under.csv
#### Method: Random forest (OOB score enabled), tuned by 5-fold cross-validated grid search
#### Output: Best hyperparameters; PR curve; ROC AUC

# Training Part 

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,classification_report, matthews_corrcoef, accuracy_score, average_precision_score, roc_auc_score

#### Input data is read and named as follows

In [None]:
# Load the training sample and separate the 'Class' target column from the
# remaining feature columns.
transactions = pd.read_csv('train.csv')
y_train = transactions['Class']
X_train = transactions.drop(columns='Class')

#### Tuning parameters

In [None]:
# Switch between the full hyperparameter search (test == 0) and a quick
# single-candidate run used to smoke-test the notebook (any other value).
test = 0
rf = RandomForestClassifier(n_jobs=-1, random_state=1)

if test == 0:
    n_estimators = [75, 150, 800, 1000, 1200]
    min_samples_split = [2, 5]
    min_samples_leaf = [1, 5]
else:
    n_estimators = [70]
    min_samples_split = [2]
    min_samples_leaf = [1]

# Candidate values explored by the grid search. oob_score is pinned to True so
# every fitted forest also records its out-of-bag score.
param_grid_rf = {
    'n_estimators': n_estimators,
    'min_samples_split': min_samples_split,
    # Must be the leaf-size candidates: reusing min_samples_split here would
    # leave the min_samples_leaf list above unused and search the wrong values.
    'min_samples_leaf': min_samples_leaf,
    'oob_score': [True],
}

In [None]:
# Exhaustive search over param_grid_rf with 5-fold cross-validation. No
# `scoring` is given, so candidates are ranked by the estimator's own score.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    verbose=1,
    return_train_score=False,
)


In [None]:
# Run the cross-validated grid search on the training sample. This fits one
# forest per parameter combination and fold, so it can take a long time with
# the full grid.
grid_rf.fit(X_train, y_train)

#### The best score and the best parameters

In [None]:
# Mean cross-validated score of the best parameter combination found by the
# search (displayed as the cell output).
grid_rf.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
# (displayed as the cell output).
grid_rf.best_params_

# Evaluation Part 

In [None]:
# Load the evaluation sample and separate the 'Class' target column from the
# remaining feature columns.
evaluation = pd.read_csv('validation_under.csv')
y_eval = evaluation['Class']
X_eval = evaluation.drop(columns='Class')

In [None]:
def Random_Forest_eval(estimator, X_test, y_test):
    """Print evaluation metrics for a fitted classifier on a held-out sample.

    The classification report is always printed. When ``y_test`` contains at
    most two distinct classes, the average precision (AUPRC) and the area
    under the ROC curve (AUROC) are printed as well.

    Parameters
    ----------
    estimator
        Fitted estimator exposing ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test
        Features of the evaluation sample.
    y_test
        True labels; must provide ``nunique()`` (e.g. a pandas Series).

    Returns
    -------
    None
        Results are written to standard output only.
    """
    y_pred = estimator.predict(X_test)

    print('Classification Report')
    print(classification_report(y_test, y_pred))
    if y_test.nunique() <= 2:
        try:
            # Column 1 holds the score of the second (positive) class.
            y_score = estimator.predict_proba(X_test)[:, 1]
        except AttributeError:
            # Only fall back when the estimator has no predict_proba; any
            # other failure is a real error and must not be hidden.
            y_score = estimator.decision_function(X_test)
        print('AUPRC', average_precision_score(y_test, y_score))
        print('AUROC', roc_auc_score(y_test, y_score))
    

In [None]:
# Report the metrics of the fitted grid-search object on the evaluation sample.
Random_Forest_eval(grid_rf, X_eval, y_eval)

### Precision Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve

# Score each evaluation row with column 1 of predict_proba, then compute the
# precision/recall pairs over all score thresholds.
proba_eval_pr = grid_rf.predict_proba(X_eval)[:, 1]
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_eval, proba_eval_pr)

In [None]:
import matplotlib.pyplot as plt

# Index of the threshold closest to the default 0.5 decision cutoff. The
# distance must be |threshold - 0.5|; scaling by 0.5 would instead select the
# threshold closest to zero.
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))

# Precision is drawn on the x-axis and recall on the y-axis.
plt.plot(precision_rf, recall_rf, label="rf")
# Mark the operating point that corresponds to the 0.5 cutoff.
plt.plot(precision_rf[close_default_rf], recall_rf[close_default_rf], '^', c='k',
         label="threashold 0.5 rf")
plt.xlabel("precision")
plt.ylabel("recall")
plt.legend(loc="best")
plt.title("PR_Curve")

### Area Under the Receiver Operating Characteristic Curve

In [None]:
from sklearn.metrics import roc_curve

# Score each evaluation row with column 1 of predict_proba, then compute the
# false/true positive rates over all score thresholds.
proba_eval_roc = grid_rf.predict_proba(X_eval)[:, 1]
fpr_rf, tpr_rf, th_rf = roc_curve(y_eval, proba_eval_roc)

In [None]:
# Index of the threshold closest to the default 0.5 decision cutoff. The
# distance must be |threshold - 0.5|; scaling by 0.5 would instead select the
# threshold closest to zero.
close_default_rf = np.argmin(np.abs(th_rf - 0.5))

plt.plot(fpr_rf, tpr_rf, label="rf")
# Mark the operating point that corresponds to the 0.5 cutoff.
plt.plot(fpr_rf[close_default_rf], tpr_rf[close_default_rf], '^', c='k',
         label="threashold 0.5 rf")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="best")
plt.title("ROC")