# Training Part 

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,classification_report, matthews_corrcoef, accuracy_score, average_precision_score, roc_auc_score

#### Input data is read and named as the following

In [2]:
transactions = pd.read_csv('../Data/train.csv')
X_train = transactions.drop(labels='Class', axis=1)
y_train = transactions.loc[:,'Class']

In [3]:
num_folds = 5
MCC_scorer = make_scorer(matthews_corrcoef)

#### Tuning parameters

In [4]:
rf = RandomForestClassifier(n_jobs=-1, random_state=1)

n_estimators = [50, 75, 500]  #default = 50;
# ,50, 60, 90, 105, 120, 500, 1000
min_samples_split = [2, 5] # default=2
# , 5, 10, 15, 100
min_samples_leaf = [1, 5]  # default = 1
# , 2, 5, 8

param_grid_rf = {'n_estimators': n_estimators,
                 'min_samples_split': min_samples_split,
                 'min_samples_leaf': min_samples_split,
                }

In [5]:
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf,cv=num_folds,scoring = MCC_scorer, 
                       n_jobs=-1, pre_dispatch='2*n_jobs', verbose=1, return_train_score=False)


In [6]:
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 50.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 71.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False, random_state=1,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'min_s

#### The best score and the estimator

In [7]:
grid_rf.best_score_

0.8450074502558784

In [8]:
grid_rf.best_params_

{'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 75}

# Evaluation Part 

In [10]:
evaluation = pd.read_csv('../Data/validation.csv')
X_eval = evaluation.drop(labels='Class', axis=1)
y_eval = evaluation.loc[:,'Class']

In [11]:
def Random_Forest_eval(estimator, X_test, y_test):
    
    y_pred = estimator.predict(X_test)

    print('Classification Report')
    print(classification_report(y_test, y_pred))
    if y_test.nunique() <= 2:
        try:
            y_score = estimator.predict_proba(X_test)[:,1]
        except:
            y_score = estimator.decision_function(X_test)
        print('AUPRC', average_precision_score(y_test, y_score))
        print('AUROC', roc_auc_score(y_test, y_score))
    

In [12]:
Random_Forest_eval(grid_rf, X_eval, y_eval)

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     99511
           1       0.93      0.77      0.85       173

    accuracy                           1.00     99684
   macro avg       0.97      0.89      0.92     99684
weighted avg       1.00      1.00      1.00     99684

AUPRC 0.8229572995162316
AUROC 0.9370532307608482
