# Training Part 

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,classification_report, matthews_corrcoef, accuracy_score, average_precision_score, roc_auc_score

#### The input data is read and split into features (`X_train`) and target (`y_train`) as follows

In [5]:
# Load the labelled training set, then separate the target column from the features.
transactions = pd.read_csv('../Data/train.csv')
y_train = transactions['Class']
X_train = transactions.drop(columns='Class')

#### No need to rescale the data for tree-based classification.
#### Instead of searching over a grid of values, we fix the number of estimators; GridSearchCV is used only to obtain the cross-validated score.

In [6]:
# Scorer wrapping the Matthews correlation coefficient, used for model selection.
MCC_scorer = make_scorer(score_func=matthews_corrcoef)
# Number of cross-validation folds.
num_folds = 5

In [7]:
# Single-step pipeline: a random forest that uses all cores and a fixed seed.
pipeline_rf = Pipeline(steps=[
    ('model', RandomForestClassifier(random_state=1, n_jobs=-1)),
])

# One candidate value only; the key addresses the 'model' step's n_estimators.
param_grid_rf = dict(model__n_estimators=[75])

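###### Scoring uses the Matthews correlation coefficient (also called the Phi coefficient).
###### Its value lies in [-1, 1], where 1 is a perfect prediction, 0 is no better than random, and -1 is a fully inverted prediction.
###### pre_dispatch: '2*n_jobs'
###### 5-fold cross-validation is used.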

In [8]:
# Cross-validated search scored with the Matthews correlation coefficient.
# The fold count is taken from `num_folds` (set in the configuration cell
# above) instead of repeating the literal, so the two cannot drift apart.
grid_rf = GridSearchCV(estimator=pipeline_rf, param_grid=param_grid_rf, scoring=MCC_scorer,
                       n_jobs=-1, pre_dispatch='2*n_jobs', cv=num_folds, verbose=1,
                       return_train_score=False)

In [9]:
# Run the cross-validated search on the training features and labels.
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('model',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                                               min_samples_leaf=1,
                                                               min_samples_split=2,
                    

#### The best score and the best number of estimators, which was tested to be 75

In [10]:
# Best cross-validation score found by the search (scored with MCC_scorer).
grid_rf.best_score_

0.8450458320744916

In [11]:
# Parameter setting that achieved the best score.
grid_rf.best_params_

{'model__n_estimators': 75}

# Evaluation Part 

In [12]:
# Read the held-out validation set and separate the target column from the features.
evaluation = pd.read_csv('../Data/validation.csv')
y_eval = evaluation['Class']
X_eval = evaluation.drop(columns='Class')

In [13]:
def Random_Forest_eval(estimator, X_test, y_test):
    """Print evaluation metrics for a fitted classifier on a test set.

    Always prints a classification report. When ``y_test`` holds at most two
    distinct labels, also prints the area under the precision-recall curve
    (AUPRC) and the area under the ROC curve (AUROC).

    Parameters
    ----------
    estimator
        Fitted object exposing ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test
        Feature matrix passed unchanged to the estimator.
    y_test
        True labels; must provide ``nunique()`` (e.g. a pandas Series).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    y_pred = estimator.predict(X_test)

    print('CLASSIFICATION REPORT')
    print(classification_report(y_test, y_pred))

    if y_test.nunique() <= 2:  # Additional metrics for binary classification
        try:
            # Column 1 holds the score for the second (positive) class.
            y_score = estimator.predict_proba(X_test)[:, 1]
        except AttributeError:
            # Only fall back when the estimator has no probability output.
            # Any other failure (bad input, interrupt, ...) must surface
            # instead of being silently replaced by decision_function scores.
            y_score = estimator.decision_function(X_test)
        print('AUPRC', average_precision_score(y_test, y_score))
        print('AUROC', roc_auc_score(y_test, y_score))
    

In [14]:
# Report the fitted search object's performance on the validation set.
Random_Forest_eval(grid_rf, X_eval, y_eval)

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     99511
           1       0.94      0.78      0.85       173

    accuracy                           1.00     99684
   macro avg       0.97      0.89      0.93     99684
weighted avg       1.00      1.00      1.00     99684

AUPRC 0.8245235462105749
AUROC 0.9322802376453226
