# Model training



In [1]:
import preprocessing
import pandas as pd

# import the data and preprocess
train_data = pd.read_csv('creditcard_train.csv')
test_data = pd.read_csv('creditcard_test.csv')

# preprocessing as per outlined in the EDA section
X, y= preprocessing.preproc(train_data)
X_test, y_test = preprocessing.preproc(test_data)

### Basic logistic regression
We start with a simple logistic regression model as a baseline. <br> 
We use logistic regression hyperparameter C=100 (controlling the power of L2 regularisation)
We will try different sampling methods, without sampling, BorderlineSMOTE, random undersampling, BorderlineSMOTE with random undersampling. <br>

#### Without resampling

In [2]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, classification_report
from metrics import pr_auc

In [33]:

def lr_model(X_train, y_train, X_val, y_val, X_test, y_test):
    model = LogisticRegression(max_iter=1e4, C=1e2)
    model.fit(X_train, y_train)
    y_train_pre = model.predict(X_train)
    y_val_pre = model.predict(X_val)
    y_test_pre = model.predict(X_test)

    print(classification_report(y_train, y_train_pre))
    print('---------------------------------------------------------------------')
    print(classification_report(y_val, y_val_pre))
    print('---------------------------------------------------------------------')
    print(classification_report(y_test, y_test_pre))
    print('---------------------------------------------------------------------')

    print('Train pr_auc score', pr_auc(y_train, y_train_pre))
    print('Validation pr_auc, score', pr_auc(y_val, y_val_pre))
    print('Test pr_auc score', pr_auc(y_test, y_test_pre))
    

In [15]:

# split the training into training and validation sets
# using stratified method, meaning the distributions of target classes are the same across train and validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42)

lr_model(X_train, y_train, X_val, y_val, X_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    159200
           1       0.92      0.73      0.82       291

    accuracy                           1.00    159491
   macro avg       0.96      0.87      0.91    159491
weighted avg       1.00      1.00      1.00    159491

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39800
           1       0.93      0.71      0.81        73

    accuracy                           1.00     39873
   macro avg       0.96      0.86      0.90     39873
weighted avg       1.00      1.00      1.00     39873

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.87      0.68      0.76       128

    accuracy                           1.0

The model is performing OK. Train and validation score is very similar. Recall score is less good as recision.

#### With SMOTE and BorderlineSMOTE

In [34]:
# with SMOTE
from imblearn.over_sampling import SMOTE
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42)

# over sample the minority
smote = SMOTE(random_state=42, sampling_strategy=0.2)
X_train, y_train = smote.fit_resample(X_train, y_train)

lr_model(X_train, y_train, X_val, y_val, X_test, y_test)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    159200
           1       0.97      0.94      0.96     31840

    accuracy                           0.99    191040
   macro avg       0.98      0.97      0.97    191040
weighted avg       0.99      0.99      0.99    191040

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     39800
           1       0.22      0.85      0.35        73

    accuracy                           0.99     39873
   macro avg       0.61      0.92      0.67     39873
weighted avg       1.00      0.99      1.00     39873

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     85315
           1       0.17      0.83      0.28       128

    accuracy                           0.9

In [11]:
# with borderlineSMOTE
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42)

# over sample the minority
smote = BorderlineSMOTE(random_state=42, sampling_strategy=0.2)
X_train, y_train = smote.fit_resample(X_train, y_train)

lr_model(X_train, y_train, X_val, y_val, X_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    159200
           1       0.99      0.99      0.99     31840

    accuracy                           1.00    191040
   macro avg       1.00      1.00      1.00    191040
weighted avg       1.00      1.00      1.00    191040

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39800
           1       0.44      0.73      0.55        73

    accuracy                           1.00     39873
   macro avg       0.72      0.86      0.77     39873
weighted avg       1.00      1.00      1.00     39873

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.39      0.71      0.50       128

    accuracy                           1.0

As you can see the validation and test score got significantly worse. The reason could be that the artificially generated minority samples aren't helping the model to be more generalised, and caused precision to dip.

#### Undersample

In [13]:
from imblearn.under_sampling import RandomUnderSampler
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42)

# over sample the minority
under = RandomUnderSampler(random_state=42, sampling_strategy=0.2)
X_train, y_train = under.fit_resample(X_train, y_train)

lr_model(X_train, y_train, X_val, y_val, X_test, y_test)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1455
           1       0.98      0.90      0.94       291

    accuracy                           0.98      1746
   macro avg       0.98      0.95      0.96      1746
weighted avg       0.98      0.98      0.98      1746

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     39800
           1       0.19      0.89      0.31        73

    accuracy                           0.99     39873
   macro avg       0.59      0.94      0.65     39873
weighted avg       1.00      0.99      1.00     39873

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     85315
           1       0.17      0.85      0.28       128

    accuracy                           0.9

Precision calapsed under random under sampling, while recall is doing very well. <br> Overall undersampling looks very bad.

#### Over and under sampling

In [16]:
from preprocessing import sampler
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42)

# over sample the minority
X_train, y_train = sampler(X_train, y_train)

lr_model(X_train, y_train, X_val, y_val, X_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     79600
           1       0.99      0.99      0.99     15920

    accuracy                           1.00     95520
   macro avg       0.99      1.00      1.00     95520
weighted avg       1.00      1.00      1.00     95520

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39800
           1       0.42      0.74      0.54        73

    accuracy                           1.00     39873
   macro avg       0.71      0.87      0.77     39873
weighted avg       1.00      1.00      1.00     39873

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.39      0.75      0.52       128

    accuracy                           1.0

It looks better than undersampling, but definitely not as good as without sampling. We will continue training the model without any resampling techniques. <br>

We will also replace train-test split with RandomizedSearchCV to even out the randomness in the validation data.

## KNN

We fit a KNN to the training data with the following hyperparameters: <br>
1. n_neighbors: 1-20
2. metric: 'euclidean', 'manhattan']<br>
We use pr_auc as scoring in random search.

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
score = make_scorer(pr_auc)

In [18]:
knn = KNeighborsClassifier()
parameter_space = {
    'n_neighbors': range(1, 21),
    'metric': ['euclidean', 'manhattan']
            }
knn_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=parameter_space,
    scoring=score,
    n_iter=10,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

knn_search.fit(X, y)
print('Best parameters are:', knn_search.best_params_)
print('Best pr_auc score is:', knn_search.best_score_)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 41.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 123.2min finished


Best parameters are: {'n_neighbors': 20, 'metric': 'euclidean'}
Best pr_auc score is: 0.8284858254052143


The result seems comparable to logistic regression. <br>
Let's test on test set. <br>

In [6]:
knn = KNeighborsClassifier(n_neighbors=20, metric='euclidean')
knn.fit(X, y)
y_pred = knn.predict(X)
y_test_pred = knn.predict(X_test)
print(classification_report(y, y_pred))
print('---------------------------------------------------------------------')
print(classification_report(y_test, y_test_pred))
print('---------------------------------------------------------------------')

print('Train pr_auc score', pr_auc(y, y_pred))
print('Test pr_auc score', pr_auc(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199000
           1       1.00      0.03      0.05       364

    accuracy                           1.00    199364
   macro avg       1.00      0.51      0.53    199364
weighted avg       1.00      1.00      1.00    199364

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       1.00      0.04      0.08       128

    accuracy                           1.00     85443
   macro avg       1.00      0.52      0.54     85443
weighted avg       1.00      1.00      1.00     85443

---------------------------------------------------------------------
Train pr_auc score 0.5146240870142879
Test pr_auc score 0.5202510280976792


The recall score is terrible! The RandomSearchCV best param is not working as it should!

### Random forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [19]:


rf = RandomForestClassifier()
parameter_space = {
                'n_estimators': [100, 500],
                'max_depth': [5, 10, 20, None],
                'min_samples_split': [2, 10],
                'min_samples_leaf': [1, 5, 10],
                'bootstrap': [True]
            }

rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=parameter_space,
    scoring=score,
    n_iter=10,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X, y)
print('Best parameters are:', rf_search.best_params_)
print('Best pr_auc score is:', rf_search.best_score_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 54.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 166.9min finished


Best parameters are: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': None, 'bootstrap': True}
Best pr_auc score is: 0.8358577151012158


There is a slight increase in the pr_auc score. Let's fit the best model on the whole training data

In [10]:

rf = RandomForestClassifier(n_estimators=100, 
                            min_samples_split=2,
                           min_samples_leaf=5,
                           max_depth=None,
                           bootstrap=True)
rf.fit(X, y)
y_pred = rf.predict(X)
y_test_pred = rf.predict(X_test)
print(classification_report(y, y_pred))
print('---------------------------------------------------------------------')
print(classification_report(y_test, y_test_pred))
print('---------------------------------------------------------------------')

print('Train pr_auc score', pr_auc(y, y_pred))
print('Test pr_auc score', pr_auc(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199000
           1       0.98      0.83      0.90       364

    accuracy                           1.00    199364
   macro avg       0.99      0.92      0.95    199364
weighted avg       1.00      1.00      1.00    199364

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.87      0.76      0.81       128

    accuracy                           1.00     85443
   macro avg       0.94      0.88      0.91     85443
weighted avg       1.00      1.00      1.00     85443

---------------------------------------------------------------------
Train pr_auc score 0.9082448945889686
Test pr_auc score 0.8160245944249699


This is an improvement from logistic regression. Recall is still less than precision by 10 percent point. <br>

### XGBoost

We will try XGBoost in attempt to improve the scores.

In [58]:
import xgboost
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42)

In [60]:
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_train)
y_val_pred = xgb.predict(X_val)

print(classification_report(y_train, y_pred))
print('---------------------------------------------------------------------')
print(classification_report(y_val, y_val_pred))
print('---------------------------------------------------------------------')

print('Train pr_auc score', pr_auc(y_train, y_pred))
print('Val pr_auc score', pr_auc(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    159200
           1       1.00      1.00      1.00       291

    accuracy                           1.00    159491
   macro avg       1.00      1.00      1.00    159491
weighted avg       1.00      1.00      1.00    159491

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39800
           1       0.95      0.84      0.89        73

    accuracy                           1.00     39873
   macro avg       0.98      0.92      0.95     39873
weighted avg       1.00      1.00      1.00     39873

---------------------------------------------------------------------
Train pr_auc score 1.0
Val pr_auc score 0.8945211969449921


In [61]:
xgboost.XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

There seems to be a big increase in precision and recall scores. Acknowledging that there is still a big gap between train and validation, we are going to tune the hyperparameters. Have researched and tried to implement on sklearn's random search cv, but failed to find ways to incorperate early stopping. So we are going to implement without early stopping <br>

We are going to look at some of these hyperparameters, as per explained in this helpful video https://www.youtube.com/watch?v=AvWfL1Us3Kg and xgboost documentation https://xgboost.readthedocs.io/en/latest/parameter.html:<br>

1. **max_depth**: maxium depth of trees, usually range(2,30), default 3
2. **subsample**: subsample ratio of training instances, usually range (0.1,1), default 1
3. **colsample_bylevel**: subsample ratio of training features at each level, usually range (0.1,1), default 1
4. **colsample_bytree**: subsample ratio of training features at each tree, usually range (0.1,1), default 1
5. **min_child_weight**: minimum sum of instance weight (hessian) needed in a child, usually range(0,inf), default 1
6. **reg_lambda**: L2 regularization term on weights, default 1, the higher the higher regularisation
7. **learning_rate**: usually 0.01-1


In [76]:
param_space = {'n_estimators': [100, 200, 500],
               'max_depth': [3, 5, 10, 20],
               'min_child_weight': [1, 5, 10],
               'subsample': [0.5, 0.8, 1],
               'colsample_bytree': [0.8, 1]
              }

In [75]:
xgb = xgboost.XGBClassifier()
xgb_search = RandomizedSearchCV(estimator=xgb,
                                param_distributions=param_space,
                                scoring=score,
                                n_iter=10,
                                cv=10,
                                verbose=2,
                                random_state=42,
                                n_jobs=-1
                               )
xgb_search.fit(X, y)
print('Best parameters are:', rf_search.best_params_)
print('Best pr_auc score is:', rf_search.best_score_)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 55.8min finished


Best parameters are: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20, 'bootstrap': True}
Best pr_auc score is: -0.00029854944962667715


**Please note** the above code was run with the wrong param space, realised too late, not enough time to change. We presume the default is the best model and run below.

In [78]:
def fit_best_model(X, y, X_test, y_test):
    xgb = XGBClassifier()
    xgb.fit(X, y)
    y_pred = xgb.predict(X)
    y_test_pred = xgb.predict(X_test)

    print(classification_report(y, y_pred))
    print('---------------------------------------------------------------------')
    print(classification_report(y_test, y_test_pred))
    print('---------------------------------------------------------------------')

    print('Train pr_auc score', pr_auc(y, y_pred))
    print('Test pr_auc score', pr_auc(y_test, y_test_pred))

fit_best_model(X, y, X_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199000
           1       1.00      1.00      1.00       364

    accuracy                           1.00    199364
   macro avg       1.00      1.00      1.00    199364
weighted avg       1.00      1.00      1.00    199364

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.91      0.78      0.84       128

    accuracy                           1.00     85443
   macro avg       0.95      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443

---------------------------------------------------------------------
Train pr_auc score 1.0
Test pr_auc score 0.8453343064701294


Let's try using over and undersampling again with xgboost. From previous discussions, there is implication that sampling methods can be algorithm sensitive.

In [81]:
# SMOTE
smote = SMOTE(random_state=42, sampling_strategy=0.1)
X_train, y_train = smote.fit_resample(X, y)
fit_best_model(X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199000
           1       1.00      1.00      1.00     19900

    accuracy                           1.00    218900
   macro avg       1.00      1.00      1.00    218900
weighted avg       1.00      1.00      1.00    218900

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.83      0.81      0.82       128

    accuracy                           1.00     85443
   macro avg       0.92      0.91      0.91     85443
weighted avg       1.00      1.00      1.00     85443

---------------------------------------------------------------------
Train pr_auc score 1.0
Test pr_auc score 0.8223904445068642


In [87]:
# BorderlineSMOTE
smote = BorderlineSMOTE(random_state=42, sampling_strategy=0.1)
X_train, y_train = smote.fit_resample(X, y)
fit_best_model(X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199000
           1       1.00      1.00      1.00     19900

    accuracy                           1.00    218900
   macro avg       1.00      1.00      1.00    218900
weighted avg       1.00      1.00      1.00    218900

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.87      0.79      0.83       128

    accuracy                           1.00     85443
   macro avg       0.94      0.89      0.91     85443
weighted avg       1.00      1.00      1.00     85443

---------------------------------------------------------------------
Train pr_auc score 1.0
Test pr_auc score 0.8300340776564291


In [85]:
# Over and under sample
from preprocessing import sampler
X_train, y_train = sampler(X, y)
fit_best_model(X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     99500
           1       1.00      1.00      1.00     19900

    accuracy                           1.00    119400
   macro avg       1.00      1.00      1.00    119400
weighted avg       1.00      1.00      1.00    119400

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.83      0.80      0.81       128

    accuracy                           1.00     85443
   macro avg       0.91      0.90      0.91     85443
weighted avg       1.00      1.00      1.00     85443

---------------------------------------------------------------------
Train pr_auc score 1.0
Test pr_auc score 0.8132237945572329


It looks like over and under sampling didn't improve the result. 

### SVC

In [15]:
from sklearn.svm import SVC
param_space = {'C': [100, 10, 1.0, 0.1, 0.001]}
svc = SVC()
svc_search = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_space,
    scoring=score,
    n_iter=5,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1)
svc_search.fit(X, y)
print('Best C is: ', svc_search.best_params_)
print('Best score is:', svc_search.best_score_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  8.4min finished


Best C is:  {'C': 100}
Best score is: 0.5009129029814122


SVC seems to predict badly, so we will go back to random forest again, and try out different scoring methods

### Random forest with custom score randon search

In [27]:
train_data = pd.read_csv('creditcard_train.csv')
print('Average amount for fraud transaction amount', train_data[train_data['Class']==1]['Amount'].mean())

Average amount for fraud transaction amount 127.43673076923078


Imagine a credit card company would want to use the cost of recall and precision to set model. Here we wrote a function that takes in cost weights to calculate the scoring metric, rather than area under the curve. <br>

The cost of low precision: genuine transactions blocked, the customer would be unhappy which could cost the credit card company labour cost to put it right, and loss of business.<br>

The cost of low recall: fraud transaction going through, the credit card company will lose the transaction amount if the money can't be claimed back.<br>

We could devise two kinds of metrics:<br>

1. Static weighted metric: this metric is fixed and used cross all transactions regardless of the transaction amount.<br>
2. Dynamic weighted metric: this metric is dynamic. It changes as the transaction amount changes. <br>

Here we use custome static metric for simplification, and presume, average genuine transaction blocked would cost credit card company £50, and average fraud transaction going through £127. <br>

So the weights are precision: 50/(50+127)=0.28, and recall 0.72<br>

We also zoom in on the random forest parameter search area to where the best performing one was.

In [29]:
from metrics import custom_metric

score = make_scorer(custom_metric)

rf = RandomForestClassifier()
parameter_space = {
                'n_estimators': [50, 100, 200],
                'max_depth': [20, None],
                'min_samples_split': [1, 2, 3, 5],
                'min_samples_leaf': [2, 5, 8],
                'bootstrap': [True]
            }

rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=parameter_space,
    scoring=score,
    n_iter=10,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X, y)
print('Best parameters are:', rf_search.best_params_)
print('Best custom_metric score is:', rf_search.best_score_)



Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 36.1min finished


Best parameters are: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20, 'bootstrap': True}
Best custom_metric score is: -0.00029854944962667715


Now fit the model again with the best parameters. And test on test data

In [31]:
rf = RandomForestClassifier(n_estimators=100, 
                            min_samples_split=5,
                           min_samples_leaf=2,
                           max_depth=20,
                           bootstrap=True)
rf.fit(X, y)
y_pred = rf.predict(X)
y_test_pred = rf.predict(X_test)
print(classification_report(y, y_pred))
print('---------------------------------------------------------------------')
print(classification_report(y_test, y_test_pred))
print('---------------------------------------------------------------------')

print('Train pr_auc score', pr_auc(y, y_pred))
print('Test pr_auc score', pr_auc(y_test, y_test_pred))
print('The custom score for train data is', custom_metric(y, y_pred))
print('The custom score for test data is', custom_metric(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199000
           1       0.99      0.85      0.92       364

    accuracy                           1.00    199364
   macro avg       1.00      0.93      0.96    199364
weighted avg       1.00      1.00      1.00    199364

---------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85315
           1       0.89      0.76      0.82       128

    accuracy                           1.00     85443
   macro avg       0.94      0.88      0.91     85443
weighted avg       1.00      1.00      1.00     85443

---------------------------------------------------------------------
Train pr_auc score 0.9227544782885767
Test pr_auc score 0.8240417859283999
The custom score for train data is -0.00028089324050480525
The custom score for train data is -0.0005032594829301406
