## Imports

In [48]:
import pandas as pd
import numpy as np

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [50]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [51]:
data_path  =r'/home/alahira/Documents/Data science projects/Xente_Fraud_Detection_Challenge/Data/'

In [52]:
## load datasets
num_plus_dummies = pd.read_csv(data_path + 'processed/num_plus_dummies.csv')

In [53]:
num_plus_dummies.head(5)

Unnamed: 0.1,Unnamed: 0,Amount,Value,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,ProductId_ProductId_1,...,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,TransactionYear,TransactionMonth,TransactionDay,TransactionHour,FraudResult
0,0,-0.046371,-0.072291,0,0,0,0,0,1,0,...,0,0,0,1,0,2018,11,15,2,0
1,1,-0.054643,-0.080251,0,0,0,1,0,0,0,...,0,0,1,0,0,2018,11,15,2,0
2,2,-0.050426,-0.076352,0,0,0,0,0,1,1,...,0,0,0,1,0,2018,11,15,2,0
3,3,0.107717,0.096648,1,0,0,0,0,0,0,...,1,0,0,1,0,2018,11,15,3,0
4,4,-0.059704,-0.075183,0,0,0,1,0,0,0,...,0,0,1,0,0,2018,11,15,3,0


In [54]:
## drop unnamed column
num_plus_dummies.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

**Modeling using Random Forest Classifier**

In [55]:
## split dataset into x and y variables
x = num_plus_dummies.drop(labels = 'FraudResult', axis = 1)
y = num_plus_dummies['FraudResult']

In [56]:
def split_fit(algorithm, x, y):
    """Splits and fits training dataset 
    
        Parameters:
        
            algorithm (scikit-learn algorithm): machine learning algorithm to make 
                        predictions
            x(iterable) : x
            y(iterable) : y
        
        Returns:
            fitted model, x_test and y_test variables"""
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state= 10, train_size =0.75)
    
    ##instantiate and fit algorithm
    algorithm = algorithm()
    fitted_algorithm = algorithm.fit(x_train, y_train)
    
    return fitted_algorithm, x_test, y_test

In [57]:
first_rfc, x_test, y_test = split_fit(RandomForestClassifier, x, y)
first_rfc_pred = rfc.predict(x_test)

In [58]:
## predict using fitted model
first_rf_preds = rf.predict(x_test)

**Check metrics**

In [59]:
confusion_matrix(y_test, first_rf_preds )

array([[23865,     8],
       [    2,    41]])

In [60]:
f1_score(y_test, first_rf_preds )

0.8913043478260869

In [61]:
accuracy_score(y_test, rf_preds)

0.999581869877906

<br />

**Note**

Due to the inbalanced target classes, the f1_score would be a more suitable metric. Consequently, we could try to randomly the majority class and oversample the minority class in the data    

In [62]:
# undersample and over sample target classes
under = RandomUnderSampler(sampling_strategy=0.5)
over = SMOTE(sampling_strategy=0.1)

In [63]:
## create resampling pipeline
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [64]:
## fit and resample dataset
x, y= pipeline.fit_resample(x, y)

In [65]:
## split and fit dataset

second_rfc, x_test, y_test = split_fit(RandomForestClassifier, x, y)
second_rfc_pred = rfc.predict(x_test)

In [66]:
## check confusion matrix
confusion_matrix(y_test, second_rfc_pred)

array([[4747,    2],
       [   1, 2410]])

In [67]:
## cheeck acccuracy and f1_score
print (f'f1_score : {f1_score(y_test, second_rfc_pred)}')
print (f'accracy_score : {accuracy_score(y_test, second_rfc_pred)}')

f1_score : 0.9993779805100559
accracy_score : 0.9995810055865921


<br />

**Note**

f1_score has increased significantly