## Imports

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [4]:
# Root data directory. The trailing slash matters because file names are
# appended to it with plain string concatenation.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
data_path = r'/home/alahira/Documents/Data science projects/Xente_Fraud_Detection_Challenge/Data/'

In [11]:
## load the processed feature table from the data directory
num_plus_dummies = pd.read_csv(
    filepath_or_buffer=data_path + 'processed/num_plus_dummies.csv'
)

In [12]:
# preview the first five rows (5 is the default for head())
num_plus_dummies.head()

Unnamed: 0.1,Unnamed: 0,Amount,Value,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,ProductId_ProductId_1,...,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,TransactionYear,TransactionMonth,TransactionDay,TransactionHour,FraudResult
0,0,-0.046371,-0.072291,0,0,0,0,0,1,0,...,0,0,0,1,0,2018,11,15,2,0
1,1,-0.054643,-0.080251,0,0,0,1,0,0,0,...,0,0,1,0,0,2018,11,15,2,0
2,2,-0.050426,-0.076352,0,0,0,0,0,1,1,...,0,0,0,1,0,2018,11,15,2,0
3,3,0.107717,0.096648,1,0,0,0,0,0,0,...,1,0,0,1,0,2018,11,15,3,0
4,4,-0.059704,-0.075183,0,0,0,1,0,0,0,...,0,0,1,0,0,2018,11,15,3,0


In [13]:
## drop unnamed column
# Rebind instead of mutating in place, and ignore a missing label, so that
# re-running this cell does not raise KeyError once the column is gone.
num_plus_dummies = num_plus_dummies.drop(columns='Unnamed: 0', errors='ignore')

**Modeling using Random Forest Classifier**

In [14]:
## separate the target column (y) from the feature columns (x)
y = num_plus_dummies['FraudResult']
x = num_plus_dummies.drop(columns='FraudResult')

In [15]:
def split_fit(algorithm, x, y, random_state=10, train_size=0.75,
              stratify=False, **model_params):
    """Splits the data into train/test sets and fits a model on the training set

        Parameters:

            algorithm (scikit-learn algorithm): estimator class (not an
                        instance); it is instantiated inside this function
            x(iterable) : feature data
            y(iterable) : target data
            random_state (int): seed passed to train_test_split
                        (default 10, the value previously hard-coded)
            train_size (float): share of the rows used for training
                        (default 0.75, the value previously hard-coded)
            stratify (bool): when True the split preserves the class
                        proportions of y; default False keeps the
                        previous, unstratified behaviour
            **model_params: keyword arguments forwarded to the estimator
                        constructor, e.g. random_state=0 for a
                        reproducible fit

        Returns:
            fitted model, x_test and y_test variables"""

    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        train_size=train_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    ## instantiate and fit algorithm
    # Use a separate name for the instance so the `algorithm` argument (the
    # class) is not shadowed.
    model = algorithm(**model_params)
    fitted_algorithm = model.fit(x_train, y_train)

    return fitted_algorithm, x_test, y_test

In [21]:
# Fit the baseline random forest, then predict on the held-out split that
# split_fit returns. The fitted model is bound to `first_rf`.
first_rf, x_test, y_test = split_fit(RandomForestClassifier, x, y)
first_rf_preds = first_rf.predict(x_test)

In [None]:
## predict using fitted model
#first_rf_preds = rf.predict(x_test)

**Check metrics**

In [22]:
# confusion matrix of the first model on the held-out split
confusion_matrix(y_true=y_test, y_pred=first_rf_preds)

array([[23865,     8],
       [    2,    41]])

In [23]:
# F1 score of the first model on the held-out split
f1_score(y_true=y_test, y_pred=first_rf_preds)

0.8913043478260869

In [24]:
# accuracy of the first model on the held-out split
accuracy_score(y_true=y_test, y_pred=first_rf_preds)

0.999581869877906

<br />

**Note**

Due to the imbalanced target classes, the f1_score is a more suitable metric than accuracy. Consequently, we could try to randomly undersample the majority class and oversample the minority class in the data.

In [25]:
# undersample and over sample target classes
# A float sampling_strategy is the target minority/majority ratio after the
# step has run. Both samplers draw randomly, so a fixed random_state is
# passed to make the resampled dataset reproducible between runs.
under = RandomUnderSampler(sampling_strategy=0.5, random_state=10)
over = SMOTE(sampling_strategy=0.1, random_state=10)

In [26]:
## build the resampling pipeline: oversampling ('o') runs before undersampling ('u')
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps)

In [27]:
## fit and resample dataset
# NOTE(review): this rebinds x and y, so running the cell a second time
# resamples the already-resampled data. It also runs before any train/test
# split, so every later split of x and y contains resampled rows.
x, y= pipeline.fit_resample(x, y)

In [29]:
## split and fit dataset (x and y are the resampled versions at this point)

second_rf, x_test, y_test = split_fit(RandomForestClassifier, x, y)
second_rf_pred = second_rf.predict(x_test)

In [30]:
## confusion matrix of the second model on its held-out split
confusion_matrix(y_true=y_test, y_pred=second_rf_pred)

array([[4743,    6],
       [   1, 2410]])

In [32]:
## check accuracy and f1_score of the second model
print(f'f1_score : {f1_score(y_test, second_rf_pred)}')
print(f'accracy_score : {accuracy_score(y_test, second_rf_pred)}')

f1_score : 0.9985498239071887
accracy_score : 0.9990223463687151


<br />

**Note**

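The f1_score has increased significantly (from about 0.891 to about 0.999). Note, however, that the data was resampled *before* the train/test split, so this test set itself contains oversampled minority rows and an undersampled majority class. The score is therefore not directly comparable with the first model's and is likely optimistic.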

## Feature Importance

In [53]:
# rank the features by the importance scores of the second forest
feature_importances = pd.DataFrame(
    {'Feature Importance': second_rf.feature_importances_},
    index=x_test.columns,
).sort_values(by='Feature Importance', ascending=False)

In [54]:
# Show only the ten highest-ranked features instead of the whole table.
feature_importances.head(10)

Unnamed: 0,Feature Importance
Value,0.3366047
Amount,0.3288208
ProductCategory_airtime,0.06785291
ProviderId_ProviderId_4,0.04213964
ProductId_ProductId_15,0.0370313
ChannelId_ChannelId_2,0.03135757
ProductId_ProductId_6,0.02849611
ProviderId_ProviderId_6,0.02548432
ProductId_ProductId_3,0.01485438
ChannelId_ChannelId_3,0.01201703


In [64]:
## create new dataframe using top ten most important features
# feature_importances is sorted in descending order, so its first ten index
# labels are the ten most important columns. Reading them from the table
# keeps the selection in sync with the fitted forest instead of relying on
# a hand-copied list of names.
important_features = x[feature_importances.index[:10].tolist()]

In [66]:
# refit a random forest using only the selected feature columns
third_rf, x_test, y_test = split_fit(
    algorithm=RandomForestClassifier, x=important_features, y=y
)

In [67]:
# predictions of the reduced-feature model on its held-out split
third_rf_preds = third_rf.predict(X=x_test)

In [68]:
## check accuracy and f1_score of the reduced-feature model
print(f'f1_score : {f1_score(y_test, third_rf_preds)}')
print(f'accracy_score : {accuracy_score(y_test, third_rf_preds)}')

f1_score : 0.9895963379109446
accracy_score : 0.9930167597765364


<br />

**Note**

Model performance has dropped, but only by an almost negligible amount (f1_score from about 0.9985 to about 0.9896) while using just ten features.