In [58]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [26]:
# Read the processed feature matrices, then the matching target frames.
X_train, X_test = (
    pd.read_csv(csv_path) for csv_path in ("processed_train.csv", "processed_test.csv")
)
y_train, y_test = (
    pd.read_csv(csv_path) for csv_path in ("y_train.csv", "y_test.csv")
)

In [34]:
# Remove the index column that was written into the CSV.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
y_train = y_train.drop(columns='Unnamed: 0', errors='ignore')
y_test = y_test.drop(columns='Unnamed: 0', errors='ignore')
y_test.head()

Unnamed: 0,is_fraud
0,0
1,0
2,0
3,0
4,0


In [41]:
# Remove the index column that was written into the CSV.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
X_train = X_train.drop(columns='Unnamed: 0', errors='ignore')
X_test = X_test.drop(columns='Unnamed: 0', errors='ignore')
X_train.head()

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,age,food_dining,gas_transport,grocery_net,...,06,07,08,09,10,11,12,6-12,12-18,18-24
0,20.0,47.1709,-100.7944,1190,46.398331,-99.813959,70,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,284.88,46.5901,-117.1692,761,45.687331,-117.488135,75,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,5.07,27.9551,-82.2966,79613,27.254081,-81.974799,41,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,45.38,34.077,-84.3033,165556,34.551957,-83.374265,21,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4,27.52,43.6088,-83.953,67858,43.032957,-83.521294,24,0,0,0,...,0,0,1,0,0,0,0,0,0,1


### Creating a Baseline Model before Balancing the data using Sampling techniques

#### 1. Logistic Regression

In [70]:
# Logistic Regression baseline, fitted on the original (imbalanced) training data.
# NOTE(review): warnings are globally silenced at the top of the notebook, so a
# possible ConvergenceWarning would not be visible here — confirm the solver converges.
model_LR = LogisticRegression(random_state=42)
# Pass the target as a 1-D Series: the single-column DataFrame form makes
# scikit-learn raise a DataConversionWarning and flatten it internally.
model_LR.fit(X_train, y_train['is_fraud'])
y_pred = model_LR.predict(X_test)

In [71]:
# Model Evaluation helper (shared by every model in this notebook)

def print_eval(y_pred, model, X_fit=None, y_fit=None):
    """Print accuracy, confusion matrix and classification report for a fitted model.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the notebook-level ``X_test``; compared against ``y_test``.
    model : fitted estimator
        Must expose ``score(X, y)``.
    X_fit, y_fit : optional
        The data the model was actually fitted on. Both default to the
        notebook-level ``X_train`` / ``y_train``. Pass the resampled features and
        target for a model trained on a resampled subset; otherwise the value
        printed as "Training Accuracy" is measured on data the model was not
        fitted on.

    Raises
    ------
    ValueError
        If only one of ``X_fit`` / ``y_fit`` is supplied.
    """
    # Mixing a custom X with the global y (or vice versa) would score mismatched data.
    if (X_fit is None) != (y_fit is None):
        raise ValueError("X_fit and y_fit must be passed together")
    if X_fit is None:
        X_fit, y_fit = X_train, y_train

    print("Training Accuracy: ", model.score(X_fit, y_fit))
    print("Testing Accuracy: ", model.score(X_test, y_test))
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(classification_report(y_test, y_pred))

In [72]:
# Report the baseline logistic-regression metrics.
print_eval(y_pred=y_pred, model=model_LR)

Training Accuracy:  0.9940231977049079
Testing Accuracy:  0.9947095659684734
[[27639    17]
 [  130     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           0.99     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      0.99      0.99     27786



#### 2. Random Forest

In [84]:
# Model Evaluation 2. Random Forest

model_rf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, random_state=42)
# Pass the target as a 1-D Series: a single-column DataFrame makes the forest
# raise a DataConversionWarning and flatten it internally.
model_rf.fit(X_train, y_train['is_fraud'])
y_pred_rf = model_rf.predict(X_test)

print_eval(y_pred_rf, model_rf)

Training Accuracy:  0.9945630379122066
Testing Accuracy:  0.9953213848700785
[[27656     0]
 [  130     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           1.00     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      1.00      0.99     27786



#### 3. Decision Tree

In [85]:
# Model Evaluation 3. Decision tree
# fit() returns the estimator itself, so construction and training are chained.
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred_dt = model_dt.predict(X_test)
print_eval(y_pred=y_pred_dt, model=model_dt)

Training Accuracy:  1.0
Testing Accuracy:  0.9966529907147484
[[27610    46]
 [   47    83]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.64      0.64      0.64       130

    accuracy                           1.00     27786
   macro avg       0.82      0.82      0.82     27786
weighted avg       1.00      1.00      1.00     27786



### Methods for handling the imbalance in data

   
Handling the imbalance in data by one of the following methods:

> 4.1 Random Under-Sampling   
> 4.2 Random Over-Sampling   
> 4.3 SMOTE (Synthetic Minority Over-sampling Technique)   
> 4.4 NearMiss algorithm (under-sampling)   
> 4.5 Ensemble method
   

### 1. Random Under-Sampling

In [79]:
# Adding the dependent feature in the train data set

print(y_train.shape, X_train.shape)
df_train = pd.concat([X_train, y_train], axis = 1)
# concat(axis=1) aligns on the index; mismatched indices would silently add NaN rows.
assert len(df_train) == len(X_train) == len(y_train), "X_train / y_train index mismatch"
print("shape of train set : ", df_train.shape)

# Class count, looked up by label. value_counts() is ordered by frequency, so
# unpacking it positionally only works while class 0 happens to be the majority.
class_counts = df_train['is_fraud'].value_counts()
count_class_0 = int(class_counts.loc[0])
count_class_1 = int(class_counts.loc[1])
print(count_class_0, count_class_1)

# Divide by class
df_class_0 = df_train[df_train['is_fraud'] == 0]
df_class_1 = df_train[df_train['is_fraud'] == 1]
print("Non fraud cases : ", df_class_0.shape, "Fraud cases : ", df_class_1.shape)

(129668, 1) (129668, 571)
shape of train set :  (129668, 572)
128963 705
Non fraud cases :  (128963, 572) Fraud cases :  (705, 572)


In [80]:
# Undersample 0-class and concat the DataFrames of both class

# Fixed random_state so the same majority-class rows are drawn on every run;
# without it the balanced set (and every metric below) changes per execution.
df_class_0_under = df_class_0.sample(count_class_1, random_state=42)
df_train_undersample = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_train_undersample['is_fraud'].value_counts())

# Features / target of the balanced training set used by the models below.
X = df_train_undersample.drop('is_fraud', axis='columns')
y = df_train_undersample['is_fraud']

Random under-sampling:
1    705
0    705
Name: is_fraud, dtype: int64


#### 1. Logistic regression after under sampling

In [82]:
# training and predictions : Logistic Regression
# Fitted on the balanced (under-sampled) X / y, evaluated on the untouched test set.
model_lr_undersample = LogisticRegression(random_state=42).fit(X, y)

y_pred_lr_us = model_lr_undersample.predict(X_test)
# print_eval scores "Training Accuracy" on the full X_train, not on X.
print_eval(y_pred=y_pred_lr_us, model=model_lr_undersample)

Training Accuracy:  0.9542601104358824
Testing Accuracy:  0.9557690923486648
[[26457  1199]
 [   30   100]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27656
           1       0.08      0.77      0.14       130

    accuracy                           0.96     27786
   macro avg       0.54      0.86      0.56     27786
weighted avg       0.99      0.96      0.97     27786



#### 2. Random Forest after under sampling

In [86]:
# training and predictions : random forest
# Same forest settings as the baseline, trained on the under-sampled X / y.
rf_settings = dict(n_estimators=100, max_leaf_nodes=16, random_state=42)
model_rf_undersample = RandomForestClassifier(**rf_settings).fit(X, y)

y_pred_rf_us = model_rf_undersample.predict(X_test)
# print_eval scores "Training Accuracy" on the full X_train, not on X.
print_eval(y_pred=y_pred_rf_us, model=model_rf_undersample)

Training Accuracy:  0.9586019681031558
Testing Accuracy:  0.9594759951054488
[[26559  1097]
 [   29   101]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27656
           1       0.08      0.78      0.15       130

    accuracy                           0.96     27786
   macro avg       0.54      0.87      0.57     27786
weighted avg       0.99      0.96      0.98     27786



#### 3. Decision Tree after under sampling

In [87]:
# training and predictions : decision tree
# Unconstrained tree trained on the under-sampled X / y.
model_dt_undersample = DecisionTreeClassifier(random_state=42).fit(X, y)

y_pred_dt_us = model_dt_undersample.predict(X_test)
# print_eval scores "Training Accuracy" on the full X_train, not on X.
print_eval(y_pred=y_pred_dt_us, model=model_dt_undersample)

Training Accuracy:  0.9303914612703211
Testing Accuracy:  0.9311523788958468
[[25750  1906]
 [    7   123]]
              precision    recall  f1-score   support

           0       1.00      0.93      0.96     27656
           1       0.06      0.95      0.11       130

    accuracy                           0.93     27786
   macro avg       0.53      0.94      0.54     27786
weighted avg       1.00      0.93      0.96     27786



### Comparison of results and observations:

|                                       | Class-0 Recall| Class-1 Recall|Class-0 f1-score| Class-1 f1-score|
|---------------------------------------|---------------|---------------|----------------|-----------------|
|Logistic Regression                    |     1.00      |     0.00      |     1.00       |      0.00       |
|Decision Tree Classifier               |     1.00      |     0.64      |     1.00       |      0.64       |
|Random Forest Classifier               |     1.00      |     0.00      |     1.00       |      0.00       |
|Logistic Regression Under Sampling     |     0.96      |     0.77      |     0.98       |      0.14       |
|Decision Tree Classifier Under Sampling|     0.93      |     0.95      |     0.96       |      0.11       |
|Random Forest Classifier Under Sampling|     0.96      |     0.78      |     0.98       |      0.15       |
