In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the pre-processed feature matrices and their label files, in the same
# order as the names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "processed_train.csv",
        "processed_test.csv",
        "y_train.csv",
        "y_test.csv",
    )
)

#### Dropping extra columns from the train and test data

In [3]:
# The label files carry an extra 'Unnamed: 0' column (it looks like a saved
# row index) that is not part of the target. errors='ignore' keeps this cell
# idempotent: running it a second time is a no-op instead of a KeyError.
y_train = y_train.drop(columns='Unnamed: 0', errors='ignore')
y_test = y_test.drop(columns='Unnamed: 0', errors='ignore')
y_test.head()

Unnamed: 0,is_fraud
0,0
1,0
2,0
3,0
4,0


In [4]:
# Remove the extra 'Unnamed: 0' column from the feature frames so it is not
# used as a predictor. errors='ignore' makes the cell safe to re-run after
# the column has already been dropped.
X_train = X_train.drop(columns='Unnamed: 0', errors='ignore')
X_test = X_test.drop(columns='Unnamed: 0', errors='ignore')
X_train.head()

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,age,food_dining,gas_transport,grocery_net,...,06,07,08,09,10,11,12,6-12,12-18,18-24
0,20.0,47.1709,-100.7944,1190,46.398331,-99.813959,70,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,284.88,46.5901,-117.1692,761,45.687331,-117.488135,75,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,5.07,27.9551,-82.2966,79613,27.254081,-81.974799,41,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,45.38,34.077,-84.3033,165556,34.551957,-83.374265,21,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4,27.52,43.6088,-83.953,67858,43.032957,-83.521294,24,0,0,0,...,0,0,1,0,0,0,0,0,0,1


### Creating a Baseline Model before Balancing the data using Sampling techniques

#### 1. Logistic Regression

In [5]:
# Baseline 1: logistic regression on the original (imbalanced) training data.
# y_train is a one-column DataFrame, but scikit-learn expects a 1-D target, so
# it is flattened explicitly instead of relying on an implicit conversion that
# emits a warning (hidden here by the global warnings filter).
# NOTE(review): the features are not scaled and max_iter is left at its
# default; with warnings silenced a non-converged fit would go unnoticed.
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train.values.ravel())
y_pred_lr = model_lr.predict(X_test)

In [6]:
# Shared evaluation helper, reused for every model in this notebook

def print_eval(y_pred, model, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """Print accuracy, the confusion matrix and positive-class metrics.

    Parameters
    ----------
    y_pred : array-like
        Predictions to compare against the test labels.
    model : fitted estimator
        Must expose ``score(X, y)``; used for the two accuracy figures.
    X_tr, y_tr, X_te, y_te : optional
        Data to evaluate against. Any argument left as ``None`` falls back to
        the notebook-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``,
        which is what the helper always used before these parameters existed.

    Notes
    -----
    With the defaults, "Training Accuracy" is measured on the original
    ``X_train`` / ``y_train`` even when ``model`` was fitted on a resampled
    copy. Pass ``X_tr`` / ``y_tr`` to score on the data the model actually saw.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Resolve the fallbacks at call time so the helper sees the current globals.
    X_tr = X_train if X_tr is None else X_tr
    y_tr = y_train if y_tr is None else y_tr
    X_te = X_test if X_te is None else X_te
    y_te = y_test if y_te is None else y_te

    print("Training Accuracy: ", model.score(X_tr, y_tr))
    print("Testing Accuracy: ", model.score(X_te, y_te))
    cm = confusion_matrix(y_te, y_pred)
    print(cm)

    precision = precision_score(y_te, y_pred)
    print('Precision: %.3f' % precision)

    recall = recall_score(y_te, y_pred)
    print('Recall: %.3f' % recall)

    score = f1_score(y_te, y_pred)
    print('F-Measure: %.3f' % score)

    print(classification_report(y_te, y_pred))

In [7]:
# Report the baseline logistic-regression metrics (scored against X_test / y_test inside print_eval).
print_eval(y_pred_lr, model_lr)

Training Accuracy:  0.9940231977049079
Testing Accuracy:  0.9947095659684734
[[27639    17]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           0.99     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      0.99      0.99     27786



In [8]:
# Baseline 2: random forest on the original (imbalanced) training data.
# y_train is a one-column DataFrame; it is flattened to the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning
# (currently hidden by the global warnings filter).

model_rf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, random_state=42)
model_rf.fit(X_train, y_train.values.ravel())
y_pred_rf = model_rf.predict(X_test)

print_eval(y_pred_rf, model_rf)

Training Accuracy:  0.9945630379122066
Testing Accuracy:  0.9953213848700785
[[27656     0]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           1.00     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      1.00      0.99     27786



#### 3. Decision Tree

In [9]:
# Baseline 3: decision tree fitted on the original (imbalanced) training data,
# then scored with the shared evaluation helper.

model_dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

print_eval(y_pred=y_pred_dt, model=model_dt)

Training Accuracy:  1.0
Testing Accuracy:  0.9966529907147484
[[27610    46]
 [   47    83]]
Precision: 0.643
Recall: 0.638
F-Measure: 0.641
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.64      0.64      0.64       130

    accuracy                           1.00     27786
   macro avg       0.82      0.82      0.82     27786
weighted avg       1.00      1.00      1.00     27786



### Methods for handling the imbalance in the data


The class imbalance can be handled with one of the following methods:

- 4.1 Random Under-Sampling
- 4.2 Random Over-Sampling
- 4.3 SMOTE (Synthetic Minority Over-sampling Technique)
- 4.4 Near Miss algorithm (under-sampling)
- 4.5 Ensemble methods
   

### 1. Random Under-Sampling

#### Concatenating X_train and y_train before doing sampling

In [10]:
# Re-attach the target column to the features so rows can be sampled per class

print(y_train.shape, X_train.shape)
df_train = pd.concat([X_train, y_train], axis = 1)
print("shape of train set : ", df_train.shape)

# Class counts, looked up by label. value_counts() orders its result by
# frequency, so unpacking it positionally would silently swap the two counts
# if class 1 ever outnumbered class 0 (and would raise if a class were absent).
class_counts = df_train['is_fraud'].value_counts()
count_class_0 = class_counts.get(0, 0)
count_class_1 = class_counts.get(1, 0)
print(count_class_0, count_class_1)

# Divide by class
df_class_0 = df_train[df_train['is_fraud'] == 0]
df_class_1 = df_train[df_train['is_fraud'] == 1]
print("Non fraud cases : ", df_class_0.shape, "Fraud cases : ", df_class_1.shape)

(129668, 1) (129668, 571)
shape of train set :  (129668, 572)
128963 705
Non fraud cases :  (128963, 572) Fraud cases :  (705, 572)


#### Undersampling:
We randomly drop samples from the majority class (class 0, normal) until it is the same size as the minority class (class 1, fraud).

In [11]:
# Undersample class 0 down to the size of class 1, then stack both classes.
# random_state pins which majority rows are kept, so the metrics reported
# below are reproducible after Restart & Run All.

df_class_0_under = df_class_0.sample(n=count_class_1, random_state=42)
df_train_undersample = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_train_undersample.is_fraud.value_counts())

# X / y are rebound by the over-sampling section later on; run cells in order.
X = df_train_undersample.drop('is_fraud', axis='columns')
y = df_train_undersample['is_fraud']

Random under-sampling:
1    705
0    705
Name: is_fraud, dtype: int64


#### 1. Logistic regression after under sampling

In [12]:
# Logistic regression fitted on the under-sampled X / y built in the cell above;
# predictions are still made on the untouched test set.

model_lr_undersample = LogisticRegression(random_state=42).fit(X, y)
y_pred_lr_us = model_lr_undersample.predict(X_test)

print_eval(y_pred=y_pred_lr_us, model=model_lr_undersample)

Training Accuracy:  0.9464401394330135
Testing Accuracy:  0.9474915425034189
[[26227  1429]
 [   30   100]]
Precision: 0.065
Recall: 0.769
F-Measure: 0.121
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     27656
           1       0.07      0.77      0.12       130

    accuracy                           0.95     27786
   macro avg       0.53      0.86      0.55     27786
weighted avg       0.99      0.95      0.97     27786



#### 2. Random Forest after under sampling

In [13]:
# Random forest (same hyper-parameters as the baseline) fitted on the
# under-sampled X / y; predictions are made on the untouched test set.

model_rf_undersample = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=16,
    random_state=42,
).fit(X, y)
y_pred_rf_us = model_rf_undersample.predict(X_test)

print_eval(y_pred=y_pred_rf_us, model=model_rf_undersample)

Training Accuracy:  0.9572600795878705
Testing Accuracy:  0.9589361548981501
[[26545  1111]
 [   30   100]]
Precision: 0.083
Recall: 0.769
F-Measure: 0.149
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27656
           1       0.08      0.77      0.15       130

    accuracy                           0.96     27786
   macro avg       0.54      0.86      0.56     27786
weighted avg       0.99      0.96      0.98     27786



#### 3. Decision Tree after under sampling

In [14]:
# Decision tree fitted on the under-sampled X / y; predictions are made on the
# untouched test set.

model_dt_undersample = DecisionTreeClassifier(random_state=42).fit(X, y)
y_pred_dt_us = model_dt_undersample.predict(X_test)

print_eval(y_pred=y_pred_dt_us, model=model_dt_undersample)

Training Accuracy:  0.9261498596415461
Testing Accuracy:  0.9271935507089901
[[25635  2021]
 [    2   128]]
Precision: 0.060
Recall: 0.985
F-Measure: 0.112
              precision    recall  f1-score   support

           0       1.00      0.93      0.96     27656
           1       0.06      0.98      0.11       130

    accuracy                           0.93     27786
   macro avg       0.53      0.96      0.54     27786
weighted avg       1.00      0.93      0.96     27786



### Comparison of results and Observations :

|                                       | Class-0 Recall| Class-1 Recall|Class-0 f1-score|
|---------------------------------------|---------------|---------------|----------------|
|Logistic Regression                    |     1.00      |     0.00      |     1.00       |
|Decision Tree Classifier               |     1.00      |     0.64      |     1.00       |
|Random Forest Classifier               |     1.00      |     0.00      |     1.00       |
|Logistic Regression Under Sampling     |     0.95      |     0.77      |     0.97       |
|Decision Tree Classifier Under Sampling|     0.93      |     0.98      |     0.96       |
|Random Forest Classifier Under Sampling|     0.96      |     0.77      |     0.98       |


### 2. Random Over Sampling

#### Over Sampling:
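We randomly replicate samples from the minority class (class 1), drawing with replacement, until it matches the majority class in size. It is often preferred over under-sampling because no data is discarded, but since the added rows are exact duplicates of existing ones, overfitting can be an issue.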

In [15]:
# Oversample class 1 (with replacement) up to the size of class 0, then stack
# both classes. random_state pins which minority rows are duplicated, so the
# metrics reported below are reproducible after Restart & Run All.

df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=42)
df_train_oversample = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_train_oversample.is_fraud.value_counts())

# Rebinds X / y, which previously held the under-sampled data.
X = df_train_oversample.drop('is_fraud', axis='columns')
y = df_train_oversample['is_fraud']

Random over-sampling:
1    128963
0    128963
Name: is_fraud, dtype: int64


#### 1. Logistic Regression after Over Sampling

In [16]:
# Logistic regression fitted on the over-sampled X / y built in the cell above;
# predictions are made on the untouched test set.

model_lr_oversample = LogisticRegression(random_state=42).fit(X, y)
y_pred_lr_os = model_lr_oversample.predict(X_test)

print_eval(y_pred=y_pred_lr_os, model=model_lr_oversample)

Training Accuracy:  0.9493167165376192
Testing Accuracy:  0.9510184985244368
[[26325  1331]
 [   30   100]]
Precision: 0.070
Recall: 0.769
F-Measure: 0.128
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     27656
           1       0.07      0.77      0.13       130

    accuracy                           0.95     27786
   macro avg       0.53      0.86      0.55     27786
weighted avg       0.99      0.95      0.97     27786



#### 2. Random Forest after Over Sampling

In [17]:
# Random forest (same hyper-parameters as the baseline) fitted on the
# over-sampled X / y; predictions are made on the untouched test set.

model_rf_oversample = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=16,
    random_state=42,
).fit(X, y)
y_pred_rf_os = model_rf_oversample.predict(X_test)

print_eval(y_pred=y_pred_rf_os, model=model_rf_oversample)

Training Accuracy:  0.966051762963877
Testing Accuracy:  0.9667458432304038
[[26765   891]
 [   33    97]]
Precision: 0.098
Recall: 0.746
F-Measure: 0.174
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     27656
           1       0.10      0.75      0.17       130

    accuracy                           0.97     27786
   macro avg       0.55      0.86      0.58     27786
weighted avg       0.99      0.97      0.98     27786



#### 3. Decision Tree after Over Sampling

In [18]:
# Decision tree fitted on the over-sampled X / y; predictions are made on the
# untouched test set.

model_dt_oversample = DecisionTreeClassifier(random_state=42).fit(X, y)
y_pred_dt_os = model_dt_oversample.predict(X_test)

print_eval(y_pred=y_pred_dt_os, model=model_dt_oversample)

Training Accuracy:  1.0
Testing Accuracy:  0.9961491398546031
[[27594    62]
 [   45    85]]
Precision: 0.578
Recall: 0.654
F-Measure: 0.614
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.58      0.65      0.61       130

    accuracy                           1.00     27786
   macro avg       0.79      0.83      0.81     27786
weighted avg       1.00      1.00      1.00     27786



### Comparison of results and Observations :

|                                       | Class-0 Recall| Class-1 Recall|Class-0 f1-score| Class-1 f1-score|
|---------------------------------------|---------------|---------------|----------------|-----------------|
|Logistic Regression                    |     1.00      |     0.00      |     1.00       |      0.00       |
|Decision Tree Classifier               |     1.00      |     0.64      |     1.00       |      0.64       |
|Random Forest Classifier               |     1.00      |     0.00      |     1.00       |      0.00       |
|Logistic Regression Under Sampling     |     0.95      |     0.77      |     0.97       |      0.12       |
|Decision Tree Classifier Under Sampling|     0.93      |     0.98      |     0.96       |      0.11       |
|Random Forest Classifier Under Sampling|     0.96      |     0.77      |     0.98       |      0.15       |
|Logistic Regression Over Sampling      |     0.95      |     0.77      |     0.97       |      0.13       |
|Decision Tree Classifier Over Sampling |     1.00      |     0.65      |     1.00       |      0.61       |
|Random Forest Classifier Over Sampling |     0.97      |     0.75      |     0.98       |      0.17       |

### 3. SMOTE (Synthetic Minority Over Sampling Technique) - Informed Over Sampling

- Choose a sample from the minority class as the input vector
- Find its k nearest neighbors (k_neighbors is specified as an argument in the SMOTE() function)
- Choose one of these neighbors and place a synthetic point anywhere on the line joining the point under consideration and its chosen neighbor
- Repeat the steps until data is balanced

SMOTE mitigates the problem of overfitting caused by random oversampling as synthetic examples are generated rather than replicating the existing samples. It's not very effective for high dimensional data.

In [19]:
# SMOTE: synthesise minority-class rows until both classes are the same size.
# fit_resample replaces the old fit_sample, which no longer exists in current
# imbalanced-learn releases; random_state makes the synthetic rows
# reproducible across runs.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_sm, y_sm = smote.fit_resample(X_train, y_train)
y_sm.value_counts()

is_fraud
1           128963
0           128963
dtype: int64

#### 1. Logistic Regression after SMOTE:

In [20]:
# Logistic regression fitted on the SMOTE-balanced training data.
# y_sm is a one-column DataFrame; it is flattened to the 1-D target that
# scikit-learn expects, which avoids the column-vector conversion warning
# (currently hidden by the global warnings filter).

model_lr_smote = LogisticRegression(random_state=42)
model_lr_smote.fit(X_sm, y_sm.values.ravel())
y_pred_lr_sm = model_lr_smote.predict(X_test)

print_eval(y_pred_lr_sm, model_lr_smote)

Training Accuracy:  0.949239596508005
Testing Accuracy:  0.9505866263585978
[[26313  1343]
 [   30   100]]
Precision: 0.069
Recall: 0.769
F-Measure: 0.127
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     27656
           1       0.07      0.77      0.13       130

    accuracy                           0.95     27786
   macro avg       0.53      0.86      0.55     27786
weighted avg       0.99      0.95      0.97     27786



#### 2. Random Forest after SMOTE:

In [21]:
# Random forest (same hyper-parameters as the baseline) fitted on the
# SMOTE-balanced training data. y_sm is a one-column DataFrame; it is
# flattened to the 1-D target that scikit-learn expects, which avoids the
# column-vector conversion warning (currently hidden by the global filter).

model_rf_smote = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, random_state=42)
model_rf_smote.fit(X_sm, y_sm.values.ravel())
y_pred_rf_sm = model_rf_smote.predict(X_test)

print_eval(y_pred_rf_sm, model_rf_smote)

Training Accuracy:  0.9823549372242959
Testing Accuracy:  0.9871518030662924
[[27342   314]
 [   43    87]]
Precision: 0.217
Recall: 0.669
F-Measure: 0.328
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27656
           1       0.22      0.67      0.33       130

    accuracy                           0.99     27786
   macro avg       0.61      0.83      0.66     27786
weighted avg       0.99      0.99      0.99     27786



#### 3. Decision Tree after SMOTE:

In [22]:
# Decision tree fitted on the SMOTE-balanced X_sm / y_sm; predictions are made
# on the untouched test set.

model_dt_smote = DecisionTreeClassifier(random_state=42).fit(X_sm, y_sm)
y_pred_dt_sm = model_dt_smote.predict(X_test)

print_eval(y_pred=y_pred_dt_sm, model=model_dt_smote)

Training Accuracy:  1.0
Testing Accuracy:  0.9916144821132945
[[27502   154]
 [   79    51]]
Precision: 0.249
Recall: 0.392
F-Measure: 0.304
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     27656
           1       0.25      0.39      0.30       130

    accuracy                           0.99     27786
   macro avg       0.62      0.69      0.65     27786
weighted avg       0.99      0.99      0.99     27786



### Comparison of results and Observations :

|                                       | Class-0 Recall| Class-1 Recall|Class-0 f1-score| Class-1 f1-score|
|---------------------------------------|---------------|---------------|----------------|-----------------|
|Logistic Regression                    |     1.00      |     0.00      |     1.00       |      0.00       |
|Decision Tree Classifier               |     1.00      |     0.64      |     1.00       |      0.64       |
|Random Forest Classifier               |     1.00      |     0.00      |     1.00       |      0.00       |
|Logistic Regression Under Sampling     |     0.95      |     0.77      |     0.97       |      0.12       |
|Decision Tree Classifier Under Sampling|     0.93      |     0.98      |     0.96       |      0.11       |
|Random Forest Classifier Under Sampling|     0.96      |     0.77      |     0.98       |      0.15       |
|Logistic Regression Over Sampling      |     0.95      |     0.77      |     0.97       |      0.13       |
|Decision Tree Classifier Over Sampling |     1.00      |     0.65      |     1.00       |      0.61       |
|Random Forest Classifier Over Sampling |     0.97      |     0.75      |     0.98       |      0.17       |
|Logistic Regression SMOTE              |     0.95      |     0.77      |     0.97       |      0.13       |
|Decision Tree Classifier SMOTE         |     0.99      |     0.39      |     1.00       |      0.30       |
|Random Forest Classifier SMOTE         |     0.99      |     0.67      |     0.99       |      0.33       |