In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

#### Loading Preprocessed data

In [2]:
X_train = pd.read_csv("processed_train.csv")
X_test = pd.read_csv("processed_test.csv")
y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

In [3]:
# Dropping additional column

X_sm = X_train.drop('Unnamed: 0', axis=1)
X_test = X_test.drop('Unnamed: 0', axis=1)
y_sm = y_train.drop('Unnamed: 0', axis=1)
y_test = y_test.drop('Unnamed: 0', axis=1)



#### Model Evaluation Function

In [4]:
# Model Evaluation

def print_eval(y_pred, model):
    print("Training Accuracy: ", model.score(X_sm, y_sm))
    print("Testing Accuracy: ", model.score(X_test, y_test))
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    precision = precision_score(y_test, y_pred)
    print('Precision: %.3f' % precision)
    
    recall = recall_score(y_test, y_pred)
    print('Recall: %.3f' % recall)
    
    score = f1_score(y_test, y_pred)
    print('F-Measure: %.3f' % score)
    
    print(classification_report(y_test, y_pred))

### Model Implementation:

>1	Logistic Regression   
2	SVM    
3	Random Forest   
4	Decision Tree  
5   Gradient Boosting
6	Artificial Neural Network   
7   Penalized-SVM 

### 1. Logistic Regression

In [5]:
# training and predictions : Logistic Regression

model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_sm, y_sm)
y_pred_lr = model_lr.predict(X_test)

print_eval(y_pred_lr, model_lr)

Training Accuracy:  0.9940231977049079
Testing Accuracy:  0.9947095659684734
[[27639    17]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           0.99     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      0.99      0.99     27786



### 2. SVM

#### 2.1 SVM with linear kernel and c=1

In [7]:
# training and predictions : SVM

model_svm = Pipeline([
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42)),
    ])

model_svm.fit(X_sm, y_sm)

y_pred_svm = model_svm.predict(X_test)
print_eval(y_pred_svm, model_svm)

Training Accuracy:  0.9942622697967116
Testing Accuracy:  0.9948535233570863
[[27642    14]
 [  129     1]]
Precision: 0.067
Recall: 0.008
F-Measure: 0.014
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.07      0.01      0.01       130

    accuracy                           0.99     27786
   macro avg       0.53      0.50      0.51     27786
weighted avg       0.99      0.99      0.99     27786



#### 2.2 SVM with linear kernel and c=50

In [8]:
model_svm1 = Pipeline([
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=50, loss="hinge", random_state=42)),
    ])

model_svm1.fit(X_sm, y_sm)

y_pred_svm1 = model_svm1.predict(X_test)

print_eval(y_pred_svm1, model_svm1)

Training Accuracy:  0.9890798038066446
Testing Accuracy:  0.9886273662995754
[[27470   186]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27656
           1       0.00      0.00      0.00       130

    accuracy                           0.99     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      0.99      0.99     27786



#### 2.3 SVM with linear kernel, c=0.01

In [9]:
model_svm2 = Pipeline([
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=0.001, loss="hinge", random_state=42)),
    ])

model_svm2.fit(X_sm, y_sm)

y_pred_svm2 = model_svm2.predict(X_test)

print_eval(y_pred_svm2, model_svm2)

Training Accuracy:  0.9947018539655119
Testing Accuracy:  0.9953213848700785
[[27656     0]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           1.00     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      1.00      0.99     27786



#### 2.4 SVM with polynomial kernel, degree=2,  c=1 and coef=1

In [10]:
model_svm_poly = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=2, coef0=1, C=1))
    ])

model_svm_poly.fit(X_sm, y_sm)
y_pred_svm_poly = model_svm_poly.predict(X_test)

print_eval(y_pred_svm_poly, model_svm_poly)

Training Accuracy:  0.9961825585341024
Testing Accuracy:  0.9955733103001512
[[27650     6]
 [  117    13]]
Precision: 0.684
Recall: 0.100
F-Measure: 0.174
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.68      0.10      0.17       130

    accuracy                           1.00     27786
   macro avg       0.84      0.55      0.59     27786
weighted avg       0.99      1.00      0.99     27786



#### 2.5 SVM with polynomial kernel, degree=3, c=5, coef=1

In [11]:
model_svm_poly1 = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=5))
    ])

model_svm_poly1.fit(X_sm, y_sm)
y_pred_svm_poly1 = model_svm_poly1.predict(X_test)

print_eval(y_pred_svm_poly1, model_svm_poly1)

Training Accuracy:  0.9987043835024833
Testing Accuracy:  0.995105448787159
[[27618    38]
 [   98    32]]
Precision: 0.457
Recall: 0.246
F-Measure: 0.320
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.46      0.25      0.32       130

    accuracy                           1.00     27786
   macro avg       0.73      0.62      0.66     27786
weighted avg       0.99      1.00      0.99     27786



#### 2.6 SVM with RBF kernel, c=0.001 and gamma=5 (running for hours with no result)

In [None]:
model_svm_rbf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001))
    ])

model_svm_rbf.fit(X_sm, y_sm)
y_pred_svm_rbf = model_svm_rbf.predict(X_test)

print_eval(y_pred_svm_rbf, model_svm_rbf)

### 3. Random Forest

#### 3.1 Random Forest with n_estimators=100, max_leaf_nodes=16

In [12]:
# training and predictions : random forest

model_rf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, random_state=42)
model_rf.fit(X_sm, y_sm)
y_pred_rf = model_rf.predict(X_test)

print_eval(y_pred_rf, model_rf)

Training Accuracy:  0.9945630379122066
Testing Accuracy:  0.9953213848700785
[[27656     0]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           1.00     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      1.00      0.99     27786



#### 3.2 Random Forest with n_estimators=150, max_leaf_nodes=16

In [13]:
# training and predictions : random forest

model_rf1 = RandomForestClassifier(n_estimators=150, max_leaf_nodes=16, random_state=42)
model_rf1.fit(X_sm, y_sm)
y_pred_rf1 = model_rf1.predict(X_test)

print_eval(y_pred_rf1, model_rf1)

Training Accuracy:  0.9945630379122066
Testing Accuracy:  0.9953213848700785
[[27656     0]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           1.00     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      1.00      0.99     27786



#### 3.3 Random Forest with n_estimators=100, max_leaf_nodes=32

In [14]:
# training and predictions : random forest

model_rf2 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=32, random_state=42)
model_rf2.fit(X_sm, y_sm)
y_pred_rf2 = model_rf2.predict(X_test)

print_eval(y_pred_rf2, model_rf2)

Training Accuracy:  0.9945784619181294
Testing Accuracy:  0.9953213848700785
[[27656     0]
 [  130     0]]
Precision: 0.000
Recall: 0.000
F-Measure: 0.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.00      0.00      0.00       130

    accuracy                           1.00     27786
   macro avg       0.50      0.50      0.50     27786
weighted avg       0.99      1.00      0.99     27786



### 4. Decision Trees

#### 4.1 Decision tree with full max depth

In [15]:
# training and predictions : decision tree

model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_sm, y_sm)
y_pred_dt = model_dt.predict(X_test)

print_eval(y_pred_dt, model_dt)

Training Accuracy:  1.0
Testing Accuracy:  0.9966529907147484
[[27610    46]
 [   47    83]]
Precision: 0.643
Recall: 0.638
F-Measure: 0.641
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.64      0.64      0.64       130

    accuracy                           1.00     27786
   macro avg       0.82      0.82      0.82     27786
weighted avg       1.00      1.00      1.00     27786



#### 4.2 Decision tree with max_depth=16

In [16]:
# training and predictions : decision tree

model_dt1 = DecisionTreeClassifier(max_depth=16, random_state=42)
model_dt1.fit(X_sm, y_sm)
y_pred_dt1 = model_dt1.predict(X_test)

print_eval(y_pred_dt1, model_dt1)

Training Accuracy:  0.9993059197334732
Testing Accuracy:  0.9974447563521198
[[27633    23]
 [   48    82]]
Precision: 0.781
Recall: 0.631
F-Measure: 0.698
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.78      0.63      0.70       130

    accuracy                           1.00     27786
   macro avg       0.89      0.81      0.85     27786
weighted avg       1.00      1.00      1.00     27786



### 5. Gradient Boosting

In [17]:
model_gb = GradientBoostingClassifier(random_state=0)
model_gb.fit(X_sm, y_sm)

y_pred_gb = model_gb.predict(X_test)

print_eval(y_pred_gb, model_gb)


Training Accuracy:  0.9967069747354783
Testing Accuracy:  0.9969768948391277
[[27628    28]
 [   56    74]]
Precision: 0.725
Recall: 0.569
F-Measure: 0.638
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.73      0.57      0.64       130

    accuracy                           1.00     27786
   macro avg       0.86      0.78      0.82     27786
weighted avg       1.00      1.00      1.00     27786



### 6. ANN

In [18]:
# Scaling the features
scaler = StandardScaler()
scaler.fit(X_sm)
X_trans = scaler.transform(X_sm)
X_trans_test = scaler.transform(X_test)

# Applying model
model_mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
model_mlp.fit(X_trans, y_sm.values.ravel())

y_pred_mlp = model_mlp.predict(X_trans_test)
print_eval(y_pred_mlp, model_mlp)


Training Accuracy:  0.9815683129222321
Testing Accuracy:  0.983085006837976
[[27611    45]
 [  103    27]]
Precision: 0.375
Recall: 0.208
F-Measure: 0.267
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.38      0.21      0.27       130

    accuracy                           0.99     27786
   macro avg       0.69      0.60      0.63     27786
weighted avg       0.99      0.99      0.99     27786



### Results and Observations with handling Imbalance : Class 1(Fraud)

|                                       | Precision     |   Recall      |    f1-score    | 
|---------------------------------------|---------------|---------------|----------------|
|Logistic Regression                    |     0.00      |     0.00      |     0.00       | 
|SVM (Linear, c=1)                      |     0.07      |     0.01      |     0.01       | 
|SVM (Linear, c=50)                     |     0.00      |     0.00      |     0.00       | 
|SVM (Linear, c=0.001)                  |     0.00      |     0.00      |     0.00       | 
|SVM (Poly, c=1, degree=2)              |     0.68      |     0.10      |     0.17       |
|SVM (Poly, c=5, degree=3)              |     0.46      |     0.25      |     0.32       |
|SVM (RBF, c=.001, gamma=5)             |               |               |                |
|Random Forest (n_est=100, max_node=16) |     0.00      |     0.00      |     0.00       | 
|Random Forest (n_est=100, max_node=32) |     0.00      |     0.00      |     0.00       | 
|Random Forest (n_est=150, max_node=16) |     0.00      |     0.00      |     0.00       |
|Decision Tree (max_depth)              |     0.64      |     0.64      |     0.64       | 
|Decision Tree (max_depth=16)           |     0.78      |     0.63      |     0.70       |
|Gradient Boost                         |     0.73      |     0.57      |     0.64       |  
|ANN                                    |     0.38      |     0.21      |     0.27       |  
|Penalized SVM                          |               |               |                |  
