In [19]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

#### Loading Preprocessed data

In [20]:
X_train = pd.read_csv("processed_train.csv")
X_test = pd.read_csv("processed_test.csv")
y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

In [21]:
# Dropping additional column

y_train = y_train.drop('Unnamed: 0', axis=1)
y_test = y_test.drop('Unnamed: 0', axis=1)
X_train = X_train.drop('Unnamed: 0', axis=1)
X_test = X_test.drop('Unnamed: 0', axis=1)


#### Model Evaluation Function

In [22]:
# Model Evaluation

def print_eval(y_pred, model):
    print("Training Accuracy: ", model.score(X_train, y_train))
    print("Testing Accuracy: ", model.score(X_test, y_test))
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    precision = precision_score(y_test, y_pred)
    print('Precision: %.3f' % precision)
    
    recall = recall_score(y_test, y_pred)
    print('Recall: %.3f' % recall)
    
    score = f1_score(y_test, y_pred)
    print('F-Measure: %.3f' % score)
    
    print(classification_report(y_test, y_pred))

### Over Sampling data using SMOTE

In [23]:
# smote implementation

smote = SMOTE(sampling_strategy = 'minority')
X_sm, y_sm = smote.fit_sample(X_train, y_train)
y_sm.value_counts()

is_fraud
1           128963
0           128963
dtype: int64

### Model Implementation:

>1	Logistic Regression   
2	SVM    
3	XGBoost    
4	Random Forest   
5	Artificial Neural Network   
6   Penalized-SVM 

### 1. Logistic Regression

In [6]:
# training and predictions : Logistic Regression

model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_sm, y_sm)
y_pred_lr = model_lr.predict(X_test)

print_eval(y_pred_lr, model_lr)

Training Accuracy:  0.949332140543542
Testing Accuracy:  0.9506945944000575
[[26316  1340]
 [   30   100]]
Precision: 0.069
Recall: 0.769
F-Measure: 0.127
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     27656
           1       0.07      0.77      0.13       130

    accuracy                           0.95     27786
   macro avg       0.53      0.86      0.55     27786
weighted avg       0.99      0.95      0.97     27786



### 2. SVM

#### 2.1 SVM with linear kernel and c=1

In [7]:
# training and predictions : SVM

model_svm = Pipeline([
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42)),
    ])

model_svm.fit(X_sm, y_sm)

y_pred_svm = model_svm.predict(X_test)
print_eval(y_pred_svm, model_svm)

Training Accuracy:  0.9900360921738594
Testing Accuracy:  0.9904268336572375
[[27502   154]
 [  112    18]]
Precision: 0.105
Recall: 0.138
F-Measure: 0.119
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     27656
           1       0.10      0.14      0.12       130

    accuracy                           0.99     27786
   macro avg       0.55      0.57      0.56     27786
weighted avg       0.99      0.99      0.99     27786



#### 2.2 SVM with linear kernel and c=50

In [8]:
model_svm1 = Pipeline([
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=50, loss="hinge", random_state=42)),
    ])

model_svm1.fit(X_sm, y_sm)

y_pred_svm1 = model_svm1.predict(X_test)

print_eval(y_pred_svm1, model_svm1)

Training Accuracy:  0.9784063917080544
Testing Accuracy:  0.9815374649103865
[[27254   402]
 [  111    19]]
Precision: 0.045
Recall: 0.146
F-Measure: 0.069
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27656
           1       0.05      0.15      0.07       130

    accuracy                           0.98     27786
   macro avg       0.52      0.57      0.53     27786
weighted avg       0.99      0.98      0.99     27786



#### 2.3 SVM with linear kernel, c=0.01

In [9]:
model_svm2 = Pipeline([
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=0.001, loss="hinge", random_state=42)),
    ])

model_svm2.fit(X_sm, y_sm)

y_pred_svm2 = model_svm2.predict(X_test)

print_eval(y_pred_svm2, model_svm2)

Training Accuracy:  0.9899281241323996
Testing Accuracy:  0.9916144821132945
[[27540   116]
 [  117    13]]
Precision: 0.101
Recall: 0.100
F-Measure: 0.100
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.10      0.10      0.10       130

    accuracy                           0.99     27786
   macro avg       0.55      0.55      0.55     27786
weighted avg       0.99      0.99      0.99     27786



#### 2.4 SVM with polynomial kernel, degree=2,  c=1 and coef=1

In [10]:
model_svm_poly = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=2, coef0=1, C=1))
    ])

model_svm_poly.fit(X_sm, y_sm)
y_pred_svm_poly = model_svm_poly.predict(X_test)

print_eval(y_pred_svm_poly, model_svm_poly)

Training Accuracy:  0.9959203504334145
Testing Accuracy:  0.9937378535953357
[[27611    45]
 [  129     1]]
Precision: 0.022
Recall: 0.008
F-Measure: 0.011
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.02      0.01      0.01       130

    accuracy                           0.99     27786
   macro avg       0.51      0.50      0.50     27786
weighted avg       0.99      0.99      0.99     27786



#### 2.5 SVM with polynomial kernel, degree=3, c=5, coef=1

In [None]:
model_svm_poly1 = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=5))
    ])

model_svm_poly1.fit(X_sm, y_sm)
y_pred_svm_poly1 = model_svm_poly1.predict(X_test)

print_eval(y_pred_svm_poly1, model_svm_poly1)

#### 2.6 SVM with RBF kernel, c=0.001 and gamma=5 (running for hours with no result)

In [None]:
model_svm_rbf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001))
    ])

model_svm_rbf.fit(X_sm, y_sm)
y_pred_svm_rbf = model_svm_rbf.predict(X_test)

print_eval(y_pred_svm_rbf, model_svm_rbf)

### 3. Random Forest

#### 3.1 Random Forest with n_estimators=100

In [14]:
# training and predictions : random forest

model_rf = RandomForestClassifier(n_estimators=150, random_state=42, max_leaf_nodes=16)
model_rf.fit(X_sm, y_sm)
y_pred_rf = model_rf.predict(X_test)

print_eval(y_pred_rf, model_rf)

Training Accuracy:  0.9821235771354536
Testing Accuracy:  0.9884834089109623
[[27386   270]
 [   50    80]]
Precision: 0.229
Recall: 0.615
F-Measure: 0.333
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27656
           1       0.23      0.62      0.33       130

    accuracy                           0.99     27786
   macro avg       0.61      0.80      0.66     27786
weighted avg       0.99      0.99      0.99     27786



In [24]:
model_rf = RandomForestClassifier(n_estimators=150, random_state=42)
model_rf.fit(X_sm, y_sm)
y_pred_rf = model_rf.predict(X_test)

print_eval(y_pred_rf, model_rf)

Training Accuracy:  1.0
Testing Accuracy:  0.9956092996473044
[[27652     4]
 [  118    12]]
Precision: 0.750
Recall: 0.092
F-Measure: 0.164
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.75      0.09      0.16       130

    accuracy                           1.00     27786
   macro avg       0.87      0.55      0.58     27786
weighted avg       0.99      1.00      0.99     27786



#### 3.2 Random Forest with n_estimators=150, max_leaf_nodes=16

In [15]:
# training and predictions : random forest

model_rf1 = RandomForestClassifier(n_estimators=150, max_leaf_nodes=16, random_state=42)
model_rf1.fit(X_sm, y_sm)
y_pred_rf1 = model_rf1.predict(X_test)

print_eval(y_pred_rf1, model_rf1)

Training Accuracy:  0.9818305210229201
Testing Accuracy:  0.988267472828043
[[27381   275]
 [   51    79]]
Precision: 0.223
Recall: 0.608
F-Measure: 0.326
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27656
           1       0.22      0.61      0.33       130

    accuracy                           0.99     27786
   macro avg       0.61      0.80      0.66     27786
weighted avg       0.99      0.99      0.99     27786



#### 3.3 Random Forest with n_estimators=100, max_leaf_nodes=32

In [16]:
# training and predictions : random forest

model_rf2 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=32, random_state=42)
model_rf2.fit(X_sm, y_sm)
y_pred_rf2 = model_rf2.predict(X_test)

print_eval(y_pred_rf2, model_rf2)

Training Accuracy:  0.9866659468797236
Testing Accuracy:  0.9903908443100842
[[27443   213]
 [   54    76]]
Precision: 0.263
Recall: 0.585
F-Measure: 0.363
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     27656
           1       0.26      0.58      0.36       130

    accuracy                           0.99     27786
   macro avg       0.63      0.79      0.68     27786
weighted avg       0.99      0.99      0.99     27786



### 4. Decision Trees

#### 4.1 Decision tree with full max depth

In [21]:
# training and predictions : decision tree

model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_sm, y_sm)
y_pred_dt = model_dt.predict(X_test)

print_eval(y_pred_dt, model_dt)

Training Accuracy:  1.0
Testing Accuracy:  0.9925502051392787
[[27530   126]
 [   81    49]]
Precision: 0.280
Recall: 0.377
F-Measure: 0.321
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.28      0.38      0.32       130

    accuracy                           0.99     27786
   macro avg       0.64      0.69      0.66     27786
weighted avg       0.99      0.99      0.99     27786



#### 4.2 Decision tree with max_depth=16

In [22]:
# training and predictions : decision tree

model_dt1 = DecisionTreeClassifier(max_depth=16, random_state=42)
model_dt1.fit(X_sm, y_sm)
y_pred_dt1 = model_dt1.predict(X_test)

print_eval(y_pred_dt1, model_dt1)

Training Accuracy:  0.980396088472098
Testing Accuracy:  0.9789102425681998
[[27101   555]
 [   31    99]]
Precision: 0.151
Recall: 0.762
F-Measure: 0.253
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     27656
           1       0.15      0.76      0.25       130

    accuracy                           0.98     27786
   macro avg       0.58      0.87      0.62     27786
weighted avg       0.99      0.98      0.99     27786



### 5. Gradient Boosting

In [24]:
model_gb = GradientBoostingClassifier(random_state=0)
model_gb.fit(X_sm, y_sm)

y_pred_gb = model_gb.predict(X_test)

print_eval(y_pred_gb, model_gb)


Training Accuracy:  0.983658265724774
Testing Accuracy:  0.9885553876052688
[[27376   280]
 [   38    92]]
Precision: 0.247
Recall: 0.708
F-Measure: 0.367
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27656
           1       0.25      0.71      0.37       130

    accuracy                           0.99     27786
   macro avg       0.62      0.85      0.68     27786
weighted avg       1.00      0.99      0.99     27786



### 6. ANN

In [19]:
# Scaling the features
scaler = StandardScaler()
scaler.fit(X_sm)
X_trans = scaler.transform(X_sm)
X_trans_test = scaler.transform(X_test)

# Applying model
model_mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
model_mlp.fit(X_trans, y_sm.values.ravel())

y_pred_mlp = model_mlp.predict(X_trans_test)
print_eval(y_pred_mlp, model_mlp)


Training Accuracy:  0.022696424715427092
Testing Accuracy:  0.022025480457784494
[[27609    47]
 [  124     6]]
Precision: 0.113
Recall: 0.046
F-Measure: 0.066
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27656
           1       0.11      0.05      0.07       130

    accuracy                           0.99     27786
   macro avg       0.55      0.52      0.53     27786
weighted avg       0.99      0.99      0.99     27786



In [None]:
# Scaling the features
scaler = StandardScaler()
scaler.fit(X_sm)
X_trans = scaler.transform(X_sm)
X_trans_test = scaler.transform(X_test)

# Applying model
model_mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
model_mlp.fit(X_trans, y_sm.values.ravel())

y_pred_mlp = model_mlp.predict(X_trans_test)
print_eval(y_pred_mlp, model_mlp)


### Results and Observations :

|                                       | Precision     |   Recall      |    f1-score    | 
|---------------------------------------|---------------|---------------|----------------|
|Logistic Regression                    |     0.07      |     0.77      |     0.13       | 
|SVM (Linear, c=1)                      |     0.10      |     0.14      |     0.12       | 
|SVM (Linear, c=50)                     |     0.05      |     0.15      |     0.07       | 
|SVM (Linear, c=0.001)                  |     0.10      |     0.10      |     0.10       | 
|SVM (Poly, c=1, degree=2)              |     0.02      |     0.01      |     0.01       |
|SVM (Poly, c=5, degree=3)              |     0.159     |     0.085     |     0.111      |
|SVM (RBF, c=.001, gamma=5)             |               |               |                |
|Random Forest (n_est=100, max_node=16) |     0.23      |     0.62      |     0.33       | 
|Random Forest (n_est=100, max_node=32) |     0.26      |     0.58      |     0.36       | 
|Random Forest (n_est=150, max_node=16) |     0.22      |     0.61      |     0.33       |
|Decision Tree (max_depth)              |     0.28      |     0.38      |     0.32       | 
|Decision Tree (max_depth=16)           |     0.15      |     0.76      |     0.25       |
|Gradient Boost                         |     0.25      |     0.71      |     0.37       |  
|ANN                                    |     0.11      |     0.05      |     0.07       |  
|Penalized SVM                          |     0.96      |     0.78      |     0.98       |  
