In [9]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings('ignore')

#### Loading Preprocessed data

In [2]:
# Read the four preprocessed CSV splits (features and labels for train/test)
# from the current working directory.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "processed_train.csv",
        "processed_test.csv",
        "y_train.csv",
        "y_test.csv",
    )
)

In [3]:
# Drop the extra 'Unnamed: 0' column from every split.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.

y_train = y_train.drop(columns='Unnamed: 0', errors='ignore')
y_test = y_test.drop(columns='Unnamed: 0', errors='ignore')
X_train = X_train.drop(columns='Unnamed: 0', errors='ignore')
X_test = X_test.drop(columns='Unnamed: 0', errors='ignore')


#### Model Evaluation Function

In [4]:
# Model Evaluation

def print_eval(y_pred, model):
    """Print accuracy, confusion matrix, precision, recall and F-measure.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; they are not passed in.

    Parameters
    ----------
    y_pred : predictions for ``X_test``; compared against ``y_test``.
    model : fitted estimator; its ``score`` method is called on both splits.

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Accuracy on each split, as reported by the estimator's own score().
    for heading, features, target in (
        ("Training Accuracy: ", X_train, y_train),
        ("Testing Accuracy: ", X_test, y_test),
    ):
        print(heading, model.score(features, target))

    print(confusion_matrix(y_test, y_pred))

    # Test-set metrics, each printed to three decimals.
    for template, metric_fn in (
        ('Precision: %.3f', precision_score),
        ('Recall: %.3f', recall_score),
        ('F-Measure: %.3f', f1_score),
    ):
        print(template % metric_fn(y_test, y_pred))

### Over Sampling data using SMOTE

In [5]:
# SMOTE oversampling of the training split only (the test split is left untouched).
# - fit_resample replaces fit_sample, the deprecated alias that newer
#   imbalanced-learn releases no longer provide.
# - random_state pins the synthetic samples so a re-run reproduces the same X_sm / y_sm.

smote = SMOTE(sampling_strategy='minority', random_state=42)
X_sm, y_sm = smote.fit_resample(X_train, y_train)
y_sm.value_counts()

is_fraud
1           128963
0           128963
dtype: int64

### Model Implementation:

1. Logistic Regression
2. SVM
3. XGBoost
4. Random Forest
5. Artificial Neural Network
6. Penalized SVM

### 1. Logistic Regression

In [6]:
# Logistic Regression: fit on the SMOTE-resampled training data,
# then predict on the test features and report the metrics.

model_lr = LogisticRegression(random_state=42).fit(X_sm, y_sm)
y_pred_lr = model_lr.predict(X_test)

print_eval(y_pred_lr, model_lr)

Training Accuracy:  0.9495018046086929
Testing Accuracy:  0.9508385517886706
[[26320  1336]
 [   30   100]]
Precision: 0.070
Recall: 0.769
F-Measure: 0.128


### 2. SVM

In [10]:
# Linear SVM baseline: standardise the features, then a hinge-loss LinearSVC with C=1.

model_svm = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(loss="hinge", C=1, random_state=42)),
    ]
)

In [11]:
# Fit the model_svm pipeline on the resampled training data (X_sm, y_sm).
model_svm.fit(X_sm, y_sm)

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge', random_state=42))])

In [12]:
# Predict labels for the test features with the fitted model_svm pipeline.
y_pred_svm = model_svm.predict(X_test)

In [13]:
# Print accuracy, confusion matrix, precision, recall and F-measure for model_svm.
print_eval(y_pred_svm, model_svm)

Training Accuracy:  0.9902751642656631
Testing Accuracy:  0.9907507377816166
[[27511   145]
 [  112    18]]
Precision: 0.110
Recall: 0.138
F-Measure: 0.123


#### SVM tuning 1

In [15]:
# Same linear SVM pipeline as before, with the regularisation parameter raised to C=100.
model_svm1 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(loss="hinge", C=100, random_state=42)),
    ]
)


In [16]:
# Fit the C=100 linear SVM pipeline on the resampled training data (X_sm, y_sm).
model_svm1.fit(X_sm, y_sm)

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc',
                 LinearSVC(C=100, loss='hinge', random_state=42))])

In [17]:
# Predict labels for the test features with the fitted C=100 linear SVM pipeline.
y_pred_svm1 = model_svm1.predict(X_test)

# Print accuracy, confusion matrix, precision, recall and F-measure for model_svm1.
print_eval(y_pred_svm1, model_svm1)

Training Accuracy:  0.9763704229262424
Testing Accuracy:  0.9778665515007557
[[27154   502]
 [  113    17]]
Precision: 0.033
Recall: 0.131
F-Measure: 0.052


#### SVM tuning 2

In [19]:
# Polynomial-kernel SVM (degree 3, coef0=1, C=5) on standardised features,
# fitted on the resampled training data in the same expression.
model_svm_poly = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(C=5, kernel="poly", degree=3, coef0=1)),
    ]
).fit(X_sm, y_sm)

y_pred_svm_poly = model_svm_poly.predict(X_test)

print_eval(y_pred_svm_poly, model_svm_poly)

Training Accuracy:  0.9990359996298238
Testing Accuracy:  0.9936298855538761
[[27598    58]
 [  119    11]]
Precision: 0.159
Recall: 0.085
F-Measure: 0.111


#### SVM Tuning 3

In [None]:
# RBF-kernel SVM (gamma=5, C=0.001) on standardised features,
# fitted on the resampled training data in the same expression.
model_svm_rbf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(C=0.001, kernel="rbf", gamma=5)),
    ]
).fit(X_sm, y_sm)

y_pred_svm_rbf = model_svm_rbf.predict(X_test)

print_eval(y_pred_svm_rbf, model_svm_rbf)

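### Results and Observations

**Caveats — verify before relying on this table:**
- The SVM (RBF) cell above has no recorded output (`In [None]`), and its row repeats the SVM (Linear, c=100) figures exactly.
- The rows from Random Forest Classifier downward are not produced by any cell shown in this notebook.
- In those rows the f1-score is not the harmonic mean of the listed precision and recall (e.g. precision 1.00 with recall 0.00 gives F1 = 0, not 1.00). They appear to use a different averaging and are not directly comparable to the rows above.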

|                                       | Precision     |   Recall      |    f1-score    | 
|---------------------------------------|---------------|---------------|----------------|
|Logistic Regression                    |     0.070     |     0.769     |     0.128      | 
|SVM (Linear, c=1)                      |     0.110     |     0.138     |     0.123      | 
|SVM (Linear, c=100)                    |     0.033     |     0.131     |     0.052      | 
|SVM (Poly, c=5, degree=3)              |     0.159     |     0.085     |     0.111      |
|SVM (RBF, c=.001, gamma=5)             |     0.033     |     0.131     |     0.052      |
|Random Forest Classifier               |     1.00      |     0.00      |     1.00       |  
|Decision Tree                          |     0.96      |     0.77      |     0.98       | 
|XG Boost                               |     0.93      |     0.95      |     0.96       |  
|ANN                                    |     0.96      |     0.78      |     0.98       |  
|Penalized SVM                          |     0.96      |     0.78      |     0.98       |  


In [None]:

# NOTE(review): `Xs`, `ys` and `svm_clf` are not defined in any visible cell of this
# notebook, so this cell looks like it would fail on a fresh kernel — TODO confirm
# where they come from, or remove the cell.
# Standardise `Xs` with a fresh StandardScaler, then fit `svm_clf` on the scaled data and `ys`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(Xs)
svm_clf.fit(X_scaled, ys)