# Baseline model

Split the dataset

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [59]:
# Load the transactions table; later cells split off its 'Class' column as the target.
df = pd.read_csv('creditcard.csv')
df.head()  # preview the first rows

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [60]:
from xverse.feature_subset import SplitXY

def splitXY(df, target_column):
    """Separate a dataset into features and target with xverse's ``SplitXY``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target_column : str
        Name of the column to split off as the target.

    Returns
    -------
    tuple
        ``(X, y)`` exactly as produced by ``SplitXY.fit_transform``.
        Presumably X keeps every column except ``target_column`` and y holds
        the target values as a NumPy array -- confirm against the xverse docs.
    """
    splitter = SplitXY([target_column])
    features, target = splitter.fit_transform(df)
    return features, target

def make_train_test_split(X=None, y=None, test_size=0.3, random_state=1234, stratify=None):
    """Split features and target into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like
        Target values aligned with ``X``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 1234
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    stratify : array-like or None, default None
        Forwarded to ``train_test_split``. Pass ``y`` to keep the class ratio
        identical in both partitions, which matters when the positive class
        is rare. ``None`` keeps the plain random split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

In [61]:
# Feature subset used for modelling (result of an earlier variable-selection step).
selected_vars = [
    'V11', 'V12', 'V3', 'V19', 'V7',
    'V14', 'V1', 'V4', 'V21', 'V2',
    'V18', 'V16', 'V17', 'V9', 'V10',
]

# Build logistic regression model

In [62]:
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [63]:
def build_model(X, y, **model_params):
    """Fit a logistic-regression classifier and return it.

    Parameters
    ----------
    X : array-like or DataFrame
        Training features.
    y : array-like
        Training labels.
    **model_params
        Optional keyword arguments forwarded to ``LogisticRegression``
        (e.g. ``max_iter=1000`` when the solver reports that it hit the
        iteration limit). With no extra arguments the estimator is built
        with scikit-learn's defaults.

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    clf = LogisticRegression(**model_params)
    clf.fit(X, y)
    return clf

In [64]:
def evaluate_model(clf, X, y):
    """Score a fitted binary classifier on one dataset.

    Parameters
    ----------
    clf : estimator
        Fitted model exposing ``predict`` and ``predict_proba``.
    X : array-like or DataFrame
        Features to score.
    y : array-like
        True labels; the positive class is expected to be labelled ``1``.

    Returns
    -------
    model_eval : pandas.DataFrame
        One row with columns ``precision``, ``recall``, ``f1-score`` (all for
        class 1), ``accuracy`` and ``ROC`` (area under the ROC curve).
    cf_matrix : numpy.ndarray
        Confusion matrix of ``y`` versus the predicted classes.
    """
    positive_label = 1

    pred = clf.predict(X)  # predicted classes
    accuracy = accuracy_score(y, pred)

    # Column 1 of predict_proba is used as the positive-class score.
    fpr, tpr, _ = roc_curve(y, clf.predict_proba(X)[:, 1])
    auc_value = auc(fpr, tpr)

    report = classification_report(y, pred, labels=[0, 1], output_dict=True)
    positive_scores = report[str(positive_label)]  # report keys are str(label)

    # Build the row directly rather than slicing the report frame: this avoids
    # assigning into a slice (SettingWithCopyWarning) and does not depend on
    # the report containing an 'accuracy' row.
    model_eval = pd.DataFrame(
        {
            'precision': [positive_scores['precision']],
            'recall': [positive_scores['recall']],
            'f1-score': [positive_scores['f1-score']],
            'accuracy': [accuracy],
            'ROC': [auc_value],
        },
        index=[positive_label],
    )
    cf_matrix = confusion_matrix(y, pred)

    return model_eval, cf_matrix

In [65]:
def _append_eval_row(history, new_row):
    """Return ``history`` with ``new_row`` added, or ``new_row`` alone if there is no usable history."""
    if not isinstance(history, pd.DataFrame) or history.empty:
        return new_row
    # pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
    return pd.concat([history, new_row])


def model_eval_data(clf, X_train, y_train,
                         X_test, y_test,
                         model_eval_train,
                         model_eval_test,
                         Name=None):
    """Evaluate ``clf`` on the train and test splits and record the results.

    Parameters
    ----------
    clf : estimator
        Fitted classifier, passed to ``evaluate_model``.
    X_train, y_train : array-like
        Training features and labels to score.
    X_test, y_test : array-like
        Test features and labels to score.
    model_eval_train, model_eval_test : pandas.DataFrame or None
        Result tables accumulated so far. ``None`` or an empty frame starts
        a new table.
    Name : str, optional
        Row label given to this model in both result tables.

    Returns
    -------
    tuple
        ``(model_eval_train, model_eval_test, cf_matrix_train, cf_matrix_test)``:
        the two result tables with this model's row added, followed by the
        train and test confusion matrices.
    """
    temp_eval_train, cf_matrix_train = evaluate_model(clf, X_train, y_train)
    temp_eval_test, cf_matrix_test = evaluate_model(clf, X_test, y_test)
    temp_eval_train.index = [Name]
    temp_eval_test.index = [Name]

    # Explicit handling instead of a catch-all except, so a failed append can
    # never silently throw away the rows collected for earlier models.
    model_eval_train = _append_eval_row(model_eval_train, temp_eval_train)
    model_eval_test = _append_eval_row(model_eval_test, temp_eval_test)

    return model_eval_train, model_eval_test, cf_matrix_train, cf_matrix_test

In [66]:
def make_confusion_matrix_chart(cf_matrix_train, cf_matrix_test):
    """Draw the train and test confusion matrices side by side as heatmaps.

    Parameters
    ----------
    cf_matrix_train : array-like
        Confusion matrix for the training split (left panel).
    cf_matrix_test : array-like
        Confusion matrix for the test split (right panel).

    Returns
    -------
    None
        The figure is drawn through pyplot on figure number 1.
    """
    class_names = ['Not Fraud', 'Fraud']

    def draw_panel(position, matrix):
        # One annotated heatmap with labelled axes in the given subplot slot.
        plt.subplot(position)
        sns.heatmap(matrix, annot=True, yticklabels=class_names,
                    xticklabels=class_names, fmt='g')
        plt.ylabel("Actual")
        plt.xlabel("Pred")
        # NOTE(review): presumably a workaround for heatmap rows being clipped
        # in some matplotlib releases -- confirm before removing.
        plt.ylim([0, 2])

    plt.figure(1, figsize=(10, 5))

    draw_panel(121, cf_matrix_train)
    plt.title('Train data')

    draw_panel(122, cf_matrix_test)
    plt.tight_layout()
    plt.title('Test data')

    plt.tight_layout()

In [67]:
# Empty result tables; model_eval_data adds one row per evaluated model.
model_eval_train = pd.DataFrame(data={}, index=[])
model_eval_test = pd.DataFrame(data={}, index=[])

Using imbalanced data

In [68]:
# Separate the features from the 'Class' target, then hold out 30% of the rows for testing.
X, y = splitXY(df, 'Class')
X_train, X_test, y_train, y_test = make_train_test_split(X=X, y=y, test_size=0.3, random_state=1)

In [69]:
X_train = X_train[selected_vars] # keep only the variables chosen by the variable selection process
X_test = X_test[selected_vars]

In [70]:
# Baseline: logistic regression fitted on the training split as-is (no resampling).
clf = build_model(X_train, y_train)

In [71]:
# Score the baseline on both splits and add its row to the running result tables.
(model_eval_train, model_eval_test,
 cf_matrix_train, cf_matrix_test) = model_eval_data(
    clf, X_train, y_train,
    X_test, y_test,
    model_eval_train, model_eval_test,
    Name='imbalanced data - LogisticRegression',
)

In [72]:
# Train-split metrics collected so far (one row per evaluated model).
model_eval_train

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.897638,0.638655,0.746318,0.999223,0.974718


In [73]:
# Test-split metrics collected so far (one row per evaluated model).
model_eval_test

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.846154,0.57037,0.681416,0.999157,0.96878


Using imbalanced data without feature selection

In [74]:
# Same split as before (same seed), but every feature column is kept: no selected_vars filter here.
X, y = splitXY(df, 'Class')
X_train, X_test, y_train, y_test = make_train_test_split(X=X, y=y, test_size=0.3, random_state=1)

In [75]:
# NOTE(review): the output of this cell shows the solver hitting its iteration limit,
# so the metrics recorded for clf1 come from a fit that did not converge. The warning
# itself suggests raising max_iter or scaling the data.
clf1 = build_model(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [76]:
# Score the all-features model and add its row to the running result tables.
(model_eval_train, model_eval_test,
 cf_matrix_train, cf_matrix_test) = model_eval_data(
    clf1, X_train, y_train,
    X_test, y_test,
    model_eval_train, model_eval_test,
    Name='imbalanced data - LogisticRegression(without FS)',
)

In [77]:
# Train-split metrics collected so far (one row per evaluated model).
model_eval_train

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.897638,0.638655,0.746318,0.999223,0.974718
imbalanced data - LogisticRegression(without FS),0.719764,0.683473,0.701149,0.998957,0.933974


In [78]:
# Test-split metrics collected so far (one row per evaluated model).
model_eval_test

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.846154,0.57037,0.681416,0.999157,0.96878
imbalanced data - LogisticRegression(without FS),0.705882,0.622222,0.661417,0.998993,0.881542


This is the baseline model using imbalanced data and simple logistic regression.

Next, I apply the first resampling technique to it: random oversampling.

In [79]:
from imblearn.over_sampling import RandomOverSampler

In [80]:
# Rebuild the seeded 70/30 split and keep only the selected variables.
X, y = splitXY(df, 'Class')
X_train, X_test, y_train, y_test = make_train_test_split(X=X, y=y, test_size=0.3, random_state=1)
X_train, X_test = X_train[selected_vars], X_test[selected_vars]

In [81]:
# Oversample the training split only. fit_resample is the current imbalanced-learn
# API (fit_sample was removed), and the seed makes the resampled set repeatable.
X_resampled, y_resampled = RandomOverSampler(random_state=1).fit_resample(X_train, y_train)

In [82]:
# Fit on the oversampled training data; the evaluation cell still scores the original X_train / X_test.
clf2 = build_model(X_resampled, y_resampled)

In [83]:
# Score the random-oversampling model on the original (non-resampled) splits.
(model_eval_train, model_eval_test,
 cf_matrix_train, cf_matrix_test) = model_eval_data(
    clf2, X_train, y_train,
    X_test, y_test,
    model_eval_train, model_eval_test,
    Name='RandomOversampling data',
)

In [84]:
# Train-split metrics collected so far (one row per evaluated model).
model_eval_train

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.897638,0.638655,0.746318,0.999223,0.974718
imbalanced data - LogisticRegression(without FS),0.719764,0.683473,0.701149,0.998957,0.933974
RandomOversampling data,0.084359,0.901961,0.154288,0.982294,0.979857


In [85]:
# Test-split metrics collected so far (one row per evaluated model).
model_eval_test

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.846154,0.57037,0.681416,0.999157,0.96878
imbalanced data - LogisticRegression(without FS),0.705882,0.622222,0.661417,0.998993,0.881542
RandomOversampling data,0.073585,0.866667,0.135652,0.98255,0.981267


# SMOTE + Logistic Regression

In [86]:
from imblearn.over_sampling import SMOTE

In [87]:
# Rebuild the seeded 70/30 split and keep only the selected variables.
X, y = splitXY(df, 'Class')
X_train, X_test, y_train, y_test = make_train_test_split(X=X, y=y, test_size=0.3, random_state=1)
X_train, X_test = X_train[selected_vars], X_test[selected_vars]

In [88]:
# Resample the training split only with a seeded SMOTE sampler; the test split is left untouched.
smote = SMOTE(random_state=5)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

In [89]:
# Fit on the SMOTE-resampled training data.
clf3 = build_model(X_sm, y_sm)

In [90]:
# Score the SMOTE model on the original (non-resampled) splits.
(model_eval_train, model_eval_test,
 cf_matrix_train, cf_matrix_test) = model_eval_data(
    clf3, X_train, y_train,
    X_test, y_test,
    model_eval_train, model_eval_test,
    Name='SMOTE - LogisticRegression',
)

In [91]:
# Train-split metrics collected so far (one row per evaluated model).
model_eval_train

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.897638,0.638655,0.746318,0.999223,0.974718
imbalanced data - LogisticRegression(without FS),0.719764,0.683473,0.701149,0.998957,0.933974
RandomOversampling data,0.084359,0.901961,0.154288,0.982294,0.979857
SMOTE - LogisticRegression,0.074142,0.907563,0.137085,0.97954,0.978213


In [92]:
# Test-split metrics collected so far (one row per evaluated model).
model_eval_test

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.846154,0.57037,0.681416,0.999157,0.96878
imbalanced data - LogisticRegression(without FS),0.705882,0.622222,0.661417,0.998993,0.881542
RandomOversampling data,0.073585,0.866667,0.135652,0.98255,0.981267
SMOTE - LogisticRegression,0.064498,0.866667,0.120062,0.979928,0.978068


Borderline-SMOTE + Logistic Regression

In [93]:
from imblearn.over_sampling import BorderlineSMOTE

In [94]:
# Rebuild the seeded 70/30 split and keep only the selected variables.
X, y = splitXY(df, 'Class')
X_train, X_test, y_train, y_test = make_train_test_split(X=X, y=y, test_size=0.3, random_state=1)
X_train, X_test = X_train[selected_vars], X_test[selected_vars]

In [95]:
# Seeded Borderline-SMOTE sampler using the "borderline-1" variant.
bsm = BorderlineSMOTE(kind="borderline-1", random_state=1)

In [96]:
# Resample the training split only; the test split is left untouched.
X_bsm, y_bsm = bsm.fit_resample(X_train, y_train)

In [97]:
# Fit on the Borderline-SMOTE-resampled training data.
clf5 = build_model(X_bsm, y_bsm)

In [98]:
# Score the Borderline-SMOTE model on the original (non-resampled) splits.
(model_eval_train, model_eval_test,
 cf_matrix_train, cf_matrix_test) = model_eval_data(
    clf5, X_train, y_train,
    X_test, y_test,
    model_eval_train, model_eval_test,
    Name='Borderline-SMOTE - LogisticRegression',
)

In [99]:
# Train-split metrics collected so far (one row per evaluated model).
model_eval_train

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.897638,0.638655,0.746318,0.999223,0.974718
imbalanced data - LogisticRegression(without FS),0.719764,0.683473,0.701149,0.998957,0.933974
RandomOversampling data,0.084359,0.901961,0.154288,0.982294,0.979857
SMOTE - LogisticRegression,0.074142,0.907563,0.137085,0.97954,0.978213
Borderline-SMOTE - LogisticRegression,0.172622,0.879552,0.288603,0.992235,0.948088


In [100]:
# Test-split metrics collected so far (one row per evaluated model).
model_eval_test

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.846154,0.57037,0.681416,0.999157,0.96878
imbalanced data - LogisticRegression(without FS),0.705882,0.622222,0.661417,0.998993,0.881542
RandomOversampling data,0.073585,0.866667,0.135652,0.98255,0.981267
SMOTE - LogisticRegression,0.064498,0.866667,0.120062,0.979928,0.978068
Borderline-SMOTE - LogisticRegression,0.147175,0.82963,0.25,0.992135,0.93341


ADASYN + Logistic Regression

In [101]:
from imblearn.over_sampling import ADASYN 

In [102]:
# Rebuild the seeded 70/30 split and keep only the selected variables.
X, y = splitXY(df, 'Class')
X_train, X_test, y_train, y_test = make_train_test_split(X=X, y=y, test_size=0.3, random_state=1)
X_train, X_test = X_train[selected_vars], X_test[selected_vars]

In [103]:
# Seeded so the synthetic samples (and the metrics below) are repeatable between runs.
ada = ADASYN(random_state=1)
# fit_resample is the current imbalanced-learn API; fit_sample has been removed.
adaX, aday = ada.fit_resample(X_train, y_train)

In [104]:
# Fit on the ADASYN-resampled training data.
clf4 = build_model(adaX, aday)

In [105]:
# Score the ADASYN model on the original (non-resampled) splits.
(model_eval_train, model_eval_test,
 cf_matrix_train, cf_matrix_test) = model_eval_data(
    clf4, X_train, y_train,
    X_test, y_test,
    model_eval_train, model_eval_test,
    Name='ADASYN LogisticRegression',
)

In [106]:
# Train-split metrics collected so far (one row per evaluated model).
model_eval_train

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.897638,0.638655,0.746318,0.999223,0.974718
imbalanced data - LogisticRegression(without FS),0.719764,0.683473,0.701149,0.998957,0.933974
RandomOversampling data,0.084359,0.901961,0.154288,0.982294,0.979857
SMOTE - LogisticRegression,0.074142,0.907563,0.137085,0.97954,0.978213
Borderline-SMOTE - LogisticRegression,0.172622,0.879552,0.288603,0.992235,0.948088
ADASYN LogisticRegression,0.016234,0.952381,0.031923,0.896566,0.980445


In [107]:
# Test-split metrics collected so far (one row per evaluated model).
model_eval_test

Unnamed: 0,precision,recall,f1-score,accuracy,ROC
imbalanced data - LogisticRegression,0.846154,0.57037,0.681416,0.999157,0.96878
imbalanced data - LogisticRegression(without FS),0.705882,0.622222,0.661417,0.998993,0.881542
RandomOversampling data,0.073585,0.866667,0.135652,0.98255,0.981267
SMOTE - LogisticRegression,0.064498,0.866667,0.120062,0.979928,0.978068
Borderline-SMOTE - LogisticRegression,0.147175,0.82963,0.25,0.992135,0.93341
ADASYN LogisticRegression,0.014506,0.962963,0.028581,0.896574,0.982764
