In [2]:
from __future__ import print_function
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("data/creditcard.csv")
df.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
2464,2029.0,-12.168192,-15.732974,-0.376474,3.792613,10.658654,-7.465603,-6.907038,1.573722,0.058164,...,1.660209,-0.910516,0.010468,-0.097246,-0.329918,0.225916,0.201802,-2.368534,120.0,0
230529,146371.0,2.024288,0.503922,-2.514924,1.235673,1.367503,-0.322007,0.593469,-0.182795,-0.178209,...,-0.015941,0.153533,-0.075635,0.02544,0.539439,-0.508568,-0.002293,-0.036544,8.01,0
183400,125781.0,1.961015,-0.014702,-1.914611,0.434157,0.360263,-1.033647,0.268387,-0.327631,0.32157,...,0.196284,0.541717,-0.005112,0.71322,0.081817,0.639196,-0.078138,-0.023205,66.83,0
22900,32493.0,1.289035,-0.073481,-1.224378,-0.532535,2.081643,3.232921,-0.441659,0.774688,-0.090943,...,-0.248104,-0.900271,0.032052,1.006644,0.467057,0.187175,-0.036987,0.011113,19.3,0
4534,3850.0,1.047867,-0.233352,1.347823,1.825215,-0.754861,0.922311,-0.87765,0.471386,2.593812,...,-0.541472,-0.962372,0.112892,-0.093088,0.304048,-0.558088,0.048001,0.006814,5.83,0


In [4]:
corr_values = df.corr()['Class'].drop('Class')
corr_values

Time     -0.012323
V1       -0.101347
V2        0.091289
V3       -0.192961
V4        0.133447
V5       -0.094974
V6       -0.043643
V7       -0.187257
V8        0.019875
V9       -0.097733
V10      -0.216883
V11       0.154876
V12      -0.260593
V13      -0.004570
V14      -0.302544
V15      -0.004223
V16      -0.196539
V17      -0.326481
V18      -0.111485
V19       0.034783
V20       0.020090
V21       0.040413
V22       0.000805
V23      -0.002685
V24      -0.007221
V25       0.003308
V26       0.004455
V27       0.017580
V28       0.009536
Amount    0.005632
Name: Class, dtype: float64

In [32]:
df_filter = df.filter(items=['V17', 'V14', 'V12', 'V10', 'V16', 'V3','Class'])
df_filter

Unnamed: 0,V17,V14,V12,V10,V16,V3,Class
0,0.244863,-0.324610,-0.618296,0.083386,-0.536833,1.672773,0
1,-0.135170,-0.149982,1.066089,-0.153350,0.529434,0.109797,0
2,1.306868,-0.173114,0.066137,0.190700,-3.298235,1.169468,0
3,-0.805445,-0.300360,0.178371,-0.050468,-1.209296,1.182516,0
4,-0.279081,-1.168034,0.538627,0.691625,-0.515205,1.021412,0
...,...,...,...,...,...,...,...
284802,2.344999,4.826801,2.714113,4.000715,1.264067,-6.486245,0
284803,-0.030251,-0.704305,0.916535,-0.896292,-0.812275,1.342145,0
284804,0.369114,-0.532657,0.063169,-0.445225,0.160589,-2.143205,0
284805,0.600385,0.469046,-0.963657,-0.366558,-0.694523,0.463320,0


In [6]:
def data_preprocessing_pipeline(df):
    # standardize features by removing the mean and scaling to unit variance
    df.iloc[:, 1:30] = StandardScaler().fit_transform(df.iloc[:, 1:30])
    data_matrix = df.values

    # X: feature matrix (for this analysis, we exclude the Time variable from the dataset)
    X = data_matrix[:, 1:30]

    # y: labels vector
    y = data_matrix[:, 30]

    # data normalization
    X = normalize(X, norm="l1")
    return X, y

In [None]:
# v2 works with the filtered dataframe

In [36]:
def data_preprocessing_pipeline2(df):
    # standardize features by removing the mean and scaling to unit variance
    df.iloc[:, 0:6] = StandardScaler().fit_transform(df.iloc[:, 0:6])
    data_matrix = df.values

    # X: feature matrix (for this analysis, we exclude the Time variable from the dataset)
    X = data_matrix[:, 0:6]

    # y: labels vector
    y = data_matrix[:, 6]

    # data normalization
    X = normalize(X, norm="l1")
    return X, y

In [7]:
X, y = data_preprocessing_pipeline(df)

In [37]:
X, y = data_preprocessing_pipeline2(df_filter)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# svm 1

In [9]:
# for reproducible output across multiple function calls, set random_state to a given integer value
svm = LinearSVC(class_weight='balanced', random_state=31, loss="hinge", fit_intercept=False)

svm.fit(X_train, y_train)

In [10]:
y_pred_svm = svm.decision_function(X_test) # raw score, positive class 1 otherwise class 0

In [11]:
roc_auc_svm = roc_auc_score(y_test, y_pred_svm)
print("SVM ROC-AUC score: {0:.3f}".format(roc_auc_svm))

SVM ROC-AUC score: 0.986


In [None]:
# svm 2

In [39]:
# for reproducible output across multiple function calls, set random_state to a given integer value
svm = LinearSVC(class_weight='balanced', random_state=31, loss="hinge", fit_intercept=False)

svm.fit(X_train, y_train)

In [40]:
y_pred_svm = svm.decision_function(X_test) # raw score, positive class 1 otherwise class 0

In [42]:
roc_auc_svm = roc_auc_score(y_test, y_pred_svm)
print("SVM ROC-AUC score: {0:.3f}".format(roc_auc_svm))

SVM ROC-AUC score: 0.937


In [None]:
# decision tree 1

In [13]:
w_train = compute_sample_weight('balanced', y_train)

In [14]:
# for reproducible output across multiple function calls, set random_state to a given integer value
dt = DecisionTreeClassifier(max_depth=4, random_state=35)

dt.fit(X_train, y_train, sample_weight=w_train)


In [21]:
y_pred_dt = dt.predict_proba(X_test)[:,1]

In [16]:
roc_auc_dt = roc_auc_score(y_test, y_pred_dt)
print('Decision Tree ROC-AUC score : {0:.3f}'.format(roc_auc_dt))

Decision Tree ROC-AUC score : 0.939


In [None]:
# decision tree 2

In [43]:
# for reproducible output across multiple function calls, set random_state to a given integer value
dt = DecisionTreeClassifier(max_depth=4, random_state=35)

dt.fit(X_train, y_train, sample_weight=w_train)

In [45]:
y_pred_dt = dt.predict_proba(X_test)[:,1]

In [46]:
roc_auc_dt = roc_auc_score(y_test, y_pred_dt)
print('Decision Tree ROC-AUC score : {0:.3f}'.format(roc_auc_dt))

Decision Tree ROC-AUC score : 0.952


In [None]:
# Conclusion
# support vector machines (svms) require a larger feature set to make accurate classifications 
# as its ROC curve label dropped from 0.986 to 0.937 working with subset of features 

# In contrast decision trees work better with a subset of the dataset as its ROC curve label rose from 0.939 to 0.952
# Overall support vector machines work best for this dataset given the large number of dimensions (30 features) and its
# ability to assign weights to individual class labels allowed it handle class imbalances in the dataset 
# where it was able to assign accurate weights to the minority class. 