In [2]:
import os
from datetime import datetime
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import datasets
from sklearn import metrics
from sklearn import svm
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [3]:
# Render plot text, axis labels and tick labels in white (useful on a dark
# notebook theme). Comment this cell out if you don't want that.
COLOR = 'white'
for rc_key in ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color'):
    matplotlib.rcParams[rc_key] = COLOR

In [4]:
# Directory that holds train_transaction.csv. Set the IEEE_FRAUD_DATA_DIR
# environment variable to point at your own copy of the data; when it is not
# set, the original author's local path is used so existing setups keep working.
DATA_DIR = Path(os.environ.get('IEEE_FRAUD_DATA_DIR',
                               '/Users/nadeem/datasets/ieee-fraud-detection'))

# Load the full transaction table and preview the first rows.
df = pd.read_csv(DATA_DIR / 'train_transaction.csv')
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Build the feature matrix X and the label vector y for the SVM cells below.
# In the case of an SVM, we will try to split the cloud of 0's (isFraud == 0)
# from the cloud of 1's (isFraud == 1).

features = ['isFraud', 'TransactionDT',
            'TransactionAmt', 'ProductCD', 'P_emaildomain', 'R_emaildomain']
target = 'isFraud'

# TransactionDT holds integer time offsets (86400, 86401, ... in the preview
# above), so it is kept as a numeric feature instead of being label-encoded
# as if it were a category.
cat = ['ProductCD', 'P_emaildomain', 'R_emaildomain']
num = ['TransactionDT', 'TransactionAmt']

# Keep only the modelling columns, then drop every row that has a missing
# value in any of them.
df = df[features].dropna()
y = df[target].values

# Replace each categorical column with integer codes. The encoder is re-fitted
# for every column, so after this statement it only remembers the classes of
# the last column in `cat`. The codes are stacked into a fresh integer array
# rather than written back into `df[...].values`, which avoids an object-dtype
# matrix and does not depend on pandas handing out a writable array.
# NOTE(review): integer codes impose an arbitrary order on nominal values such
# as e-mail domains; consider one-hot encoding if that turns out to matter.
labelencoder_X = LabelEncoder()
x_cat = np.column_stack(
    [labelencoder_X.fit_transform(df[col].values) for col in cat])
x_num = df[num].values

# Column order in X: encoded categorical columns first, numeric columns last.
X = np.concatenate((x_cat, x_num), axis=1)

# 80/20 split; stratifying on y keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

In [6]:
# Feature scaling: standardise the features so they are on similar scales.
# The scaler is fitted on the training split only and then applied, with the
# same fitted parameters, to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = (sc.transform(split) for split in (X_train, X_test))

In [7]:
# Linear-kernel Support Vector Classifier.
# C trades off the width of the margin against classification error on the
# training data: large values of C give narrower margins and stricter
# classification rules. C is the hyperparameter to tune here; random_state
# only fixes the random seed.
svc_linear = SVC(kernel='linear', C=1.0, random_state=1)

In [8]:
# Train the linear SVC on the standardised training split.
# Be patient: these models take a long time to train on this dataset.
svc_linear.fit(X=X_train_std, y=y_train)

SVC(kernel='linear', random_state=1)

In [9]:
# Predicted class labels of the linear SVC for the standardised test split.
y_predict_linear = svc_linear.predict(X=X_test_std)

In [10]:
# Fraction of test-set predictions from the linear SVC that match the labels.
accuracy_linear = metrics.accuracy_score(y_true=y_test, y_pred=y_predict_linear)
print("Accuracy score %.3f" % accuracy_linear)

Accuracy score 0.916


In [11]:
from sklearn.metrics import confusion_matrix

In [13]:
# Confusion matrix of the linear SVC on the test split
# (rows: true class, columns: predicted class).
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predict_linear)
print(cm)

[[23131     0]
 [ 2115     0]]


In [None]:
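# These results are very bad, even though the accuracy looks high.
# The general structure of a confusion matrix by rows is [[TN FP]
#                                                         [FN TP]]
# All the negatives are correctly identified (TN = 23131, FP = 0), but all the positives
# are misclassified as negatives (FN = 2115, TP = 0): the model never predicts fraud.
# Our model is very biased towards the negative class; the 0.916 accuracy is simply the
# share of negatives in the test set (23131 / 25246).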

In [11]:
# RBF-kernel SVC with gamma fixed at 0.1.
# Other (non-linear) kernels take too long here: training ran for over
# 8 minutes and still had not finished.
svc_rbf = SVC(kernel='rbf', gamma=0.1, C=1.0, random_state=1)

In [12]:
# Train the RBF SVC on the standardised training split.
svc_rbf.fit(X=X_train_std, y=y_train)

In [None]:
# Predicted class labels of the RBF SVC for the standardised test split.
# NOTE: despite the "poly" in its name, this variable holds RBF-kernel
# predictions; the name is kept because the next cell reads it.
y_predict_poly = svc_rbf.predict(X=X_test_std)

In [None]:
# Fraction of test-set predictions from the RBF SVC that match the labels.
accuracy_rbf = metrics.accuracy_score(y_true=y_test, y_pred=y_predict_poly)
print("Accuracy score %.3f" % accuracy_rbf)