# ALL IMPORT STATEMENT

In [4]:
import warnings
warnings.filterwarnings("ignore")



import numpy as np
import pandas as pd
from numpy import linalg
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import scipy.io as spio
import matplotlib.pyplot as plt


# FUNCTION TO CALCULATE CONFUSING MATRIX, ACCURACY AND FM

In [5]:
def confusionMatrix(y_actual, y_predicted):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    
    for i in range(len(y_actual)):
        if y_actual[i] > 0:
            if y_actual[i] == y_predicted[i]:
                tp = tp + 1
            else:
                fn = fn + 1
        if y_actual[i] < 1:
            if y_actual[i] == y_predicted[i]:
                tn = tn + 1
            else:
                fp = fp + 1
                
    cm = [[tn, fp], [fn, tp]]
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    sens = tp/(tp+fn)
    prec = tp/(tp+fp)
    fm = (2*prec*sens)/(prec+sens)
    return cm, accuracy, fm

# FUNCTION FOR EACH SVM KERNEL

In [6]:
def linear_kernel(x1, x2):
    return np.dot(x1, x2)
    
def polynomial_kernel(x, y, p=3):
    return (1 + np.dot(x, y)) ** p

def gaussian_kernel(x, y, sigma=5.0):
    numerator = np.linalg.norm(x-y)**2
    denominator = 2 * (sigma ** 2)
    return np.exp(-numerator / denominator)

# SVM CLASS WITH TRAIN AND PREDICT FUNCTION

In [7]:
class SVM(object):

    def __init__(self, kernel=linear_kernel, tol=1e-3, C=0.1,
                 max_passes=5, sigma=0.1):

        self.kernel = kernel
        self.tol = tol
        self.C = C
        self.max_passes = max_passes
        self.sigma = sigma
        self.model = dict()

    def __repr__(self):
        return (f"{self.__class__.__name__}("
                f"kernel={self.kernel.__name__}, "
                f"tol={self.tol}, "
                f"C={self.C}, "
                f"max_passes={self.max_passes}, "
                f"sigma={self.sigma}"
                ")")

    def svmTrain(self, X, Y):
        # Data parameters
        m = X.shape[0]

        # Map 0 to -1
        Y = np.where(Y == 0, -1, 1)

        # Variables
        alphas = np.zeros((m, 1), dtype=float)
        b = 0.0
        E = np.zeros((m, 1), dtype=float)
        passes = 0

        # Pre-compute the kernel matrix
        if self.kernel.__name__ == 'linear_kernel':
            print(f'Pre-computing {self.kernel.__name__} kernel matrix')
            K = X @ X.T

        elif self.kernel.__name__ == 'gaussian_kernel':
            print(f'Pre-computing {self.kernel.__name__} kernel matrix')
            X2 = np.sum(np.power(X, 2), axis=1).reshape(-1, 1)
            K = X2 + (X2.T - (2 * (X @ X.T)))
            K = np.power(self.kernel(1, 0, self.sigma), K)

        else:
            # Pre-compute the Kernel Matrix
            # The following can be slow due to lack of vectorization
            print(f'Pre-computing {self.kernel.__name__} kernel matrix')
            K = np.zeros((m, m))

            for i in range(m):
                for j in range(m):
                    x1 = np.transpose(X[i, :])
                    x2 = np.transpose(X[j, :])
                    K[i, j] = self.kernel(x1, x2)
                    K[i, j] = K[j, i]

        print('Training...')
        print('This may take 1 to 2 minutes')

        while passes < self.max_passes:
            num_changed_alphas = 0

            for i in range(m):

                E[i] = b + np.sum(alphas * Y * K[:, i].reshape(-1, 1)) - Y[i]

                if (Y[i] * E[i] < -self.tol and alphas[i] < self.C) or (Y[i] * E[i] > self.tol and alphas[i] > 0):
                    j = np.random.randint(0, m)
                    while j == i:
                        # make sure i is not equal to j
                        j = np.random.randint(0, m)

                    E[j] = b + np.sum(alphas * Y *
                                      K[:, j].reshape(-1, 1)) - Y[j]

                    # Save old alphas
                    alpha_i_old = alphas[i, 0]
                    alpha_j_old = alphas[j, 0]

                    # Compute L and H by (10) or (11)
                    if Y[i] == Y[j]:
                        L = max(0, alphas[j] + alphas[i] - self.C)
                        H = min(self.C, alphas[j] + alphas[i])
                    else:
                        L = max(0, alphas[j] - alphas[i])
                        H = min(self.C, self.C + alphas[j] - alphas[i])
                    if L == H:
                        # continue to next i
                        continue

                    # compute eta by (14)
                    eta = 2 * K[i, j] - K[i, i] - K[j, j]
                    if eta >= 0:
                        # continue to next i
                        continue

                    # compute and clip new value for alpha j using (12) and (15)
                    alphas[j] = alphas[j] - (Y[j] * (E[i] - E[j])) / eta

                    # Clip
                    alphas[j] = min(H, alphas[j])
                    alphas[j] = max(L, alphas[j])

                    # Check if change in alpha is significant
                    if np.abs(alphas[j] - alpha_j_old) < self.tol:
                        # continue to the next i
                        # replace anyway
                        alphas[j] = alpha_j_old
                        continue

                    # Determine value for alpha i using (16)
                    alphas[i] = alphas[i] + Y[i] * \
                        Y[j] * (alpha_j_old - alphas[j])

                    # Compute b1 and b2 using (17) and (18) respectively.
                    b1 = b - E[i] - Y[i] * (alphas[i] - alpha_i_old) * \
                        K[i, j] - Y[j] * (alphas[j] - alpha_j_old) * K[i, j]

                    b2 = b - E[j] - Y[i] * (alphas[i] - alpha_i_old) * \
                        K[i, j] - Y[j] * (alphas[j] - alpha_j_old) * K[j, j]

                    # Compute b by (19).
                    if 0 < alphas[i] < self.C:
                        b = b1
                    elif 0 < alphas[j] < self.C:
                        b = b2
                    else:
                        b = (b1 + b2) / 2
                    num_changed_alphas = num_changed_alphas + 1

            if num_changed_alphas == 0:
                passes = passes + 1
            else:
                passes = 0

            print('.', end='', flush=True)

        print('\n DONE! ')

        # Save the model
        idx = alphas > 0
        self.model['X'] = X[idx.reshape(1, -1)[0], :]
        self.model['y'] = Y[idx.reshape(1, -1)[0]]
        self.model['kernelFunction'] = self.kernel
        self.model['b'] = b
        self.model['alphas'] = alphas[idx.reshape(1, -1)[0]]
        self.model['w'] = np.transpose(np.matmul(np.transpose(alphas * Y), X))
        # return model

    def svmPredict(self, X):
        if X.shape[1] == 1:
            X = np.transpose(X)

        # Dataset
        m = X.shape[0]
        p = np.zeros((m, 1))
        pred = np.zeros((m, 1))

        if self.model['kernelFunction'].__name__ == 'linear_kernel':
            p = X.dot(self.model['w']) + self.model['b']

        elif self.model['kernelFunction'].__name__ == 'gaussian_kernel':
            # Vectorized RBF Kernel
            # This is equivalent to computing the kernel
            # on every pair of examples
            X1 = np.sum(np.power(X, 2), axis=1).reshape(-1, 1)
            X2 = np.transpose(np.sum(np.power(self.model['X'], 2), axis=1))
            K = X1 + (X2.T - (2 * (X @ (self.model['X']).T)))
            K = np.power(self.model['kernelFunction'](1, 0, self.sigma), K)
            K = np.transpose(self.model['y']) * K
            K = np.transpose(self.model['alphas']) * K
            p = np.sum(K, axis=1)

        else:
            for i in range(m):
                prediction = 0
                for j in range(self.model['X'].shape[0]):
                    prediction = prediction + self.model['alphas'][j] \
                        * self.model['y'][j] * \
                        self.model['kernelFunction'](np.transpose(
                            X[i, :]), np.transpose(self.model['X'][j, :]))

                p[i] = prediction + self.model['b']

        # Convert predictions into 0 and 1
        pred[p >= 0] = 1
        return pred

    def predict(self, X):
        if X.shape[1] == 1:
            X = np.transpose(X)

        # Dataset
        m = X.shape[0]
        p = np.zeros((m, 1))
        pred = np.zeros((m, 1))

        if self.model['kernelFunction'].__name__ == 'linear_kernel':
            p = X.dot(self.model['w']) + self.model['b']

        elif self.model['kernelFunction'].__name__ == 'gaussian_kernel':
            # Vectorized RBF Kernel
            # This is equivalent to computing the kernel
            # on every pair of examples
            X1 = np.sum(np.power(X, 2), axis=1).reshape(-1, 1)
            X2 = np.transpose(np.sum(np.power(self.model['X'], 2), axis=1))
            K = X1 + (X2.T - (2 * (X @ (self.model['X']).T)))
            K = np.power(self.model['kernelFunction'](1, 0, self.sigma), K)
            K = np.transpose(self.model['y']) * K
            K = np.transpose(self.model['alphas']) * K
            p = np.sum(K, axis=1)

        else:
            for i in range(m):
                prediction = 0
                for j in range(self.model['X'].shape[0]):
                    prediction = prediction + self.model['alphas'][j] \
                        * self.model['y'][j] * \
                        self.model['kernelFunction'](np.transpose(
                            X[i, :]), np.transpose(self.model['X'][j, :]))

                p[i] = prediction + self.model['b']

        # Convert predictions into 0 and 1
        pred[p >= 0] = 1
        return pred

In [8]:
def feature_engineering(data):
    data_new = data.copy() 
    data_new["type1"] = np.nan


    data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('C'),"type1"] = "CC" 
    data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('M'),"type1"] = "CM"
    data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('C'),"type1"] = "MC"
    data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('M'),"type1"] = "MM"
    
    data_new = data_new.drop('type1',1)
    
    data_new = data_new[(data_new["type"] == "CASH_OUT") | (data_new["type"] == "TRANSFER")]
    
    data_new["errorBalanceOrg"] = data_new.newbalanceOrig + data_new.amount - data_new.oldbalanceOrg
    data_new["errorBalanceDest"] = data_new.oldbalanceDest + data_new.amount - data_new.newbalanceDest
    
    # getting rid of nameOrig and nameDest column.
    names = ["nameOrig","nameDest"]
    data_new = data_new.drop(names,1)
    
    # dropping isFlaggedFraud column from the fraud,valid, and new_data datasets
    data_new = data_new.drop("isFlaggedFraud",1)
    
    dataset1 = data_new.copy()


    # adding feature HourOfDay to Dataset1 
    dataset1["HourOfDay"] = np.nan 
    dataset1.HourOfDay = data_new.step % 24
    
    # finalizing dataset
    dataset = dataset1.copy() # unchanged dataset1
    
    # getting one-hot encoding of the 'type' variable
    dataset = pd.get_dummies(dataset,prefix=['type'])
    
    return dataset


# TESTING MY SVM

In [9]:
from pathlib import Path

In [10]:
print("reading dataset...")
# read data in pandas (pd) data frame
data = pd.read_csv("../input/PS_20174392719_1491204439457_log.csv")

reading dataset...


In [11]:
print("applying feature engineering...")
dataset=feature_engineering(data)
# put features & outputs in different data frames
Y = dataset.loc[:, 'isFraud']
X = dataset.drop("isFraud",1)

applying feature engineering...


In [13]:
print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

splitting dataset into train and test sets...


In [14]:
X_train.shape

(2216327, 11)

In [15]:
y_train.shape

(2216327,)

In [16]:
%%time
model = SVM()
model

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 22.9 µs


SVM(kernel=linear_kernel, tol=0.001, C=0.1, max_passes=5, sigma=0.1)

In [17]:
%%time
model.svmTrain(X_train, y_train)

Pre-computing linear_kernel kernel matrix


MemoryError: Unable to allocate 35.7 TiB for an array with shape (2216327, 2216327) and data type float64

In [44]:
y_predicted = model.predict(X_train)

In [45]:
cm, acc, fm = confusionMatrix(y_train, y_predicted)

In [46]:
acc, fm

(0.99625, 0.9940921622686096)

In [47]:
cm

[[2723, 0], [15, 1262]]

In [48]:
from sklearn.metrics import confusion_matrix

In [49]:
confusion_matrix(y_train, y_predicted)

array([[2723,    0],
       [  15, 1262]], dtype=int64)

In [50]:
y_predicted = model.svmPredict(X_test)
cm, acc, fm = confusionMatrix(y_test, y_predicted)
print('Accuracy --> ', acc)
print('Confusion Matrix --> ', cm)

Accuracy -->  0.984
Confusion Matrix -->  [[688, 4], [12, 296]]


In [51]:
confusion_matrix(y_test, y_predicted)

array([[688,   4],
       [ 12, 296]], dtype=int64)