In [2]:
import warnings
warnings.filterwarnings("ignore")



import numpy as np
import pandas as pd
from numpy import linalg
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve, auc,\
precision_score
from sklearn import svm
import scipy.io as spio
import matplotlib.pyplot as plt

In [3]:
class SVM:

    def __init__(self,kernel='linear',learning_rate=0.001, lambda_param=0.01, n_iters=2,batch_size=256):
        self.lr = learning_rate
        self.kernel=kernel
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.batch_size=batch_size
        self.w = None
        self.b = None


    def train_and_evaluate(self, X, y):
        n_samples, n_features = X.shape
        
        y_ = np.where(y <= 0, -1, 1)
        
        self.w = np.zeros(n_features)
        self.b = 0

        for i in range(self.n_iters):
            print('iteration:',i,'/',self.n_iters,'..............')
            for idx, x_i in enumerate(X):
                condition = (y_[idx] * np.dot(x_i,self.w.T) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]


    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)
    

In [4]:
def feature_engineering(data):
    data_new = data.copy() 
    data_new["type1"] = np.nan


    data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('C'),"type1"] = "CC" 
    data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('M'),"type1"] = "CM"
    data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('C'),"type1"] = "MC"
    data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('M'),"type1"] = "MM"
    
    data_new = data_new.drop('type1',1)
    
    data_new = data_new[(data_new["type"] == "CASH_OUT") | (data_new["type"] == "TRANSFER")]
    
    data_new["errorBalanceOrg"] = data_new.newbalanceOrig + data_new.amount - data_new.oldbalanceOrg
    data_new["errorBalanceDest"] = data_new.oldbalanceDest + data_new.amount - data_new.newbalanceDest
    
    # getting rid of nameOrig and nameDest column.
    names = ["nameOrig","nameDest"]
    data_new = data_new.drop(names,1)
    
    # dropping isFlaggedFraud column from the fraud,valid, and new_data datasets
    data_new = data_new.drop("isFlaggedFraud",1)
    
    dataset1 = data_new.copy()


    # adding feature HourOfDay to Dataset1 
    dataset1["HourOfDay"] = np.nan 
    dataset1.HourOfDay = data_new.step % 24
    
    # finalizing dataset
    dataset = dataset1.copy() # unchanged dataset1
    
    # getting one-hot encoding of the 'type' variable
    dataset = pd.get_dummies(dataset,prefix=['type'])
    
    return dataset


In [5]:
print("reading dataset...")
# read data in pandas (pd) data frame
data = pd.read_csv("../input/PS_20174392719_1491204439457_log.csv")

reading dataset...


In [6]:
print("applying feature engineering...")
dataset=feature_engineering(data)
# put features & outputs in different data frames
Y = dataset.loc[:, 'isFraud']
X = dataset.drop("isFraud",1)
Y = np.where(Y == 0, -1, 1)

applying feature engineering...


In [47]:
Y

array([ 1,  1, -1, ...,  1,  1,  1])

In [8]:
print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

splitting dataset into train and test sets...


In [41]:
print("training started...")
clf = SVM(n_iters=1)
clf.train_and_evaluate(X.to_numpy(), Y)
print("training finished.")
print("weights are: {}".format(clf.w))


training started...
iteration: 0 / 1 ..............
training finished.
weights are: [-5.21137922e+02  3.42779711e+03  1.27890127e+04 -1.20737489e+04
  1.80116529e+04 -5.61695333e+02 -2.14349646e+04  2.20011454e+04
 -1.26611084e+01 -6.89355108e-01 -1.13650857e-01]


In [42]:
predictions = clf.predict(X_test)

In [43]:
CM_SVM = confusion_matrix(y_test,predictions)
CR_SVM = classification_report(y_test,predictions)
fpr, recall, thresholds = roc_curve(y_test, predictions)
AUC_SVM = auc(fpr, recall)

results = {"Confusion Matrix":CM_SVM,"Classification Report":CR_SVM,"Area Under Curve":AUC_SVM}

In [44]:
# showing results from Random Forest

for measure in results:
    print(measure,": \n",results[measure])

Confusion Matrix : 
 [[111859 440577]
 [     7   1639]]
Classification Report : 
               precision    recall  f1-score   support

          -1       1.00      0.20      0.34    552436
           1       0.00      1.00      0.01      1646

    accuracy                           0.20    554082
   macro avg       0.50      0.60      0.17    554082
weighted avg       1.00      0.20      0.34    554082

Area Under Curve : 
 0.5991152248360156


In [45]:
r=results['Confusion Matrix']

In [46]:
p=r[0][0]+r[1][1]
total=r[0][0]+r[1][1]+r[1][0]+r[0][1]
p/total

0.20483971686501276

In [None]:
clf = svm.SVC(cache_size=500,max_iter=500)
clf.fit(X_train, y_train)

In [None]:
predictions = clf.predict(X_test)
CM_SVM = confusion_matrix(y_test,predictions)
CR_SVM = classification_report(y_test,predictions)
fpr, recall, thresholds = roc_curve(y_test, predictions)
AUC_SVM = auc(fpr, recall)

results = {"Confusion Matrix":CM_SVM,"Classification Report":CR_SVM,"Area Under Curve":AUC_SVM}

In [None]:
# showing results from Random Forest

for measure in results:
    print(measure,": \n",results[measure])

In [None]:
clf = svm.SVC()
clf.fit(X_train, y_train)