In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from random import seed,sample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import classification_report,recall_score,confusion_matrix,accuracy_score, roc_curve, auc,\
precision_score


In [3]:


# >> FEATURE SELECTION << #
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped




In [4]:
def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped




In [5]:
##############################



def compute_cost(W, X, Y):
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost




In [22]:

def calculate_cost_gradient(W, X_batch, Y_batch):
    d =1 - (Y_batch * np.dot(X_batch, W))
    #print(distance)
    dw = np.zeros(len(W))


    if max(0, d) == 0:
        di = W
    else:
        di = W - (regularization_strength * Y_batch * X_batch)
    dw += di

    #dw = dw/len(Y_batch)  # average
    return dw




In [7]:
def sgd(features, outputs):
    max_epochs = 2
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights




In [8]:
def feature_engineering(data):
    data_new = data.copy() 
    data_new["type1"] = np.nan


    data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('C'),"type1"] = "CC" 
    data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('M'),"type1"] = "CM"
    data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('C'),"type1"] = "MC"
    data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('M'),"type1"] = "MM"
    
    data_new = data_new.drop('type1',1)
    
    data_new = data_new[(data_new["type"] == "CASH_OUT") | (data_new["type"] == "TRANSFER")]
    
    data_new["errorBalanceOrg"] = data_new.newbalanceOrig + data_new.amount - data_new.oldbalanceOrg
    data_new["errorBalanceDest"] = data_new.oldbalanceDest + data_new.amount - data_new.newbalanceDest
    
    # getting rid of nameOrig and nameDest column.
    names = ["nameOrig","nameDest"]
    data_new = data_new.drop(names,1)
    
    # dropping isFlaggedFraud column from the fraud,valid, and new_data datasets
    data_new = data_new.drop("isFlaggedFraud",1)
    
    dataset1 = data_new.copy()


    # adding feature HourOfDay to Dataset1 
    dataset1["HourOfDay"] = np.nan 
    dataset1.HourOfDay = data_new.step % 24
    
    # finalizing dataset
    dataset = dataset1.copy() # unchanged dataset1
    
    # getting one-hot encoding of the 'type' variable
    dataset = pd.get_dummies(dataset,prefix=['type'])
    
    return dataset


In [9]:
print("reading dataset...")
# read data in pandas (pd) data frame
data = pd.read_csv("../input/PS_20174392719_1491204439457_log.csv")

reading dataset...


In [10]:
print("applying feature engineering...")
dataset=feature_engineering(data)
# put features & outputs in different data frames
Y = dataset.loc[:, 'isFraud']
X = dataset.drop("isFraud",1)

applying feature engineering...


In [14]:
regularization_strength = 1000
learning_rate = 0.01

In [15]:

# split data into train and test set
print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


splitting dataset into train and test sets...


In [16]:
# train the model
print("training started...")
W = sgd(X_train.to_numpy(), y_train.to_numpy())
print("training finished.")
print("weights are: {}".format(W))


training started...
Epoch is: 1 and Cost is: 24100.56665062967
training finished.
weights are: [1.23893955e-002 3.14806445e+000 3.14806445e+000 2.42092166e-322
 2.24607965e-011 2.61619985e-011 6.40754477e-060 3.14806445e+000
 1.69717747e-004 4.74337388e-017 1.69717747e-005]


In [21]:
# testing the model
print("testing the model...")

y_test_predicted = np.array([])
print(X_test.shape[0])
    
yp = np.array(np.sign(X_test.to_numpy()@W.T),dtype=int)

y_test_predicted = np.append(y_test_predicted, yp)

print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)*100))
    

testing the model...
554082
accuracy on test dataset: 0.2965265069069199


In [19]:
yp

array([ 1,  1,  1, ...,  1,  1, -1])