In [132]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
%matplotlib inline
plt.style.use('seaborn-whitegrid')

## Read in the Data

In [2]:
data = pd.read_csv('creditcard.csv')

In [3]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Train-test split : seperate the 'fraud' amount from the 'normal' amount

By seperating the amount by the class label, we will make models for each type of amount given the class label.''

In [4]:
data_reg = data.copy()
data_reg.drop(['Time'], axis=1, inplace=True)
data_reg['log_amount'] = np.log(list(data_reg['Amount'] + 1))
data_reg.drop('Amount', axis=1, inplace=True)

normal_data = data_reg[data['Class'] == 0]
normal_data.drop(['Class'], axis=1, inplace=True)
fraud_data = data_reg[data['Class'] == 1]
fraud_data.drop(['Class'], axis=1, inplace=True)
# fraud_data

normal_y = normal_data['log_amount']
fraud_y = fraud_data['log_amount']

normal_X = normal_data.drop('log_amount', axis=1)
fraud_X = fraud_data.drop('log_amount', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [5]:
np.random.seed(182)
msk = np.random.rand(len(normal_data)) < 0.75
normal_train = normal_data[msk]
normal_test = normal_data[~msk]
# normal_train.shape, normal_test.shape

normal_X_train = normal_train.drop('log_amount', axis=1)
normal_y_train = normal_train['log_amount']

normal_X_test = normal_test.drop('log_amount', axis=1)
normal_y_test = normal_test['log_amount']

In [6]:
msk = np.random.rand(len(fraud_data)) < 0.75
fraud_train = fraud_data[msk]
fraud_test = fraud_data[~msk]

fraud_X_train = fraud_train.drop('log_amount', axis=1)
fraud_y_train = fraud_train['log_amount']

fraud_X_test = fraud_test.drop('log_amount', axis=1)
fraud_y_test = fraud_test['log_amount']

## Self-implemented regression models

### Linear Regression

In [7]:
class LinearRegression:
    """ The self-implemented LinearRegression class that can do multivarible linear regression. The three functions
        we choose to implement is fit, predict, and score and these three methods will make the LinearRegression
        function well. X and y have to be np.arrays.
        
        Methods
        -------
        fit(self, X, y): fit the train set given the response y
        predict(self, X): give the prediction for X
        score(self, X, y): give the r^2 given the input of X and the correct output as a measure of the performance
        
    """
    def __init__(self):
        self.betas = []
        
    def fit(self, X, y):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        
        # the formula: beta = (X^T * X)^-1 * X^T * y
        self.betas = np.linalg.pinv(X.T @ X) @ X.T @ y
    
    def predict(self, X):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        y_hat = np.dot(X, self.betas)
        return y_hat
    
    def score(self, X, y):
        y_hat = self.predict(X)
        
        if len(y_hat) != len(y):
            raise ValueError('Unequal length between the predictions and the true value.')
            
        y_mean = np.mean(y)
        SSE = 0
        SST = 0
        for i in range(len(y_hat)):
            SST += (y[i] - y_mean)**2
            SSE += (y[i] - y_hat[i])**2
        return 1 - SSE / SST
    
    def __str__(self):
        return 'Linear Regresion'

### Ridge Regression

In [41]:
class Ridge:
    """ The self-implemented Ridge class that can do multivarible Ridge regression with penalization lambda. The 
        three functions. We choose to implement is fit, predict, and score and these three methods will make the 
        LinearRegression function well. X and y have to be np.arrays.
        
        Methods
        -------
        fit(self, X, y): fit the train set given the response y
        predict(self, X): give the prediction for X
        score(self, X, y): give the r^2 given the input of X and the correct output as a measure of the performance
        
    """
    
    def __init__(self, alpha):
        self.betas = []
        self.alpha = alpha
        
    def fit(self, X, y):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        
        # construct an identity gamma
        n = X.shape[1]
        gamma = self.alpha * np.identity(n)
        
        self.betas = np.dot(np.linalg.pinv(X.T @ X + gamma.T @ gamma),X.T @ y)
    
    def predict(self, X):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        y_hat = np.dot(X, self.betas)
        return y_hat
    
    def score(self, X, y):
        y_hat = self.predict(X)
        
        if len(y_hat) != len(y):
            raise ValueError('Unequal length between the predictions and the true value.')
            
        y_mean = np.mean(y)
        SSE = 0
        SST = 0
        for i in range(len(y_hat)):
            SST += (y[i] - y_mean)**2
            SSE += (y[i] - y_hat[i])**2
        return 1 - SSE / SST
    
    def __str__(self):
        return 'Ridge'

In [134]:
class KNNRegressor:
    """ The Self-implemented KNN model take in the train set and use that as the database. To predict, the model
        will calculate K nearest neighbor based on the distance metric (for instance, Euclidean distance) and
        average the output to give the prediction.
        
        Methods
        -------
        fit: fit the model by storing the entire train
        predict: based on how many neighbors to use the model yeild the prediction
        score(self, X, y): give the r^2 given the input of X and the correct output as a measure of the performance 
    """
    
    
    def __init__(self, K):
        self.K = K
        self.X = None
        
    def fit(self, X, y):
        self.X = X
        self.y = y
        
    def predict(self, X_predict):
        # final prediction
        y_hat = []
        # Calculate the distance for each input observation
        for index, x_predict in enumerate(X_predict):
            distance_index = {}
            for index, x_train in enumerate(self.X):
                distance_index[self._distance(x_predict, x_train)] = index
            distance_values = sorted(distance_index.keys())
            y_hat_predictions = []
            for i in range(self.K):
                y_hat_predictions.append(self.y[distance_index[distance_values[i]]])
            y_hat.append(np.mean(y_hat_predictions))
        return y_hat
    
    def score(self, X, y):
        y_hat = self.predict(X)
        
        if len(y_hat) != len(y):
            raise ValueError('Unequal length between the predictions and the true value.')
            
        y_mean = np.mean(y)
        SSE = 0
        SST = 0
        for i in range(len(y_hat)):
            SST += (y[i] - y_mean)**2
            SSE += (y[i] - y_hat[i])**2
        return 1 - SSE / SST 
    
    def _distance(self, x1, x2):
        if len(x1) != len(x2):
            raise ValueError('Vectors are of different length')
        sum = 0
        for i in range(len(x1)):
            sum += (x1[i] - x2[i])**2
        return np.sqrt(sum)

## Fitting the models
---
### linear models

In [9]:
linear_regression = LinearRegression()
linear_regression.fit(normal_X_train.values, normal_y_train.values)
normal_train_score = linear_regression.score(normal_X_train.values, normal_y_train.values)
normal_test_score = linear_regression.score(normal_X_test.values, normal_y_test.values)
print("Normal Transaction linear regression train score:", normal_train_score)
print("Normal Transaction linear regression test score:", normal_test_score)

Normal Transaction linear regression train score: 0.401224393484
Normal Transaction linear regression test score: 0.356168560254


linear_regression = LinearRegression()
linear_regression.fit(fraud_X_train.values, fraud_y_train.values)
fraud_train_score = linear_regression.score(fraud_X_train.values, fraud_y_train.values)
fraud_test_score = linear_regression.score(fraud_X_test.values, fraud_y_test.values)
print("Fraud Transaction linear regression train score:", fraud_train_score)
print("Fraud Transaction linear regression test score:", fraud_test_score)

### Ridge Regression with Cross Validation tuning the penalizing $\lambda$

In [17]:
# Tunable parameters of penalization factor lambdas
lambdas = [.001,.005,0.01,5,10,50,100,500,1000]

#### Cross validation for predicting the normal amount

In [61]:
# Cross validation for predicting the normal amount
# split the train into 10 parts to do the cross validation
kf = KFold(n_splits = 10)
# store the validation score for each lambda
validation_scores = []
# cross validation
for alpha in lambdas:
    cur_scores = []
    for train_index, test_index in kf.split(normal_X_train):
        X_train_10, X_val_10 = normal_X_train.iloc[train_index].values, normal_X_train.iloc[test_index].values
        y_train_10, y_val_10 = normal_y_train.iloc[train_index].values, normal_y_train.iloc[test_index].values
        ridge = Ridge(alpha)
        ridge.fit(X_train_10, y_train_10)
        cur_scores.append(ridge.score(X_train_10, y_train_10))
    validation_scores.append(np.mean(cur_scores))
print('The best penalizing parameter lambda is {}'.format(lambdas[np.argmax(validation_scores)]))

The best penalizing parameter lambda is 0.001


In [65]:
# Fit the Ridge regression with the lambda giving the best cross validation scores
ridge_regression = Ridge(0.001)
ridge_regression.fit(normal_X_train.values, normal_y_train.values)
print("Normal Transaction Ridge regression train Score:", ridge_regression.score(normal_X_train.values, normal_y_train.values))
print("Normal Transaction Ridge regression test Score:", ridge_regression.score(normal_X_test.values, normal_y_test.values))

Normal Transaction Ridge regression train Score: 0.401224393484
Normal Transaction Ridge regression test Score: 0.356168560254


#### Cross validation for predicting the fraud amount

In [66]:
# Cross validation for predicting the normal amount
# split the train into 10 parts to do the cross validation
kf = KFold(n_splits = 10)
# store the validation score for each lambda
validation_scores = []
# cross validation
for alpha in lambdas:
    cur_scores = []
    for train_index, test_index in kf.split(fraud_X_train):
        X_train_10, X_val_10 = fraud_X_train.iloc[train_index].values, fraud_X_train.iloc[test_index].values
        y_train_10, y_val_10 = fraud_y_train.iloc[train_index].values, fraud_y_train.iloc[test_index].values
        ridge = Ridge(alpha)
        ridge.fit(X_train_10, y_train_10)
        cur_scores.append(ridge.score(X_train_10, y_train_10))
    validation_scores.append(np.mean(cur_scores))
print('The best penalizing parameter lambda is {}'.format(lambdas[np.argmax(validation_scores)]))

The best penalizing parameter lambda is 0.001


In [68]:
# Fit the Ridge regression with the lambda giving the best cross validation scores
ridge_regression = Ridge(0.001)
ridge_regression.fit(fraud_X_train.values, fraud_y_train.values)
print("Normal Transaction Ridge regression train Score:", ridge_regression.score(fraud_X_train.values, fraud_y_train.values))
print("Normal Transaction Ridge regression test Score:", ridge_regression.score(fraud_X_test.values, fraud_y_test.values))

Normal Transaction Ridge regression train Score: 0.640395208557
Normal Transaction Ridge regression test Score: 0.518157718729


### Lasso Regression
We use the built in package from sklearn to perform Lasso Regression on the train set.

In [71]:
from sklearn.linear_model import LassoCV
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(normal_X_train, normal_y_train)

print("Normal Transaction Lasso regression train Score:", lasso_regression.score(normal_X_train, normal_y_train))
print("Normal Transaction Lasso regression test Score:", lasso_regression.score(normal_X_test, normal_y_test))

Normal Transaction Lasso regression train Score: 0.399679122735
Normal Transaction Lasso regression test Score: 0.356152158435


In [72]:
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(fraud_X_train, fraud_y_train)

print("Fraud Transaction Lasso regression train Score:", lasso_regression.score(fraud_X_train, fraud_y_train))
print("Fraud Transaction Lasso regression test Score:", lasso_regression.score(fraud_X_test, fraud_y_test))

Fraud Transaction Lasso regression train Score: 0.639108711423
Fraud Transaction Lasso regression test Score: 0.522291864353


## Add polynomial and interation terms

In [73]:
normal_poly = normal_data.copy()
fraud_poly = fraud_data.copy()

columns = list(normal_poly.columns)
predictors = columns[:-1]

for predictor in predictors:
    for i in range(1,3):
        normal_poly[predictor + '_' + str(i)] = normal_poly[predictor]**i
        fraud_poly[predictor + '_' + str(i)] = fraud_poly[predictor]**i

In [74]:
normal_poly.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V24_1,V24_2,V25_1,V25_2,V26_1,V26_2,V27_1,V27_2,V28_1,V28_2
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.066928,0.004479,0.128539,0.016522,-0.189115,0.035764,0.133558,0.017838,-0.021053,0.000443
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.339846,0.115496,0.16717,0.027946,0.125895,0.015849,-0.008983,8.1e-05,0.014724,0.000217
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,-0.689281,0.475108,-0.327642,0.107349,-0.139097,0.019348,-0.055353,0.003064,-0.059752,0.00357
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-1.175575,1.381977,0.647376,0.419096,-0.221929,0.049252,0.062723,0.003934,0.061458,0.003777
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.141267,0.019956,-0.20601,0.04244,0.502292,0.252297,0.219422,0.048146,0.215153,0.046291


In [75]:
msk = np.random.rand(len(normal_poly)) < 0.75
normal_poly_train = normal_poly[msk]
normal_poly_test = normal_poly[~msk]

normal_poly_X_train = normal_poly_train.drop('log_amount', axis=1)
normal_poly_y_train = normal_poly_train['log_amount']

normal_poly_X_test = normal_poly_test.drop('log_amount', axis=1)
normal_poly_y_test = normal_poly_test['log_amount']

In [76]:
msk = np.random.rand(len(fraud_poly)) < 0.75
fraud_poly_train = fraud_poly[msk]
fraud_poly_test = fraud_poly[~msk]

fraud_poly_X_train = fraud_poly_train.drop('log_amount', axis=1)
fraud_poly_y_train = fraud_poly_train['log_amount']

fraud_poly_X_test = fraud_poly_test.drop('log_amount', axis=1)
fraud_poly_y_test = fraud_poly_test['log_amount']

In [77]:
linear_regression = LinearRegression()
linear_regression.fit(normal_poly_X_train.values, normal_poly_y_train.values)
normal_poly_train_score = linear_regression.score(normal_poly_X_train.values, normal_poly_y_train.values)
normal_poly_test_score = linear_regression.score(normal_poly_X_test.values, normal_poly_y_test.values)
print("Normal Transaction linear regression with polynomial terms train score:", normal_poly_train_score)
print("Normal Transaction linear regression with polynomial terms test score:", normal_poly_test_score)

Normal Transaction linear regression with polynomial terms train score: 0.461756200374
Normal Transaction linear regression with polynomial terms test score: 0.447337215489


In [78]:
linear_regression = LinearRegression()
linear_regression.fit(fraud_poly_X_train.values, fraud_poly_y_train.values)
fraud_poly_train_score = linear_regression.score(fraud_poly_X_train.values, fraud_poly_y_train.values)
fraud_poly_test_score = linear_regression.score(fraud_poly_X_test.values, fraud_poly_y_test.values)
print("Fraud Transaction linear regression with polynomial terms train score:", fraud_poly_train_score)
print("Fraud Transaction linear regression with polynomial terms test score:", fraud_poly_test_score)

Fraud Transaction linear regression with polynomial terms train score: 0.72659866445
Fraud Transaction linear regression with polynomial terms test score: 0.532846917532


From the above statistics we notice that for fraud transaction we have a training socre that is much higher than that of the test set. This is a sign of overfitting so we again choose to perform Ridge and Lasso regression to reduce overfitting.

## Ridge Regression for Linear Regression with Polynomial Terms

#### Cross validation for predicting the normal amount

In [79]:
# Cross validation for predicting the normal amount with polynomial terms
# split the train into 10 parts to do the cross validation
kf = KFold(n_splits = 10)
# store the validation score for each lambda
validation_scores = []
# cross validation
for alpha in lambdas:
    cur_scores = []
    for train_index, test_index in kf.split(normal_poly_X_train):
        X_train_10, X_val_10 = normal_poly_X_train.iloc[train_index].values, normal_poly_X_train.iloc[test_index].values
        y_train_10, y_val_10 = normal_poly_y_train.iloc[train_index].values, normal_poly_y_train.iloc[test_index].values
        ridge = Ridge(alpha)
        ridge.fit(X_train_10, y_train_10)
        cur_scores.append(ridge.score(X_train_10, y_train_10))
    validation_scores.append(np.mean(cur_scores))
print('The best penalizing parameter lambda is {}'.format(lambdas[np.argmax(validation_scores)]))

The best penalizing parameter lambda is 0.01


In [85]:
ridge_regression_poly = Ridge(0.01)
ridge_regression_poly.fit(normal_poly_X_train.values, normal_poly_y_train.values)

print("Normal Transaction Ridge regression with polynomial terms train score:", \
      ridge_regression_poly.score(normal_poly_X_train.values, normal_poly_y_train.values))#### Cross validation for predicting the normal amount
print("Normal Transaction Ridge regression with polynomial terms test score:", \
      ridge_regression_poly.score(normal_poly_X_test.values, normal_poly_y_test.values))

Normal Transaction Ridge regression with polynomial terms train score: 0.461756200374
Normal Transaction Ridge regression with polynomial terms test score: 0.447337205257


#### Cross validation for predicting the fraud amount

In [91]:
# Cross validation for predicting the normal amount with polynomial terms
# split the train into 10 parts to do the cross validation
kf = KFold(n_splits = 10)
# store the validation score for each lambda
validation_scores = []
# cross validation
for alpha in lambdas:
    cur_scores = []
    for train_index, test_index in kf.split(fraud_poly_X_train):
        X_train_10, X_val_10 = fraud_poly_X_train.iloc[train_index].values, fraud_poly_X_train.iloc[test_index].values
        y_train_10, y_val_10 = fraud_poly_y_train.iloc[train_index].values, fraud_poly_y_train.iloc[test_index].values
        ridge = Ridge(alpha)
        ridge.fit(X_train_10, y_train_10)
        cur_scores.append(ridge.score(X_train_10, y_train_10))
    validation_scores.append(np.mean(cur_scores))
print('The best penalizing parameter lambda is {}'.format(lambdas[np.argmax(validation_scores)]))

The best penalizing parameter lambda is 0.005


In [92]:
ridge_regression_poly = Ridge(0.005)
ridge_regression_poly.fit(fraud_poly_X_train.values, fraud_poly_y_train.values)

print("Fraud Transaction Ridge regression with polynomial terms train score:", \
      ridge_regression.score(fraud_poly_X_train.values, fraud_poly_y_train.values))
print("Fraud Transaction Ridge regression with polynomial terms test score:", \
      ridge_regression.score(fraud_poly_X_test.values, fraud_poly_y_test.values))

Fraud Transaction Ridge regression with polynomial terms train score: 0.714269514911
Fraud Transaction Ridge regression with polynomial terms test score: 0.571867805349


## Lasso Regression for Linear Regression with Polynomial Terms

In [95]:
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(normal_poly_X_train, normal_poly_y_train)

print("Normal Transaction Lasso regression with polynomial terms train score:", \
      lasso_regression.score(normal_poly_X_train, normal_poly_y_train))
print("Normal Transaction Lasso regression with polynomial terms test score:", \
      lasso_regression.score(normal_poly_X_test, normal_poly_y_test))

Normal Transaction Lasso regression with polynomial terms train score: 0.458631677515
Normal Transaction Lasso regression with polynomial terms test score: 0.445074861588


In [96]:
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(fraud_poly_X_train, fraud_poly_y_train)

print("Fraud Transaction Lasso regression with polynomial terms train score:", \
      lasso_regression.score(fraud_poly_X_train, fraud_poly_y_train))
print("Fraud Transaction Lasso regression with polynomial terms test score:", \
      lasso_regression.score(fraud_poly_X_test, fraud_poly_y_test))

Fraud Transaction Lasso regression with polynomial terms train score: 0.169866327262
Fraud Transaction Lasso regression with polynomial terms test score: 0.140171607407




## k-Nearest Neighbors for Fraud Amount using k = 10

In [137]:
# Using the self-implemented KNN for predicting fraud transaction amount
knn = KNNRegressor(10)
knn.fit(fraud_X_train.values, fraud_y_train.values)
print("Fraud transaction using KNN train score: {}".format(knn.score(fraud_X_train.values, fraud_y_train.values)))
print("Fraud transaction using KNN test score: {}".format(knn.score(fraud_X_test.values, fraud_y_test.values)))

Fraud transaction using KNN train score: 0.5752330189704606
Fraud transaction using KNN test score: 0.3557295374866566
