In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
%matplotlib inline
plt.style.use('seaborn-whitegrid')

## Read in the Data

In [2]:
data = pd.read_csv('creditcard.csv')

In [3]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Train-test split : seperate the 'fraud' amount from the 'normal' amount

By seperating the amount by the class label, we will make models for each type of amount given the class label.''

In [4]:
data_reg = data.copy()
data_reg.drop(['Time'], axis=1, inplace=True)
data_reg['log_amount'] = np.log(list(data_reg['Amount'] + 1))
data_reg.drop('Amount', axis=1, inplace=True)

normal_data = data_reg[data['Class'] == 0]
normal_data.drop(['Class'], axis=1, inplace=True)
fraud_data = data_reg[data['Class'] == 1]
fraud_data.drop(['Class'], axis=1, inplace=True)
# fraud_data

normal_y = normal_data['log_amount']
fraud_y = fraud_data['log_amount']

normal_X = normal_data.drop('log_amount', axis=1)
fraud_X = fraud_data.drop('log_amount', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [5]:
np.random.seed(182)
msk = np.random.rand(len(normal_data)) < 0.75
normal_train = normal_data[msk]
normal_test = normal_data[~msk]
# normal_train.shape, normal_test.shape

normal_X_train = normal_train.drop('log_amount', axis=1)
normal_y_train = normal_train['log_amount']

normal_X_test = normal_test.drop('log_amount', axis=1)
normal_y_test = normal_test['log_amount']

In [6]:
msk = np.random.rand(len(fraud_data)) < 0.75
fraud_train = fraud_data[msk]
fraud_test = fraud_data[~msk]

fraud_X_train = fraud_train.drop('log_amount', axis=1)
fraud_y_train = fraud_train['log_amount']

fraud_X_test = fraud_test.drop('log_amount', axis=1)
fraud_y_test = fraud_test['log_amount']

## Self-implemented regression models

### Linear Regression

In [23]:
class LinearRegression:
    """ The self-implemented LinearRegression class that can do multivarible linear regression. The three functions
        we choose to implement is fit, predict, and score and these three methods will make the LinearRegression
        function well. X and y have to be np.arrays.
        
        Methods
        -------
        fit(self, X, y): fit the train set given the response y
        predict(self, X): give the prediction for X
        score(self, X, y): give the r^2 given the input of X and the correct output as a measure of the performance
        
    """
    def __init__(self):
        self.betas = []
        
    def fit(self, X, y):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        
        # the formula: beta = (X^T * X)^-1 * X^T * y
        self.betas = np.linalg.pinv(X.T @ X) @ X.T @ y
    
    def predict(self, X):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        y_hat = np.dot(X, self.betas)
        return y_hat
    
    def score(self, X, y):
        y_hat = self.predict(X)
        
        if len(y_hat) != len(y):
            raise ValueError('Unequal length between the predictions and the true value.')
            
        y_mean = np.mean(y)
        SSE = 0
        SST = 0
        for i in range(len(y_hat)):
            SST += (y[i] - y_mean)**2
            SSE += (y[i] - y_hat[i])**2
        return 1 - SSE / SST
    
    def __str__(self):
        return 'Linear Regresion'

### Ridge Regression

In [27]:
class Rigde:
    """ The self-implemented Ridge class that can do multivarible Ridge regression with penalization lambda. The 
        three functions. We choose to implement is fit, predict, and score and these three methods will make the 
        LinearRegression function well. X and y have to be np.arrays.
        
        Methods
        -------
        fit(self, X, y): fit the train set given the response y
        predict(self, X): give the prediction for X
        score(self, X, y): give the r^2 given the input of X and the correct output as a measure of the performance
        
    """
    
    def __init__(self, alpha):
        self.betas = []
        self.alpha = alpha
        
    def fit(self, X, y):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        
        # construct an identity gamma
        n = X.shape[1]
        gamma = self.alpha * np.identity(n)
        
        betas = np.dot(np.linalg.pinv(X.T @ X + gamma.T @ gamma),X.T @ y)
    
    def predict(self, X):
        #add a column of ones for the coeffcient
        n = X.shape[0]
        ones_col = np.ones((n, 1))
        X = np.concatenate((ones_col, X), axis=1)
        y_hat = np.dot(X, self.betas)
        return y_hat
    
    def score(self, X, y):
        y_hat = self.predict(X)
        
        if len(y_hat) != len(y):
            raise ValueError('Unequal length between the predictions and the true value.')
            
        y_mean = np.mean(y)
        SSE = 0
        SST = 0
        for i in range(len(y_hat)):
            SST += (y[i] - y_mean)**2
            SSE += (y[i] - y_hat[i])**2
        return 1 - SSE / SST
    
    def __str__(self):
        return 'Ridge'

## Fitting the models
---
### linear models

In [30]:
linear_regression = LinearRegression()
linear_regression.fit(normal_X_train.values, normal_y_train.values)
normal_train_score = linear_regression.score(normal_X_train.values, normal_y_train.values)
normal_test_score = linear_regression.score(normal_X_test.values, normal_y_test.values)
print("Normal Transaction linear regression train score:", normal_train_score)
print("Normal Transaction linear regression test score:", normal_test_score)

Normal Transaction linear regression train score: 0.401224393484
Normal Transaction linear regression test score: 0.356168560254


In [31]:
linear_regression = LinearRegression()
linear_regression.fit(fraud_X_train.values, fraud_y_train.values)
fraud_train_score = linear_regression.score(fraud_X_train.values, fraud_y_train.values)
fraud_test_score = linear_regression.score(fraud_X_test.values, fraud_y_test.values)
print("Fraud Transaction linear regression train score:", fraud_train_score)
print("Fraud Transaction linear regression test score:", fraud_test_score)

Fraud Transaction linear regression train score: 0.640395208557
Fraud Transaction linear regression test score: 0.518157716409


### Ridge Regression with Cross Validation

In [11]:
# Tunable parameters of penalization factor lambdas
lambdas = [.001,.005,1,5,10,50,100,500,1000]

In [33]:
from sklearn.linear_model import RidgeCV
ridge_regression = RidgeCV(alphas=lambdas, fit_intercept=True)
ridge_regression.fit(normal_X_train, normal_y_train)

Normal Transaction Ridge regression train Score: 0.401218788257
Normal Transaction Ridge regression test Score: 0.356274466081


In [37]:
print("Normal Transaction Ridge regression train Score:", ridge_regression.score(normal_X_train, normal_y_train))
print("Normal Transaction Ridge regression test Score:", ridge_regression.score(normal_X_test, normal_y_test))

Normal Transaction Ridge regression train Score: 0.256114323087
Normal Transaction Ridge regression test Score: 0.134581447947


In [34]:
ridge_regression = RidgeCV(alphas=lambdas, fit_intercept=True)
ridge_regression.fit(fraud_X_train, fraud_y_train)

print("Fraud Transaction Ridge regression train Score:", ridge_regression.score(fraud_X_train, fraud_y_train))
print("Fraud Transaction Ridge regression test Score:", ridge_regression.score(fraud_X_test, fraud_y_test))

Fraud Transaction Ridge regression train Score: 0.635101029146
Fraud Transaction Ridge regression test Score: 0.528602962983


### Lasso Regression

In [35]:
from sklearn.linear_model import LassoCV
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(normal_X_train, normal_y_train)

print("Normal Transaction Lasso regression train Score:", lasso_regression.score(normal_X_train, normal_y_train))
print("Normal Transaction Lasso regression test Score:", lasso_regression.score(normal_X_test, normal_y_test))

Normal Transaction Lasso regression train Score: 0.400788465934
Normal Transaction Lasso regression test Score: 0.356749565618


In [36]:
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(fraud_X_train, fraud_y_train)

print("Fraud Transaction Lasso regression train Score:", lasso_regression.score(fraud_X_train, fraud_y_train))
print("Fraud Transaction Lasso regression test Score:", lasso_regression.score(fraud_X_test, fraud_y_test))

Fraud Transaction Lasso regression train Score: 0.639963404558
Fraud Transaction Lasso regression test Score: 0.520325320109


## Add polynomial and interation terms

In [16]:
normal_poly = normal_data.copy()
fraud_poly = fraud_data.copy()

columns = list(normal_poly.columns)
predictors = columns[:-1]

for predictor in predictors:
    for i in range(1,3):
        normal_poly[predictor + '_' + str(i)] = normal_poly[predictor]**i
        fraud_poly[predictor + '_' + str(i)] = fraud_poly[predictor]**i

In [17]:
normal_poly.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V24_1,V24_2,V25_1,V25_2,V26_1,V26_2,V27_1,V27_2,V28_1,V28_2
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.066928,0.004479,0.128539,0.016522,-0.189115,0.035764,0.133558,0.017838,-0.021053,0.000443
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.339846,0.115496,0.16717,0.027946,0.125895,0.015849,-0.008983,8.1e-05,0.014724,0.000217
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,-0.689281,0.475108,-0.327642,0.107349,-0.139097,0.019348,-0.055353,0.003064,-0.059752,0.00357
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-1.175575,1.381977,0.647376,0.419096,-0.221929,0.049252,0.062723,0.003934,0.061458,0.003777
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.141267,0.019956,-0.20601,0.04244,0.502292,0.252297,0.219422,0.048146,0.215153,0.046291


In [18]:
msk = np.random.rand(len(normal_poly)) < 0.75
normal_poly_train = normal_poly[msk]
normal_poly_test = normal_poly[~msk]

normal_poly_X_train = normal_poly_train.drop('log_amount', axis=1)
normal_poly_y_train = normal_poly_train['log_amount']

normal_poly_X_test = normal_poly_test.drop('log_amount', axis=1)
normal_poly_y_test = normal_poly_test['log_amount']

In [19]:
msk = np.random.rand(len(fraud_poly)) < 0.75
fraud_poly_train = fraud_poly[msk]
fraud_poly_test = fraud_poly[~msk]

fraud_poly_X_train = fraud_poly_train.drop('log_amount', axis=1)
fraud_poly_y_train = fraud_poly_train['log_amount']

fraud_poly_X_test = fraud_poly_test.drop('log_amount', axis=1)
fraud_poly_y_test = fraud_poly_test['log_amount']

In [20]:
linear_regression = LinearRegression()
linear_regression.fit(normal_poly_X_train.values, normal_poly_y_train.values)
normal_poly_train_score = linear_regression.score(normal_poly_X_train.values, normal_poly_y_train.values)
normal_poly_test_score = linear_regression.score(normal_poly_X_test.values, normal_poly_y_test.values)
print("Normal Transaction linear regression with polynomial terms train score:", normal_poly_train_score)
print("Normal Transaction linear regression with polynomial terms test score:", normal_poly_test_score)

Normal Transaction linear regression with polynomial terms train score: 0.456631094621
Normal Transaction linear regression with polynomial terms test score: 0.464282964492


In [21]:
linear_regression = LinearRegression()
linear_regression.fit(fraud_poly_X_train.values, fraud_poly_y_train.values)
fraud_poly_train_score = linear_regression.score(fraud_poly_X_train.values, fraud_poly_y_train.values)
fraud_poly_test_score = linear_regression.score(fraud_poly_X_test.values, fraud_poly_y_test.values)
print("Fraud Transaction linear regression with polynomial terms train score:", fraud_poly_train_score)
print("Fraud Transaction linear regression with polynomial terms test score:", fraud_poly_test_score)

Fraud Transaction linear regression with polynomial terms train score: 0.702725809817
Fraud Transaction linear regression with polynomial terms test score: 0.615380750667


## Ridge Regression for Linear Regression with Polynomial Terms

In [None]:
ridge_regression = RidgeCV(alphas=lambdas, fit_intercept=True)
ridge_regression.fit(normal_poly_X_train, normal_poly_y_train)

print("Normal Transaction Ridge regression with polynomial terms train score:", \
      ridge_regression.score(normal_poly_X_train, normal_poly_y_train))
print("Normal Transaction Ridge regression with polynomial terms test score:", \
      ridge_regression.score(normal_poly_X_test, normal_poly_y_test))

In [None]:
ridge_regression = RidgeCV(alphas=lambdas, fit_intercept=True)
ridge_regression.fit(fraud_poly_X_train, fraud_poly_y_train)

print("Fraud Transaction Ridge regression with polynomial terms train score:", \
      ridge_regression.score(fraud_poly_X_train, fraud_poly_y_train))
print("Fraud Transaction Ridge regression with polynomial terms test score:", \
      ridge_regression.score(fraud_poly_X_test, fraud_poly_y_test))

## Lasso Regression for Linear Regression with Polynomial Terms

In [None]:
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(normal_poly_X_train, normal_poly_y_train)

print("Normal Transaction Lasso regression with polynomial terms train score:", \
      lasso_regression.score(normal_poly_X_train, normal_poly_y_train))
print("Normal Transaction Lasso regression with polynomial terms test score:", \
      lasso_regression.score(normal_poly_X_test, normal_poly_y_test))

In [None]:
lasso_regression = LassoCV(alphas=lambdas, fit_intercept=True)
lasso_regression.fit(fraud_poly_X_train, fraud_poly_y_train)

print("Fraud Transaction Lasso regression with polynomial terms train score:", \
      lasso_regression.score(fraud_poly_X_train, fraud_poly_y_train))
print("Fraud Transaction Lasso regression with polynomial terms test score:", \
      lasso_regression.score(fraud_poly_X_test, fraud_poly_y_test))

## k-Nearest Neighbors

In [None]:
knn_model = KNeighborsRegressor(n_neighbors = 10)
knn_model.fit(normal_X_train, normal_y_train)
knn_model.score(normal_X_train, normal_y_train)

In [None]:
knn_model.score(normal_X_test, normal_y_test)