In [4]:
import pandas as pd
import numpy as np

In [5]:
creditcard = pd.read_csv('creditcard.csv')

In [6]:
X = creditcard[['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']]
y = creditcard['Class']

In [7]:
from sklearn.model_selection  import train_test_split
from sklearn import metrics
import scipy.optimize as op

In [8]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.65, random_state=0)
print(Xtrain.shape);
print(Xtest.shape);

(99682, 30)
(185125, 30)


In [9]:
def sigmoid(z):
    return 1/(1 + np.exp(-z));

In [10]:
def costFunctionReg(theta, X, y):
    m = len(y)
    n = len(theta)
    h = sigmoid(X.dot(theta))
    J = (-y.T.dot(np.log(h))-(1-y.T).dot(np.log(1-h)))/m
    return J

In [11]:
def Gradient(theta, X, y):
    m = len(y)
    n = len(theta)
    h = sigmoid(X.dot(theta))
    grad = (1/m)*(X.T).dot(h-y);
    return grad.flatten()

In [12]:
def predict(theta, X):
    m, n = X.shape
    p = np.zeros(m)
    h = sigmoid(X.dot(theta))
    for i in range(0, m):
        if h[i] > 0.5:
            p[i] = 1
        else:
            p[i] = 0
    return p

In [13]:
Xtrain = np.array(Xtrain)
ytrain = np.array(ytrain)
Xtest = np.array(Xtest)
ytest = np.array(ytest)

In [14]:
Xtrain_ones = np.append(np.ones((Xtrain.shape[0],1)), Xtrain, axis = 1)

In [15]:
initial_theta = np.zeros(Xtrain_ones.shape[1])
theta_optimal = op.fmin_bfgs(f= costFunctionReg, x0 = initial_theta, args = (Xtrain_ones,ytrain), fprime = Gradient, maxiter = 400);

  """


         Current function value: nan
         Iterations: 3
         Function evaluations: 40
         Gradient evaluations: 40


  """


In [16]:
Xtest_ones = np.append(np.ones((Xtest.shape[0],1)), Xtest,axis = 1);
ypred = predict(theta_optimal,Xtest_ones);
print(metrics.confusion_matrix(ytest,ypred));
print(metrics.classification_report(ytest,ypred));
print('Accuracy : %f' %(metrics.accuracy_score(ytest,ypred)));
print('Area under the curve : %f' %(metrics.roc_auc_score(ytest,ypred)));

[[184739     70]
 [   262     54]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    184809
           1       0.44      0.17      0.25       316

   micro avg       1.00      1.00      1.00    185125
   macro avg       0.72      0.59      0.62    185125
weighted avg       1.00      1.00      1.00    185125

Accuracy : 0.998207
Area under the curve : 0.585254


In [17]:
from sklearn.linear_model import LogisticRegression
clf_logistic = LogisticRegression(penalty='l2');
clf_logistic.fit(Xtrain, ytrain);



In [18]:
ypred = clf_logistic.predict(Xtest);
print(metrics.confusion_matrix(ytest,ypred));
print(metrics.classification_report(ytest,ypred));
print('Accuracy : %f' %(metrics.accuracy_score(ytest,ypred)));
print('Area under the curve : %f' %(metrics.roc_auc_score(ytest,ypred)));

[[184753     56]
 [    91    225]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    184809
           1       0.80      0.71      0.75       316

   micro avg       1.00      1.00      1.00    185125
   macro avg       0.90      0.86      0.88    185125
weighted avg       1.00      1.00      1.00    185125

Accuracy : 0.999206
Area under the curve : 0.855861


In [20]:
class_names = {0:'Not Fraud', 1:'Fraud'}
print(creditcard.Class.value_counts().rename(index = class_names))

Not Fraud    284315
Fraud           492
Name: Class, dtype: int64
