In [19]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import ADASYN 
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB 
from sklearn import metrics
from pylab import rcParams

rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot

In [5]:
data = pd.read_csv('C:\\Users\\Dell\\Desktop\\creditcard.csv',sep=',')

In [6]:
print('The dataset contains {0} rows and {1} columns.'.format(data.shape[0], data.shape[1]))

The dataset contains 284807 rows and 31 columns.


In [7]:
print('Normal transactions count: ', data['Class'].value_counts().values[0])
print('Fraudulent transactions count: ', data['Class'].value_counts().values[1])

Normal transactions count:  284315
Fraudulent transactions count:  492


In [10]:
# feature data (predictors)
X = data.iloc[:, :-1]

# label class
y = data['Class']

In [11]:
# Scale the data to have zero mean and unit variance.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [12]:
# Partition data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.33, random_state=42)

In [13]:

# apply the ADASYN over-sampling
ada = ADASYN(random_state=42)
print('Original dataset shape {}'.format(Counter(y_train)))
X_res, y_res = ada.fit_sample(X_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res)))

Original dataset shape Counter({0: 190477, 1: 343})
Resampled dataset shape Counter({0: 190477, 1: 190471})


In [14]:
X_train, y_train = X_res, y_res 
# Train LogisticRegression Model

LGR_Classifier = LogisticRegression()
LGR_Classifier.fit(X_train, y_train);

# Train Bernoulli Naive Baye Model
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(X_train, y_train);





In [15]:
# Evaluate models
modlist = [('LogisticRegression', LGR_Classifier),
('Naive Baiye Classifier', BNB_Classifier)] 

models = [j for j in modlist]

print()
print('========================== Model Evaluation Results ========================' "\n")  

for i, v in models:
    scores = cross_val_score(v, X_train, y_train, cv=10)
    accuracy = metrics.accuracy_score(y_train, v.predict(X_train))
    confusion_matrix = metrics.confusion_matrix(y_train, v.predict(X_train))
    classification = metrics.classification_report(y_train, v.predict(X_train))
    print('===== {} ====='.format(i))
    print()
    print ("Cross Validation Mean Score: ", '{}%'.format(np.round(scores.mean(), 3) * 100))  
    print() 
    print ("Model Accuracy: ", '{}%'.format(np.round(accuracy, 3) * 100)) 
    print()
    print("Confusion Matrix:" "\n", confusion_matrix)
    print()
    print("Classification Report:" "\n", classification) 
    print()



























===== LogisticRegression =====

Cross Validation Mean Score:  87.4%

Model Accuracy:  90.10000000000001%

Confusion Matrix:
 [[174053  16424]
 [ 21296 169175]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90    190477
           1       0.91      0.89      0.90    190471

   micro avg       0.90      0.90      0.90    380948
   macro avg       0.90      0.90      0.90    380948
weighted avg       0.90      0.90      0.90    380948


===== Naive Baiye Classifier =====

Cross Validation Mean Score:  83.6%

Model Accuracy:  84.7%

Confusion Matrix:
 [[170272  20205]
 [ 38005 152466]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.89      0.85    190477
           1       0.88      0.80      0.84    190471

   micro avg       0.85      0.85      0.85    380948
   macro avg       0.85      0.85      0.85    380948
weighted avg       0.85      0.85      

In [16]:
# Test models
classdict = {'normal':0, 'fraudulent':1}
print()
print('========================== Model Test Results ========================' "\n")   

for i, v in models:
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    classification = metrics.classification_report(y_test, v.predict(X_test))   
    print('=== {} ==='.format(i))
    print ("Model Accuracy: ",  '{}%'.format(np.round(accuracy, 3) * 100))
    print()
    print("Confusion Matrix:" "\n", confusion_matrix)
    print()
 #   pf.plot_confusion_matrix(confusion_matrix, classes = list(classdict.keys()), title='Confusion Matrix Plot', cmap=plt.cm.summer)
    print() 
    print("Classification Report:" "\n", classification) 
    print() 




=== LogisticRegression ===
Model Accuracy:  91.2%

Confusion Matrix:
 [[85582  8256]
 [    7   142]]


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95     93838
           1       0.02      0.95      0.03       149

   micro avg       0.91      0.91      0.91     93987
   macro avg       0.51      0.93      0.49     93987
weighted avg       1.00      0.91      0.95     93987


=== Naive Baiye Classifier ===
Model Accuracy:  89.4%

Confusion Matrix:
 [[83840  9998]
 [   10   139]]


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94     93838
           1       0.01      0.93      0.03       149

   micro avg       0.89      0.89      0.89     93987
   macro avg       0.51      0.91      0.49     93987
weighted avg       1.00      0.89      0.94     93987


