# Credit Card Fraud Detection Using Logistic Regression
# Dataset from Kaggle: https://www.kaggle.com/mlg-ulb/creditcardfraud


In [None]:
import numpy as np
import sklearn as sk
import pandas as pd

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from time import time
import random

import matplotlib.pyplot as plt
import seaborn as sns
# One theme call is enough: each sns.set() call re-applies the complete theme,
# so an earlier sns.set(style="white") would be overridden by this one anyway.
sns.set(style="whitegrid", color_codes=True)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Load the transactions CSV into a DataFrame and preview the first rows.
csv_path = 'data/creditcard.csv'
cc_df = pd.read_csv(csv_path, low_memory=False)
cc_df.head(5)

In [None]:
# Show every column name of the loaded DataFrame as a plain Python list.
column_names = list(cc_df.columns)
print(column_names)

In [None]:
# Count how many rows carry each distinct value of the 'Class' label column.
cc_df['Class'].value_counts()

# Class 0 is non-fraud and class 1 is fraud. Fraud cases make up only a very small share of all transactions.

In [None]:
# Separate the rows by label so each class can be inspected on its own.
class_labels = cc_df['Class']
fraud = cc_df[class_labels == 1]
non_fraud = cc_df[class_labels == 0]

# Bar chart of the number of rows per 'Class' value.
sns.countplot(data=cc_df, x='Class', palette="muted")
plt.show()

# Partitioning the data

In [None]:
# Features are every column except the last; the target is the 'Class' column.
# NOTE(review): this assumes 'Class' is the last column of the CSV - confirm.
X = cc_df.iloc[:, :-1]
y = cc_df['Class']

# Split BEFORE scaling. Standardising the whole dataset first would compute the
# mean/variance from rows that later land in the test set, leaking test
# information into the training features. The row assignment is unchanged
# because it depends only on the sample count and random_state.
from sklearn.preprocessing import StandardScaler

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=500
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that reads the full scaled matrix keeps working; it
# now uses the training-set statistics.
X_scaled = scaler.transform(X)

In [None]:
# Logistic-regression classifier using the L-BFGS solver; not fitted yet.
logit = LogisticRegression(solver="lbfgs")

In [None]:
# Train the classifier on the training partition and keep what fit() returns.
logit_model = logit.fit(X_train, y_train)

In [None]:
# Score the held-out partition and break the errors down by type.
predictions = logit.predict(X_test)

# Positional indices (into the test partition) of each predicted / actual class.
pred_fraud = np.where(predictions == 1)[0]
real_fraud = np.where(y_test == 1)[0]
pred_good = np.where(predictions == 0)[0]
real_good = np.where(y_test == 0)[0]

# Predicted fraud but not actually fraud -> false positive.
false_pos = len(np.setdiff1d(pred_fraud, real_fraud))
# Predicted good but not actually good -> false negative (a missed fraud).
false_neg = len(np.setdiff1d(pred_good, real_good))
mispredictions = false_neg + false_pos

# Guard the ratios: a model with no mispredictions, or a test partition with no
# fraud rows, would otherwise raise ZeroDivisionError. NaN marks "undefined".
false_neg_rate = false_neg / mispredictions if mispredictions else float("nan")
# Share of the real fraud cases that the model missed (FN / (FN + TP)). With so
# few fraud rows, accuracy alone says little; this is the figure that shows how
# much fraud slips through.
missed_fraud_rate = false_neg / len(real_fraud) if len(real_fraud) else float("nan")

accuracy = (len(X_test) - mispredictions) / len(X_test)

print("Accuracy", accuracy)
print("False Negative Rate (Misclassification):", false_neg_rate)
print("False Negative Rate (All Data)", false_neg/len(predictions))
print("False Negative Rate (Actual Fraud):", missed_fraud_rate)

print("False Negatives:", false_neg," False Positives", false_pos,"Mispredictions", false_neg + false_pos )

print("Total test data points", len(X_test), "Train data points",len(X_train))