This project covers credit card fraud: it looks at a dataset of transactions and predicts whether each transaction is fraudulent or not.

# Imports

In [None]:
import numpy as np
import pandas as pd # used for data processing
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import sklearn

# Data Importing

In [None]:
# Load the transactions from 'creditcard.csv' (path is relative to the
# current working directory) into a pandas DataFrame.
data = pd.read_csv('creditcard.csv')

# Exploring the Dataset

In [None]:
# Show the column labels of the loaded DataFrame.
print(data.columns)

In [None]:
# (rows, columns) of the full dataset; left as a bare expression so the
# notebook displays it as the cell output.
data.shape

In [None]:
# Replace the full dataset with a 20% random subsample.
# - frac is the fraction of rows to return.
# - random_state fixes the seed so the same rows are drawn on every run,
#   which keeps results reproducible; the particular value is arbitrary.
data = data.sample(random_state=1, frac=0.2)
print(data.shape)

In [None]:
# Draw one histogram per column on a 20x20 inch figure, then render it.
data.hist(figsize=(20, 20))
plt.show()

You can see that most of the V features are clustered around 0, with few or no outliers. The Class histogram shows that our dataset contains very few fraudulent cases compared to valid ones.

In [None]:
# Split the sampled rows by label: Class == 1 is fraud, Class == 0 is valid.
fraud = data[data['Class'] == 1]
valid = data[data['Class'] == 0]

# outlier_fraction is later passed as `contamination` to the outlier
# detectors, which expect the proportion of outliers in the whole data set.
# Divide by the total number of rows the detectors are fitted on, not by the
# number of valid rows (fraud / valid is the odds, which overstates the
# proportion).
outlier_fraction = len(fraud) / len(data)
print(outlier_fraction)

print('Fraud Cases: {}'.format(len(fraud)))
print('Valid Cases: {}'.format(len(valid)))

In [None]:
# Pairwise correlation between every pair of columns.
corrmat = data.corr()

# Render the matrix as a heatmap with square cells on a 12x9 inch figure;
# the colour scale is capped at 0.8.
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, square=True, vmax=0.8)
plt.show()

In the above heatmap, a lot of the values are close to 0, so most of the features are fairly unrelated to each other. The lighter squares signify a stronger correlation.

# Organizing the Data

In [None]:
# The label we want to predict.
target = 'Class'

# Feature columns: every column of the dataframe except the label.
columns = [name for name in data.columns.tolist() if name != target]

# X holds the feature columns for every sample.
X = data[columns]
# Y holds the class label of every sample (one-dimensional).
Y = data[target]

# Report the shapes of the feature matrix and the label vector.
print(X.shape)
print(Y.shape)

# Applying Algorithms

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Seed shared by the estimators that accept a random_state.
state = 1

# Outlier detection methods, keyed by display name (insertion order is the
# order in which they are evaluated).
classifiers = {}

# contamination is the proportion of outliers we expect in the data;
# max_samples = len(X) asks for as many samples per tree as there are rows.
classifiers['Isolation Forest'] = IsolationForest(
    max_samples=len(X),
    contamination=outlier_fraction,
    random_state=state,
)

# n_neighbors is the number of neighbours to consider; the higher the
# percentage of outliers, the higher you want to make this number.
classifiers['Local Outlier Factor'] = LocalOutlierFactor(
    n_neighbors=20,
    contamination=outlier_fraction,
)

# Fit the Model

In [None]:
# Number of known fraud rows in the sample (kept as a notebook-level name).
n_outliers = len(fraud)

# Fit every detector on X, convert its output to the dataset's 0/1 labels and
# report how well the predictions agree with the true labels in Y.
for clf_name, clf in classifiers.items():

    # fit the data and tag outliers
    if clf_name == 'Local Outlier Factor':
        # LOF is fitted and queried in one step; its per-sample scores are
        # exposed as an attribute after fitting.
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)

    # The detectors label inliers as 1 and outliers as -1; remap in a single
    # step to 0 for valid and 1 for fraud so the result is comparable with Y.
    y_pred = np.where(y_pred == -1, 1, 0)

    # calculate the number of misclassified samples
    n_errors = (y_pred != Y).sum()

    # error count, overall accuracy and per-class precision/recall/f1
    print(f'{clf_name}: {n_errors}')
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))

Here we look at the precision for the fraudulent cases, which tells us the percentage of cases that are getting correctly labeled. 'Precision' accounts for false positives, while 'recall' accounts for false negatives. Low numbers could mean that we are constantly calling clients to ask whether they actually made a transaction, which could be annoying.

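Our Isolation Forest method (an ensemble of randomized trees, similar in spirit to Random Forest) was able to produce a better result. Its f1-score for the fraud class — the harmonic mean of precision and recall — is about 26%, so we are detecting fraudulent transactions correctly only roughly a quarter of the time.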