In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Read the dataset from a CSV file given by a path relative to the working directory.
data = pd.read_csv('creditcard.csv')

# Quick orientation: (rows, columns) of the frame, followed by its column labels.
dataset_shape = data.shape
print("Shape of the Dataset: ", dataset_shape)
print("\n\n", data.columns)

In [None]:
data.head(n = 5) # preview the first five records

In [None]:
data.tail(n = 5) # preview the last five records

In [None]:
# Continue with a random 40% subsample; the fixed seed makes the draw repeatable.
# Note: this rebinds `data`, so re-running the cell subsamples the subsample again.
data = data.sample(random_state = 42, frac = 0.4)
print("Shape of the Dataset: ", data.shape)

In [None]:
# Determine number of fraud cases in Dataset

# Split the records by label: Class == 1 is fraud, Class == 0 is a valid transaction.
Fraud = data[data['Class'] == 1]
Valid = data[data['Class'] == 0]

# Proportion of fraud among ALL records. This value is later passed to
# IsolationForest as `contamination`, which is defined as the proportion of
# outliers in the data set, so divide by the total rather than by the valid count.
outlier_fraction = len(Fraud) / float(len(data))
print("Outlier_fraction: {0} %".format(outlier_fraction*100))

# Reuse the frames filtered above instead of filtering the data a second time.
print('Fraud Cases: {}'.format(len(Fraud)))
print('Valid Transactions: {}'.format(len(Valid)))

In [None]:
# Summary statistics for the columns of `data`. The frame is left as the cell's last
# expression so the notebook renders it as a table; passing it to print() put the
# label on the same line as the header row and misaligned the columns.
print("Description of the Dataset: ")
data.describe()

In [None]:
# Draw a histogram for each column on one 15x15 figure, then display it.
hist_axes = data.hist(figsize=(15, 15))
plt.show()

In [None]:
# Pairwise correlation between the columns of `data`.
corrmat = data.corr()

# Draw the matrix as a heatmap on a 15x15 figure. Without the heatmap call the
# figure would be created but stay empty. A diverging palette centered on 0
# separates positive from negative correlations.
fig = plt.figure(figsize = (15, 15))
sns.heatmap(corrmat, cmap = "coolwarm", center = 0, square = True)
plt.show()

In [None]:
# Correlation of every column with the 'Class' label.
corrmat.loc[:, 'Class']

In [None]:
# All column labels of the correlation matrix (still used by later cells).
cols = corrmat.keys()

# Keep the columns whose absolute correlation with 'Class' exceeds 0.01.
# A boolean mask replaces the integer-key lookup `corrmat['Class'][i]`, which
# relied on deprecated positional fallback for a label-indexed Series.
class_corr = corrmat['Class']
cols_to_keep = class_corr[class_corr.abs() > 0.01].index.tolist()

print(cols_to_keep)

In [None]:
# Feature names are the kept columns minus the target. Filter 'Class' out by name
# instead of dropping the last entry, which would discard a real feature whenever
# 'Class' is not the final element of `cols_to_keep`.
features = [col for col in cols_to_keep if col != 'Class']

In [None]:
# Use only the selected feature columns. Indexing with `cols` (every column of the
# correlation matrix) would also pull in 'Class' and leak the label into the model.
feature = data[features] # records of all transactions, excluding the target class
target = data["Class"] # records of the corresponding label for each record

In [None]:
# Isolation Forest anomaly detector.
# - max_samples: number of records drawn to build each tree. Use the row count of the
#   `feature` frame; `len(features)` is only the number of column names.
# - contamination: expected proportion of outliers in the data.
# - random_state: fixed seed so the fitted model and its predictions are repeatable.
clf = IsolationForest(max_samples = len(feature),
                      contamination = outlier_fraction,
                      random_state = 42)

In [None]:
# Number of rows labelled as fraud in the current data.
n_outliers = Fraud.shape[0]

In [None]:
# Fit the detector on the feature frame, then score and label the same records.
clf.fit(feature)
scores_pred = clf.decision_function(feature)
raw_labels = clf.predict(feature)

# The detector labels inliers as 1 and outliers as -1; recode to 0 = valid, 1 = fraud.
y_pred = np.where(raw_labels == -1, 1, 0)

# Count the records whose predicted label differs from the true label.
n_errors = (y_pred != target).sum()



In [None]:
# Disabled alternative detector: Local Outlier Factor.
# The lines below are commented out and do not run. They mirror the steps used
# for the Isolation Forest: predict, recode labels to 0/1, and count errors.
# NOTE(review): `fit_predict(features)` passes the list of column names; it
# presumably should receive the `feature` DataFrame instead. Confirm before enabling.

# clf = LocalOutlierFactor(
#         n_neighbors = 20,
#         contamination = outlier_fraction)

# y_pred = clf.fit_predict(features)
# scores_pred = clf.negative_outlier_factor_

# y_pred[y_pred == 1] = 0
# y_pred[y_pred == -1] = 1

# n_errors = (y_pred != target).sum()

In [None]:
# Evaluation summary: misclassified count, accuracy in percent, per-class metrics.
accuracy_pct = accuracy_score(target, y_pred) * 100
class_report = classification_report(target, y_pred)

print('Number of Errors: ', n_errors)
print('Accuracy: ', accuracy_pct)
print(class_report)