###  Import Required Libraries 
Import the required libraries and print their versions to ensure they are successfully installed.

In [None]:
import sys
import numpy
import pandas
import matplotlib
import seaborn
import scipy
import sklearn

# Report the interpreter and library versions to confirm the environment.
print("Python: {}".format(sys.version))
print("Numpy: {}".format(numpy.__version__))
print("Pandas: {}".format(pandas.__version__))
print("Matplotlib: {}".format(matplotlib.__version__))
print("Seaborn: {}".format(seaborn.__version__))
print("Scipy: {}".format(scipy.__version__))
# Report scikit-learn's own version (this line previously printed SciPy's).
print("Sklearn: {}".format(sklearn.__version__))

### Data Set
In the following cells, we will import our dataset from a .csv file as a Pandas DataFrame.

In [None]:
# import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Read creditcard.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [None]:
# Columns V1-V28 are the results of a PCA dimensionality reduction.
column_labels = data.columns
print(column_labels)

In [None]:
# Show the dataset dimensions as (rows, columns).
n_rows, n_cols = data.shape
print((n_rows, n_cols))

In [None]:
# Summary statistics for the dataset's columns.
summary = data.describe()
print(summary)

In [None]:
# Optional down-sampling (disabled): uncomment the next line to keep a
# reproducible 10% random sample of the rows instead of the full dataset.
#data = data.sample(frac = 0.1, random_state = 1)
print((len(data.index), len(data.columns)))

In [None]:
# Plot a histogram for each parameter on a single 20x20 figure.
hist_figsize = (20, 20)
data.hist(figsize=hist_figsize)
plt.show()

In [None]:
# Split the dataset by label: rows with Class == 1 are treated as fraud,
# rows with Class == 0 as valid.
fraud = data[data['Class'] == 1]
valid = data[data['Class'] == 0]

# Share of fraud cases among ALL labelled cases. This value is later passed as
# `contamination` to the outlier detectors, which expect the proportion of
# outliers in the whole data set rather than the fraud-to-valid ratio.
outlier_fraction = len(fraud) / float(len(fraud) + len(valid))
print(outlier_fraction)

print('Fraud Cases: {}'.format(len(fraud)))
print('Valid Cases: {}'.format(len(valid)))



In [None]:
# Pairwise correlation between all columns of the dataset.
corrmat = data.corr()

# Draw the matrix as a heatmap with square cells; the colour scale tops out
# at 0.8.
fig = plt.figure(figsize=(12, 9))
sb.heatmap(data=corrmat, vmax=0.8, square=True)
plt.show()

In [None]:
# Name of the column we will be predicting.
target = 'Class'

# Feature columns: every column of the dataframe except the target.
columns = [name for name in data.columns.tolist() if name != target]

# Feature matrix and label vector.
X = data[columns]
Y = data[target]

print(X.shape)
print(Y.shape)

### Unsupervised Outlier Detection
Now that we have processed our data, we can begin deploying our machine learning algorithms. We will use the following techniques:

### 1.  Local Outlier Factor (LOF)

The anomaly score of each sample is called Local Outlier Factor. It measures the local deviation of density of a given sample with respect to its neighbors. It is local in that the anomaly score depends on how isolated the object is with respect to the surrounding neighborhood.

### 2.  Isolation Forest Algorithm

The IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.

Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node.

This path length, averaged over a forest of such random trees, is a measure of normality and our decision function.

Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies.

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Outlier detectors to compare, keyed by display name. Both are configured
# with the same expected share of outliers (`outlier_fraction`).
classifiers = {
    # Draws len(X) samples per tree; seeded so results are reproducible.
    "Isolation Forest": IsolationForest(
        max_samples=len(X),
        contamination=outlier_fraction,
        random_state=1,
    ),
    # Compares each sample's local density with its 20 nearest neighbours.
    "Local Outlier Factor": LocalOutlierFactor(
        n_neighbors=20,
        contamination=outlier_fraction,
    ),
}

In [None]:
# Fit each detector on the feature matrix X and score its predictions against
# the true labels Y.
# (The empty figure that used to be created and shown here was removed:
# nothing was ever drawn on it.)
n_outliers = len(fraud)  # number of known fraud cases; not used in this cell

for clf_name, clf in classifiers.items():
    # Fit the data and tag outliers (+1 = inlier, -1 = outlier).
    if clf_name == 'Local Outlier Factor':
        # LOF is fitted and asked for labels in a single call; its scores are
        # read from the fitted attribute.
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)

    # Map the detector output onto the dataset's labels in one step:
    # -1 (outlier) -> 1 (fraud), +1 (inlier) -> 0 (valid).
    y_pred = (y_pred == -1).astype(int)

    # Number of samples whose predicted label differs from the true label.
    n_errors = (y_pred != Y).sum()

    # Run classification metrics
    print('{}: {}'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))