In [None]:
import numpy as np
import pandas as pd 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Read the comma-separated transactions file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
csv_path = 'D://creditcardfraud/creditcard.csv'
df = pd.read_csv(csv_path, sep=',')

# Print the column labels, then render the frame as the cell output.
print(df.columns)
df

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Show only the Time, Amount and Class columns.
df.loc[:, ['Time', 'Amount', 'Class']]

In [None]:
# Distinct label values present in the Class column.
df.Class.unique()

In [None]:
import matplotlib.pyplot as plt

# Class encoding: 1 = fraud, 0 = genuine transaction.
# Scatter the transaction amount against the class label.
plt.scatter(x=df['Class'], y=df['Amount'])
plt.title('Treansactions - Genuine or Fraud vs Amount')
plt.xlabel('Genuine or Fraud')
plt.ylabel('Amount')

# Split the frame by label for the per-class cells that follow.
# (`geniune` keeps its spelling because other cells refer to that name.)
class_labels = df['Class']
fraud = df.loc[class_labels == 1]
geniune = df.loc[class_labels == 0]

In [None]:
import statsmodels.api as sm

# Pairwise correlations between the columns of `df`. The matrix is computed
# once and reused for the plot; computing it is not free on a large frame.
corr_matrix = df.corr()

# Draw the correlation matrix as a colour grid. Tick labels are taken from the
# matrix's own columns (as a plain list) so they always line up with the data
# that is actually plotted.
sm.graphics.plot_corr(corr_matrix, xnames=corr_matrix.columns.tolist())

In [None]:
# Print the (rows, columns) shape of the fraud subset followed by the genuine subset.
print(fraud.shape,geniune.shape)

In [None]:
# Summary statistics of Amount for the fraud rows (Class == 1).
fraud['Amount'].describe()

In [None]:
# Summary statistics of Amount for the genuine rows (Class == 0).
geniune['Amount'].describe()

In [None]:
# Amount histograms per class, stacked vertically on a shared x-axis.
fig, (ax_fraud, ax_genuine) = plt.subplots(nrows=2, ncols=1, sharex=True)

ax_fraud.hist(fraud['Amount'])
ax_fraud.set_title('Fraud')

ax_genuine.hist(geniune['Amount'])
ax_genuine.set_title('Genuine')

# The axis labels belong to the bottom panel: it is the last axes created,
# i.e. the one pyplot treats as current right after plt.subplots().
ax_genuine.set_xlabel('Amount')
ax_genuine.set_ylabel('no. of tansactions')
plt.show()


In [None]:
# Amount against Time per class, stacked vertically on a shared x-axis.
fig, (ax_fraud, ax_genuine) = plt.subplots(nrows=2, ncols=1, sharex=True)

ax_fraud.scatter(fraud['Time'], fraud['Amount'])
ax_fraud.set_title('Fraud')

ax_genuine.scatter(geniune['Time'], geniune['Amount'])
ax_genuine.set_title('Genuine')

# The axis labels belong to the bottom panel: it is the last axes created,
# i.e. the one pyplot treats as current right after plt.subplots().
ax_genuine.set_xlabel('Time')
ax_genuine.set_ylabel('Amount')
plt.show()


In [None]:
# Continue with a smaller, reproducible random sample (10% of the rows, fixed seed).
sample_fraction = 0.1
sample_seed = 1
data = df.sample(frac=sample_fraction, random_state=sample_seed)
data.shape

In [None]:
# Split the sampled frame into independent features (X) and the dependent label (Y).

# Name of the column we are predicting.
target = "Class"

# Every column except the label is used as an input feature.
columns = [name for name in data.columns.tolist() if name != "Class"]

# Shared seeded random generator; a later cell passes it to the Isolation Forest.
state = np.random.RandomState(42)

X = data[columns]
Y = data[target]

# Uniform noise between 0 and 1 with the same shape as X.
# NOTE(review): not used in the visible cells, but this draw advances `state`,
# so removing it would change the results of anything seeded from `state` later.
X_outliers = state.uniform(low=0, high=1, size=X.shape)

# Report the shapes of X and Y.
print(X.shape)
print(Y.shape)


In [None]:
## 1. Isolation Forest Algorithm: 
#One of the newest techniques to detect anomalies is called Isolation Forests. The algorithm is based on the fact that anomalies are data points that are few and different. As a result of these properties, anomalies are susceptible to a mechanism called isolation.
#This method is highly useful and is fundamentally different from all existing methods. It introduces the use of isolation as a more effective and efficient means to detect anomalies than the commonly used basic distance and density measures. Moreover, this method is an algorithm with a low linear time complexity and a small memory requirement. It builds a good performing model with a small number of trees using small sub-samples of fixed size, regardless of the size of a data set.

#Typical machine learning methods tend to work better when the patterns they try to learn are balanced, meaning the same amount of good and bad behaviors are present in the dataset.

## How Isolation Forests Work

#The Isolation Forest algorithm isolates observations by randomly selecting a feature and then randomly selecting a split value between the minimum and maximum values of that feature. The reasoning is as follows: isolating an anomalous observation is easier because only a few conditions are needed to separate it from the normal observations, whereas isolating a normal observation requires more conditions. Therefore, an anomaly score can be calculated as the number of conditions required to separate a given observation.

#The way that the algorithm constructs the separation is by first creating isolation trees, or random decision trees. Then, the score is calculated as the path length to isolate the observation.

## 2. Local Outlier Factor (LOF) Algorithm

#The LOF algorithm is an unsupervised outlier detection method which computes the local density deviation of a given data point with respect to its neighbors. It considers as outlier samples that have a substantially lower density than their neighbors.

#The number of neighbors considered (parameter n_neighbors) is typically chosen 1) greater than the minimum number of objects a cluster has to contain, so that other objects can be local outliers relative to this cluster, and 2) smaller than the maximum number of close-by objects that can potentially be local outliers. In practice, such information is generally not available, and taking n_neighbors=20 appears to work well in general.


In [None]:
import sklearn
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report,accuracy_score




In [None]:
# Outlier detectors to compare, keyed by the display name used when reporting results.
classifiers = {
    # Isolation Forest: 100 trees; max_samples=len(X) lets every tree draw from
    # the whole sample. Seeded from the shared `state` generator.
    "Isolation Forest":IsolationForest(n_estimators=100, max_samples=len(X), 
                                       random_state=state, verbose=0),
    # Local Outlier Factor over the 20 nearest neighbours
    # (Minkowski metric with p=2, i.e. Euclidean distance).
    "Local Outlier Factor":LocalOutlierFactor(n_neighbors=20, algorithm='auto', 
                                              leaf_size=30, metric='minkowski',
                                              p=2, metric_params=None),
    # One-class SVM with an RBF kernel.
    # `random_state` is deliberately NOT passed: OneClassSVM does not accept it
    # in current scikit-learn (passing it raises TypeError), and older releases
    # only ignored it with a deprecation warning, so results are unaffected.
    "Support Vector Machine":OneClassSVM(kernel='rbf', degree=3, gamma=0.1,nu=0.05, 
                                         max_iter=-1)}

In [None]:
# Number of fraud rows in the full frame.
# NOTE(review): not used below, and counted on `df` rather than on the sample behind X/Y — confirm intent.
n_outliers = len(fraud)

for clf_name, clf in classifiers.items():
    # Fit the detector on X and tag every row (+1 = inlier, -1 = outlier).
    if clf_name == "Local Outlier Factor":
        # LOF fits and labels the training data in a single call.
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        if clf_name != "Support Vector Machine":
            # Raw anomaly scores are only collected for the Isolation Forest.
            scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)

    # Recode the labels to match Y: 0 for valid transactions, 1 for fraud.
    y_pred = (y_pred == -1).astype(y_pred.dtype)
    n_errors = (Y != y_pred).sum()

    # Report the error count and the classification metrics.
    print(f"{clf_name}: {n_errors}")
    print("Accuracy Score :", accuracy_score(Y, y_pred), sep="\n")
    print("Classification Report :", classification_report(Y, y_pred), sep="\n")