[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io)

# Anomaly Detection - Isolation Forest

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 0.1.000 | 25/10/2022 | Royi Avital | First version                                                      |
|         |            |             |                                                                    |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/IntroductionMachineLearningSystemEngineers/AnomalyDetectorIsolationForest.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import average_precision_score, auc, confusion_matrix, f1_score, precision_recall_curve, roc_curve, ConfusionMatrixDisplay, PrecisionRecallDisplay

# Misc
import datetime
import os
from platform import python_version
import random
import warnings
import yaml

# Typing
from typing import Tuple

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter
from IPython import get_ipython
from IPython.display import Image, display
from ipywidgets import Dropdown, FloatSlider, interact, IntSlider, Layout

In [None]:
# Configuration
#%matplotlib inline

# Suppress all warnings from this point on
warnings.filterwarnings(action = "ignore")

# Single seed shared by Python's `random` module and NumPy's global generator
seedNum = 512
random.seed(seedNum)
np.random.seed(seedNum)

# sns.set_theme() #>! Apply SeaBorn theme
sns.set_palette(palette = "tab10")

# True when `google.colab` appears in the string form of the active IPython shell
runInGoogleColab = ('google.colab' in str(get_ipython()))

In [None]:
# Constants

# Figure geometry
FIG_SIZE_DEF    = (8, 8) #<! Default figure size (Passed as `figsize` to `plt.subplots()`)

# Element geometry (Not referenced by the cells of this notebook, presumably shared course defaults)
ELM_SIZE_DEF    = 50
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

# Colors (Not referenced by the cells of this notebook, presumably shared course defaults)
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'


In [None]:
# Fixel Algorithms Packages


In [None]:
# Parameters

# Data: The local file is tried first, the URL is the fall back
csvFilePath = '../DataSets/creditcard.csv'
csvFileUrl  = 'https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv' #<! Seems to be a lower accuracy representation (Less digits), Yet available on GitHub

# Models
numTrees = 50 #<! Number of trees (`n_estimators`) of both forests

# ROC
numGrdiPts = 201 #<! Number of points of the uniform False Positive Rate grid

In [None]:
# Auxiliary Functions




## Generate Data


In [None]:
# Generate / Load Data

# Read the local CSV file when it exists, otherwise read the CSV from the URL
dfData = pd.read_csv(csvFilePath if os.path.isfile(csvFilePath) else csvFileUrl)


In [None]:
# Preview the first 5 rows of the data frame
dfData.head(n = 5)

### Analysis of the Data

Pay attention: in order to keep the data private, the features were mixed (probably by a PCA-like procedure).  
Hence the features are not "physical".

In [None]:
# Bar per class showing the number of samples with that label
hF, hA = plt.subplots(nrows = 1, ncols = 1, figsize = FIG_SIZE_DEF)
sns.countplot(data = dfData, x = 'Class', ax = hA)
hA.set(title = 'The Balance of Classes in Data')
plt.show()

In [None]:
# Relative frequency of each class label (Sums to 1)
dsClassBalance = dfData['Class'].value_counts(normalize = True)
# Same data as a plain dictionary: {label: ratio}
dClassBalance = dict(zip(dsClassBalance.index.tolist(), dsClassBalance.tolist()))

In [None]:
# Histogram of the labels in percent, the class ratios are embedded in the title
hF, hA = plt.subplots(nrows = 1, ncols = 1, figsize = FIG_SIZE_DEF)
sns.histplot(data = dfData, x = 'Class', stat = 'percent', discrete = True, shrink = 0.8, ax = hA)
# sns.histplot(data = dfData, x = 'Class', discrete = True, shrink = .8, ax = hA)
hA.set(xticks = [0, 1], title = f'The Balance of Classes in Data: {dClassBalance}')
plt.show()

Basically, class 1 (the fraudulent transactions) is the set of anomalies in the data.

In [None]:
# Data for Processing

# Features: All columns but `Time` and `Class`. Labels: The `Class` column
mX = dfData.drop(columns = ['Time', 'Class']).to_numpy()
vY = dfData['Class'].to_numpy()
(mX.shape, vY.shape)

## Detect Outliers / Anomalies

We'll detect anomalies using 2 methods:

1. Supervised: Random Forest Classifier.
2. UnSupervised: Isolation Forest.

In [None]:
# Random Forest Classifier (Supervised)
# The out of bag samples (Those the i-th tree was not trained on) are used to estimate the actual score on the data set
oRndForest = RandomForestClassifier(n_estimators = numTrees, oob_score = True, n_jobs = -1, random_state = seedNum) #<! `n_jobs = -1`: Accelerated by running in parallel
oRndForest = oRndForest.fit(mX, vY) #<! `fit()` returns the fitted estimator

In [None]:
# Isolation forest (UnSupervised)
# The labels `vY` are not used for the fit
oIsoForest = IsolationForest(n_estimators = numTrees, n_jobs = -1, random_state = seedNum)
oIsoForest = oIsoForest.fit(mX) #<! `fit()` returns the fitted estimator

### Analysis of the ROC

In [None]:
# Scores fed to the ROC / PR analysis with `pos_label = 1`: A larger value should point toward label 1
vScoreRF = oRndForest.oob_decision_function_[:, 1] #<! Out of bag score for Label 1
vScoreIF = np.negative(oIsoForest.decision_function(mX)) #<! Sign flipped to match the orientation of the RF score

In [None]:
# Number of distinct values of the Random Forest score
np.unique(vScoreRF).size

In [None]:
# ROC curve per model: False Positive Rate, True Positive Rate and the matching thresholds
vFP_RF, vTP_RF, vThersholdRF = roc_curve(y_true = vY, y_score = vScoreRF, pos_label = 1)
vFP_IF, vTP_IF, vThersholdIF = roc_curve(y_true = vY, y_score = vScoreIF, pos_label = 1)

# Area under each ROC curve
AUC_RF = auc(x = vFP_RF, y = vTP_RF)
AUC_IF = auc(x = vFP_IF, y = vTP_IF)

In [None]:
# ROC curves of both models with the diagonal as a reference line
hF, hA = plt.subplots(nrows = 1, ncols = 1, figsize = FIG_SIZE_DEF)
hA.plot(vFP_RF, vTP_RF, color = 'b', linewidth = 3, label = f'RF  AUC = {AUC_RF :.3f} (Out of Bag Score)')
hA.plot(vFP_IF, vTP_IF, color = 'r', linewidth = 3, label = f'IF  AUC = {AUC_IF :.3f}')
hA.plot([0, 1], [0, 1], color = 'k', linewidth = 2, linestyle = '--')
hA.set(title = 'ROC', xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
hA.axis('equal')
hA.legend()
hA.grid()
plt.show()

In this example an UnSupervised method puts up a fight against a supervised method!
At least judging by the ROC / AUC. Is that for real? Let's have a look at the _Confusion Matrix_.

## Compare Performance for Different Threshold Levels

In [None]:
# Uniform grid of False Positive Rate values in [0, 1] (End point included)
v             = np.linspace(start = 0, stop = 1, num = numGrdiPts)
# Threshold per grid point, linearly interpolated along each model's ROC curve
vThersholdRF2 = np.interp(x = v, xp = vFP_RF, fp = vThersholdRF)
vThersholdIF2 = np.interp(x = v, xp = vFP_IF, fp = vThersholdIF)

In [None]:
def PlotConfusionMatrices(thr):
    """
    Plots the ROC curves of both models with the selected working point marked,
    next to the confusion matrix (And F1 score) of each model at that working point.
    Input:
      - thr     - Index into the threshold grids `vThersholdRF2` / `vThersholdIF2`.
                  The matching False Positive Rate is `thr / (numGrdiPts - 1)`.
    Remarks:
      - Reads the notebook level variables: `vY`, `vScoreRF`, `vScoreIF`, the ROC
        vectors (`vFP_*`, `vTP_*`), `AUC_RF`, `AUC_IF` and `numGrdiPts`.
      - A sample is labeled positive when its score is at or above the threshold.
        This is the convention `roc_curve()` uses for the thresholds it returns,
        hence the confusion matrix matches the working point on the ROC curve.
    """

    thrRF    = vThersholdRF2[thr]
    thrIF    = vThersholdIF2[thr]
    vHatY_RF = vScoreRF >= thrRF #<! `>=` (Not `>`): Samples exactly at the threshold are positive
    vHatY_IF = vScoreIF >= thrIF

    mC_RF = confusion_matrix(vY, vHatY_RF)
    mC_IF = confusion_matrix(vY, vHatY_IF)

    # Left half of the figure: ROC curves + vertical line at the selected False Positive Rate
    fig = plt.figure(figsize = (12, 8))
    ax  = fig.add_subplot(1, 2, 1)
    ax.plot(vFP_RF, vTP_RF, color='b', lw=3, label=f'RF AUC = {AUC_RF :.3f} (Out of Bag Score)') #<! `vScoreRF` is the out of bag score
    ax.plot(vFP_IF, vTP_IF, color='r', lw=3, label=f'IF AUC = {AUC_IF :.3f}')
    ax.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
    ax.axvline(x = thr / (numGrdiPts - 1), color='g', lw=2, linestyle='--')
    ax.set_title ('ROC')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.axis      ('equal')
    ax.legend    ()
    ax.grid      ()

    # Right column of a 2x3 grid: Random Forest on top, Isolation Forest at the bottom
    axRF = fig.add_subplot(2, 3, 3)
    axIF = fig.add_subplot(2, 3, 6)

    ConfusionMatrixDisplay(mC_RF, display_labels=['Normal', 'Fraud']).plot(ax=axRF)
    ConfusionMatrixDisplay(mC_IF, display_labels=['Normal', 'Fraud']).plot(ax=axIF)
    axRF.set_title('Random Forest   \n' f'f1_score = {f1_score(vY, vHatY_RF):1.4f}')
    axIF.set_title('Isolation Forest\n' f'f1_score = {f1_score(vY, vHatY_IF):1.4f}')
    plt.show        ()
    
# Slider over the indices of the threshold grid, each change re-draws the plots
thrSlider = IntSlider(value = 0, min = 0, max = numGrdiPts - 1, step = 1, layout = Layout(width = '30%'))
interact(PlotConfusionMatrices, thr = thrSlider)
plt.show()

It seems the Random Forest scores are highly quantized (there is little ability to play with the threshold).

### View by Precision Recall Curve

For highly imbalanced data, the Precision Recall Curve is usually a better tool to analyze performance.

**Remark**: The _Precision Recall Curve_ isn't guaranteed to be monotonic.

In [None]:
# Precision, Recall and thresholds per model
vPR_RF, vRE_RF, vThersholdPrReRF = precision_recall_curve(vY, vScoreRF, pos_label = 1)
vPR_IF, vRE_IF, vThersholdPrReIF = precision_recall_curve(vY, vScoreIF, pos_label = 1)

# Average Precision Score, somewhat equivalent to the AUC for the PR Curve
AUC_PrReRF = average_precision_score(y_true = vY, y_score = vScoreRF, pos_label = 1)
AUC_PrReIF = average_precision_score(y_true = vY, y_score = vScoreIF, pos_label = 1)

In [None]:
# Precision (y axis) as a function of Recall (x axis) for both models
hF, hA = plt.subplots(nrows = 1, ncols = 1, figsize = FIG_SIZE_DEF)
hA.plot(vRE_RF, vPR_RF, color = 'b', linewidth = 3, label = f'RF  Average Precision = {AUC_PrReRF :.3f} (Out of Bag Score)')
hA.plot(vRE_IF, vPR_IF, color = 'r', linewidth = 3, label = f'IF  Average Precision = {AUC_PrReIF :.3f}')
hA.set(title = 'Precision Recall Curve', xlabel = 'Recall', ylabel = 'Precision')
hA.axis('equal')
hA.legend()
hA.grid()
plt.show()

In [None]:
# Same curves drawn by SciKit Learn's display helper, both on the same axes
hF, hA = plt.subplots(nrows = 1, ncols = 1, figsize = FIG_SIZE_DEF)
PrecisionRecallDisplay.from_predictions(vY, vScoreIF, name = 'Isolation Forest', ax = hA)
PrecisionRecallDisplay.from_predictions(vY, vScoreRF, name = 'Random Forest', ax = hA)
hA.set(title = 'Precision Recall Curve')
hA.axis('equal')
hA.grid()
plt.show()