Isolation forest implementation with 3 preprocessed datasets:

1. PCA features
2. Correlation features
3. Data time features (time series)

In [1]:
#importing relevant libraries
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

from imblearn.over_sampling import RandomOverSampler

In [2]:
#ounting to a google drive for the datasets
drive.mount('/content/drive')

Mounted at /content/drive


#PCA dataset

In [3]:
#data reading
df_pca = pd.read_csv('/content/drive/My Drive/DATA607_Project/master_dataset_pca_features.csv')
df_pca

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,PCA_40,PCA_41,PCA_42,PCA_43,PCA_44,PCA_45,PCA_46,PCA_47,PCA_48,PCA_49
0,0,86400,68.50,4,10095,0,43,2,39,2,...,-0.045024,0.048300,0.056854,0.012628,0.181536,-0.167985,0.126458,0.496288,0.335819,-0.236719
1,0,86401,29.00,4,1372,304,43,3,3,2,...,0.095783,0.173252,0.199942,0.074374,0.161896,-0.218850,0.091291,0.348468,0.067023,-0.252453
2,0,86469,59.00,4,2833,390,43,4,59,3,...,-0.099595,-0.064076,-0.061770,0.032711,0.100366,-0.132302,0.072217,0.007189,0.027657,-0.111489
3,0,86499,50.00,4,13341,467,43,3,15,3,...,-0.740266,0.085678,-0.648844,-1.026561,1.054275,0.582103,1.188664,-0.052836,-0.920947,-0.141513
4,0,86506,50.00,1,2712,414,43,3,3,2,...,-0.070060,-0.030934,-0.020249,0.107135,-0.027593,-0.133552,0.024145,0.120825,0.103854,0.091361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,15811047,49.00,4,4305,0,43,4,109,3,...,-0.160291,-0.443826,-0.505521,0.175586,-0.573761,0.323172,-0.120522,-0.892605,-0.385490,0.210411
590536,0,15811049,39.50,4,7354,125,43,3,107,3,...,-0.099595,-0.064076,-0.061770,0.032711,0.100366,-0.132302,0.072217,0.007189,0.027657,-0.111489
590537,0,15811079,30.95,4,8621,495,43,3,107,3,...,-0.077440,-0.185963,0.024062,-0.036038,0.170595,-0.082324,0.099207,-0.128432,0.087943,-0.087698
590538,0,15811088,117.00,4,5297,381,43,3,107,3,...,0.299672,-0.224988,0.923288,-0.368491,1.805792,-1.478866,-1.812061,0.816851,1.522625,-0.203661


In [4]:
#data splitting - no cross validation used therefore we only have a train test split and not a validation set
pca_train, pca_test = np.split(df_pca.sample(frac=1), [int(0.7 * len(df_pca))])

  return bound(*args, **kwds)


In [5]:
#preparing train and test sets
y_train = pca_train.pop("isFraud")
X_train = pca_train
y_test = pca_test.pop("isFraud")
X_test = pca_test

Training

In [6]:
#defining our model and model fitting
model =  IsolationForest(n_estimators=100, contamination='auto', max_features = 50, random_state=42)
model.fit(X_train)  #unsupervised algorithm therefore no labels passed

Prediction

In [7]:
#performing predictions on test set
scores_prediction = model.decision_function(X_test)
y_pred = model.predict(X_test)
y_pred[y_pred == 1] = 0   # 1 means closer to normal - not an outlier/anomaly
y_pred[y_pred == -1] = 1 # -1 means an outlier/anomaly

print("Accuracy in finding anomaly:",accuracy_score(y_test ,y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy in finding anomaly: 0.9402129124755874
ROC AUC Score: 0.6128071042827972
              precision    recall  f1-score   support

           0       0.97      0.96      0.97    171029
           1       0.21      0.26      0.23      6133

    accuracy                           0.94    177162
   macro avg       0.59      0.61      0.60    177162
weighted avg       0.95      0.94      0.94    177162



#Correlation dataset

In [8]:
#data reading
df_corr = pd.read_csv('/content/drive/My Drive/DATA607_Project/master_dataset_corr_features.csv')
df_corr

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.50,4,10095,500,42,1,38,1,...,130,4,260,4,2,2,2,2,2,1786
1,0,86401,29.00,4,1372,303,42,2,2,1,...,130,4,260,4,2,2,2,2,2,1786
2,0,86469,59.00,4,2833,389,42,3,58,2,...,130,4,260,4,2,2,2,2,2,1786
3,0,86499,50.00,4,13341,466,42,2,14,2,...,130,4,260,4,2,2,2,2,2,1786
4,0,86506,50.00,1,2712,413,42,2,2,1,...,123,3,164,3,1,0,1,1,1,954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,15811047,49.00,4,4305,500,42,3,108,2,...,130,4,260,4,2,2,2,2,2,1786
590536,0,15811049,39.50,4,7354,124,42,2,106,2,...,130,4,260,4,2,2,2,2,2,1786
590537,0,15811079,30.95,4,8621,494,42,2,106,2,...,130,4,260,4,2,2,2,2,2,1786
590538,0,15811088,117.00,4,5297,380,42,2,106,2,...,130,4,260,4,2,2,2,2,2,1786


In [9]:
#data splitting - no cross validation used therefore we only have a train test split and not a validation set
corr_train, corr_test = np.split(df_corr.sample(frac=1), [int(0.7 * len(df_corr))])

  return bound(*args, **kwds)


In [10]:
#preparing train and test sets
y_train = corr_train.pop("isFraud")
X_train = corr_train
y_test = corr_test.pop("isFraud")
X_test = corr_test

Training

In [11]:
#defining our model and model fitting
model =  IsolationForest(n_estimators=100, contamination='auto', max_features = 50, random_state=42)  #setting contamination to "auto" because we do not know the expected proportion of outliers
model.fit(X_train) #unsupervised algorithm therefore no labels passed

Prediction

In [12]:
#performing predictions on test set
scores_prediction = model.decision_function(X_test)
y_pred = model.predict(X_test)
y_pred[y_pred == 1] = 0   # 1 means closer to normal - not an outlier/anomaly
y_pred[y_pred == -1] = 1 # -1 means an outlier/anomaly

print("Accuracy in finding anomaly:",accuracy_score(y_test ,y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy in finding anomaly: 0.9278231223400052
ROC AUC Score: 0.6193595786286142
              precision    recall  f1-score   support

           0       0.97      0.95      0.96    170991
           1       0.17      0.29      0.22      6171

    accuracy                           0.93    177162
   macro avg       0.57      0.62      0.59    177162
weighted avg       0.95      0.93      0.94    177162



#Time series dataset

In [2]:
df_dt_label = pd.read_csv('/content/drive/My Drive/DATA607_Project/master_dataset_DTfeatures_LabelEncoded.csv')
df_dt_label

Unnamed: 0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,PCA_47,PCA_48,PCA_49,hour,dayofweek,quarter,month,dayofyear,dayofmonth,weekofyear
0,0,68.50,4,10095,0,43,2,39,2,167,...,0.496288,0.335819,-0.236719,0,4,4,12,335,1,48
1,0,29.00,4,1372,304,43,3,3,2,174,...,0.348468,0.067023,-0.252453,0,4,4,12,335,1,48
2,0,59.00,4,2833,390,43,4,59,3,179,...,0.007189,0.027657,-0.111489,0,4,4,12,335,1,48
3,0,50.00,4,13341,467,43,3,15,3,283,...,-0.052836,-0.920947,-0.141513,0,4,4,12,335,1,48
4,0,50.00,1,2712,414,43,3,3,2,242,...,0.120825,0.103854,0.091361,0,4,4,12,335,1,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,49.00,4,4305,0,43,4,109,3,133,...,-0.892605,-0.385490,0.210411,23,3,2,5,151,31,22
590536,0,39.50,4,7354,125,43,3,107,3,79,...,0.007189,0.027657,-0.111489,23,3,2,5,151,31,22
590537,0,30.95,4,8621,495,43,3,107,3,99,...,-0.128432,0.087943,-0.087698,23,3,2,5,151,31,22
590538,0,117.00,4,5297,381,43,3,107,3,220,...,0.816851,1.522625,-0.203661,23,3,2,5,151,31,22


In [3]:
def datasplit(dataframe, oversample=False):
    #setting data thresholds
    train_df = dataframe[(dataframe["month"] < 5) | (dataframe["month"] == 12)]    # setting first 5 months for training
    test_df = dataframe[dataframe["month"] == 5]  #  setting the last month for testing

    #splitting data for supervised learning
    X_train, y_train = train_df.drop(columns=["isFraud"]), train_df["isFraud"]
    X_test, y_test = test_df.drop(columns=["isFraud"]), test_df["isFraud"]

    #for unsupervised learning, drop 'isFraud' column from X_train and X_test
    X_train_unsupervised = X_train.drop(columns=["isFraud"], errors="ignore")
    X_test_unsupervised = X_test.drop(columns=["isFraud"], errors="ignore")

    #handling oversampling if needed
    if oversample:
        ros = RandomOverSampler()
        X_train, y_train = ros.fit_resample(X_train, y_train)

    return X_train_unsupervised, X_test_unsupervised, y_train, y_test

In [4]:
#data splitting - no cross validation used therefore we only have a train test split and not a validation set
X_train_unsupervised, X_test_unsupervised, y_train, y_test = datasplit(df_dt_label, oversample=True)

Training

In [6]:
#defining our model and model fitting
model =  IsolationForest(n_estimators=100, contamination='auto', max_features = 50, random_state=42)  #setting contamination to "auto" because we do not know the expected proportion of outliers
model.fit(X_train_unsupervised) #unsupervised algorithm therefore no labels passed

Prediction

In [7]:
#performing predictions on test set
scores_prediction = model.decision_function(X_test_unsupervised)
y_pred = model.predict(X_test_unsupervised)
y_pred[y_pred == 1] = 0   # 1 means closer to normal - not an outlier/anomaly
y_pred[y_pred == -1] = 1 # -1 means an outlier/anomaly

print("Accuracy in finding anomaly:",accuracy_score(y_test ,y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy in finding anomaly: 0.9228556075498735
ROC AUC Score: 0.6091811030811383
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     86212
           1       0.15      0.27      0.20      3114

    accuracy                           0.92     89326
   macro avg       0.56      0.61      0.58     89326
weighted avg       0.94      0.92      0.93     89326



##Original master dataset

I was unable to perform isolation forest on the original dataset because it was computationally taxing.