#### Notebook that prepares data for the SFI Masterclass

https://www.kaggle.com/mlg-ulb/creditcardfraud

Writes the prepared data to ../data (keeping in mind that the maximum size for a data file on GitHub is ~100 MB)

In [None]:
import os
import sys
import pandas as pd
import random
import numpy as np

In [None]:
# Load the raw credit-card dataset (see the Kaggle link at the top of the notebook).
raw_csv_path = r'../bigdata/creditcard.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Remove the 'Time' column; only the remaining columns are used below.
df = df.drop(labels='Time', axis='columns')

In [None]:
# Downsample to 100,000 rows (first step of shrinking and obfuscating the data).
# A fixed random_state makes the drawn sample reproducible between notebook runs;
# seeding the stdlib `random` module does not influence pandas' sampling.
df_sampled = df.sample(n=100000, random_state=2).reset_index(drop=True)


In [None]:
# Cast every column to half precision (presumably part of the obfuscation step;
# this also reduces the precision of the stored values).
df_sampled = df_sampled.astype(np.float16)

In [None]:
# Obfuscate which feature is which: shuffle the order of the V* columns and then
# relabel the shuffled frame with the original column names, so a column labelled
# e.g. "V1" no longer necessarily holds the original V1 values.
original_column_names = list(df_sampled.columns)
new_v_columns = [col for col in original_column_names if col.startswith('V')]
random.seed(2)  # fixed seed so the column permutation is reproducible
random.shuffle(new_v_columns)
# Select from df_sampled (not the full df) so that the downsampling and the
# float16 cast performed earlier are kept. The last two columns stay in place.
df_sampled = df_sampled[new_v_columns + original_column_names[-2:]]
df_sampled.columns = original_column_names

In [None]:
# Features: every column except the last one. Labels: the Class column as integers.
n_feature_columns = df_sampled.shape[1] - 1
X = df_sampled.iloc[:, :n_feature_columns]
y = df_sampled['Class'].astype(int)

In [100]:
# Write features and labels as uncompressed CSV files into ../data.
for frame_to_write, csv_target in ((X, r'../data/X.csv'), (y, r'../data/y.csv')):
    frame_to_write.to_csv(csv_target, compression=None)

In [None]:
# Sanity check: read back the feature file written by this notebook.
# The export cell writes an uncompressed ../data/X.csv with the index as its first
# column, so read that file (a .zip is never created here) and restore the index.
X_test = pd.read_csv(r'../data/X.csv', index_col=0)

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import MinCovDet, EmpiricalCovariance
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture


In [None]:
sys.path.append(r'../.')
!pip install seaborn
from outlierutils import plot_top_N, plot_outlier_scores

In [None]:
# Estimate the covariance of the features and score each row by its
# Mahalanobis distance (scikit-learn documents the result as squared distances).
# For a robust estimate, MinCovDet() can be used in place of EmpiricalCovariance().
cov_ = EmpiricalCovariance()
cov_.fit(X)
mahalonobis_scores = cov_.mahalanobis(X)


In [None]:
# Put the scores on a log10 scale before plotting.
# NOTE: this overwrites mahalonobis_scores, so re-running the cell applies the
# logarithm a second time.
mahalonobis_scores = np.log10(mahalonobis_scores)
res = plot_outlier_scores(
    y.values,
    mahalonobis_scores,
    bw=0.1,
    title='Mahalonobis',
)

In [None]:
# Compare the true labels with the (log-scaled) Mahalanobis scores via plot_top_N,
# presumably looking at the 100 highest-scoring rows.
res = plot_top_N(
    y.values,
    mahalonobis_scores,
    N=100,
)

In [None]:
# Fit a 5-component Gaussian mixture with full covariance matrices
# (covariance_type='spherical' is worth trying as well).
gmm = GaussianMixture(
    n_components=5,
    covariance_type='full',
    random_state=1,
)
gmm.fit(X)
# Negate the per-sample score so that less likely rows receive higher values.
gmm_scores = -gmm.score_samples(X)

In [None]:
# Optional: clip extreme scores before plotting.
# gmm_scores = np.clip(gmm_scores, -15, 50)
# The scores are shifted by 100 before taking log10 (assumes gmm_scores > -100,
# otherwise the logarithm yields NaN - confirm).
# Title corrected: the previous text named a different dataset and method
# ('Pen digits, Mahalonobis'), while these are GMM scores on the credit-card data.
res = plot_outlier_scores(y.values, np.log10(gmm_scores + 100), bw=0.1, title='Credit card fraud, GMM')

In [None]:
# Compare the true labels with the GMM scores via plot_top_N,
# presumably looking at the 100 highest-scoring rows.
res = plot_top_N(
    y.values,
    gmm_scores,
    N=100,
)