### Notebook that prepares data for the SFI Masterclass



Writes to `../data` (bearing in mind that GitHub's maximum file size is ~100 MB)

In [None]:
import os
import sys
import pandas as pd
import random
import numpy as np

### Unsupervised: creditcard data

https://www.kaggle.com/mlg-ulb/creditcardfraud

In [None]:
N = 30000 # Final number of samples (rows kept by the downsampling step below)

In [None]:
# Load the credit card fraud dataset (Kaggle link above). The path is relative,
# so it resolves against the notebook's current working directory.
df = pd.read_csv(r'../bigdata/creditcard.csv')

In [None]:
# df = df.drop(columns=['Time'])

In [None]:
import seaborn as sns

# Visual check: pairwise correlations between all columns, drawn as a heatmap.
correlations = df.corr()
sns.heatmap(correlations)

In [None]:
# Downsample to N rows and obfuscate the data.
# A fixed random_state keeps the drawn sample reproducible across runs, in line
# with the seeded column shuffle performed further down.
df_sampled = df.sample(N, random_state=2).reset_index(drop=True)
# Overwrite the sampled timestamps with those of the first N rows of the full
# frame. Both sides carry a 0..N-1 index at this point, so the assignment
# aligns row by row.
df_sampled['Time'] = df.head(N)['Time']

In [None]:
df_sampled.shape

In [None]:
# Cast every column (including 'Time' and 'Class') to half precision,
# presumably to keep the CSV written below small.
# NOTE(review): float16 keeps roughly 3 significant digits and overflows above
# 65504, so large 'Time' / 'Amount' values are coarsened — confirm acceptable.
df_sampled = df_sampled.astype('float16')

In [None]:
# Obfuscate the feature identity: permute the 'V' columns with a fixed seed,
# keep 'Time' first and the last two columns last, then put the original
# header back on top so the permuted data carries the old labels.
original_column_names = df_sampled.columns.tolist()
new_v_columns = [name for name in original_column_names if 'V' in name]
random.seed(2)
random.shuffle(new_v_columns)
trailing_columns = original_column_names[-2:]
df_sampled = df_sampled[['Time', *new_v_columns, *trailing_columns]]
df_sampled.columns = original_column_names

In [None]:
# Label: the 'Class' column as integers. Features: every column except the last.
y = df_sampled['Class'].astype(int)
X = df_sampled.iloc[:, :-1]

In [None]:
# Persist features and labels as zip-compressed CSV files, without the index.
for frame, target_path in (
    (X, r'../data/X_unsupervised.csv.zip'),
    (y, r'../data/y_unsupervised.csv.zip'),
):
    frame.to_csv(target_path, compression='zip', index=False)

In [None]:
# Read the files that were just written back in as a sanity check.
# NOTE: X_test / y_test here hold the re-loaded unsupervised data; the same
# names are reassigned to the supervised test split later in this notebook.
X_test = pd.read_csv(r'../data/X_unsupervised.csv.zip')

In [None]:
# Peek at the first rows of the re-loaded features.
X_test.head()

In [None]:
# Dimensions of the re-loaded features.
X_test.shape

In [None]:
# Re-load the labels written above.
y_test = pd.read_csv(r'../data/y_unsupervised.csv.zip')

In [None]:
# Dimensions of the re-loaded labels.
y_test.shape

### Supervised: Elliptic dataset

- Take all transactions (licit, illicit, unknown): fits within 50 MB with precautions
- Split into train and test (test samples have later time stamps than train samples)

In [None]:
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
# Load the transaction labels.
# NOTE(review): absolute, machine-specific path — adjust to the local location
# of the Elliptic dataset before running.
classes = pd.read_csv(r'/Users/ernstoldenhof/Projects/DATA/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')

In [None]:
# edges = pd.read_csv(r'/Users/ernstoldenhof/Projects/DATA/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')

In [None]:
# The features file is read without a header row, so its columns get integer
# labels (0, 1, 2, ...) until they are renamed below.
# NOTE(review): same machine-specific absolute path as above.
features = pd.read_csv(r'/Users/ernstoldenhof/Projects/DATA/elliptic_bitcoin_dataset/elliptic_txs_features.csv',
                      header=None)

In [None]:
# Absolute number of rows per label value.
classes['class'].value_counts(normalize=False)

In [None]:
# Dimensions of the raw feature table.
features.shape


In [None]:
# Map the integer column labels (0..166) that result from reading the features
# file with header=None to readable names: column 0 is the transaction id,
# column 1 the time step, followed by 93 local and 72 aggregated features.
# Keys are built as ints directly, so no str -> int conversion pass is needed.
colNames1 = {0: 'txId', 1: "Time step"}
colNames2 = {ii + 2: "local_feat_" + str(ii + 1) for ii in range(93)}
colNames3 = {ii + 95: "agg_feat_" + str(ii + 1) for ii in range(72)}

colNames = {**colNames1, **colNames2, **colNames3}

In [None]:
# Apply the readable column names built above (integer label -> name).
features = features.rename(columns=colNames)

In [None]:
# Inspect the first rows to confirm the new column names.
features.head()

In [None]:
## First option: only positive and negative. Second: all

# Inner-join the labels onto the features by transaction id, keeping only the
# rows whose class is not 'unknown'.
# NOTE(review): the section intro says all transactions (including unknown) are
# taken, while the active line drops them — confirm which option is intended.
labelled_classes = classes[classes['class'] != 'unknown']
features = features.merge(labelled_classes, on='txId', how='inner')
# features = features.merge(classes, how='inner', on='txId')

In [None]:
# Encode the string labels as small integers: '1' -> 1, '2' -> 0, 'unknown' -> -1.
features['class'] = features['class'].map({'1': 1, '2': 0, 'unknown': -1}).astype(np.int8)
# Store the feature columns (everything between 'Time step' and 'class') as
# float16. Assigning through `.iloc[:, 2:-1] = ...` writes the values into the
# existing float64 columns in recent pandas versions, so the dtype would stay
# float64; astype with a per-column mapping makes the float16 dtype stick.
feature_cols = features.columns[2:-1]
features = features.astype({col: np.float16 for col in feature_cols})


In [None]:
# Find a suitable cut-off time step for the train/test split: count rows per
# (time step, class), add a per-step total, and show the cumulative share of
# each column over time.
counts_by_step = features.groupby(['Time step', 'class']).size()
time_class_counts = counts_by_step.unstack()
time_class_counts['sum'] = time_class_counts.sum(axis=1)
time_class_counts.cumsum() / time_class_counts.sum() # Time 37 seems okay for cutoff

In [None]:
# Temporal split: time steps up to and including 37 form the training set,
# later time steps the test set. The printed value is the training fraction.
time_step = features['Time step']
features_train = features[time_step <= 37]
features_test = features[time_step > 37]
print(len(features_train) / len(features))

In [None]:
# Separate the label column from the remaining columns for both splits.
y_train, y_test = features_train['class'], features_test['class']
X_train = features_train.drop(columns='class')
X_test = features_test.drop(columns='class')


In [None]:
# Persist both splits as zip-compressed CSV files, without the index.
for frame, target_path in (
    (X_train, r'../data/X_train_supervised.csv.zip'),
    (y_train, r'../data/y_train_supervised.csv.zip'),
    (X_test, r'../data/X_test_supervised.csv.zip'),
    (y_test, r'../data/y_test_supervised.csv.zip'),
):
    frame.to_csv(target_path, compression='zip', index=False)

In [None]:
# Dimensions of the training features.
X_train.shape

In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Display the test labels.
y_test