# Dataset Generation

Import statements & function definitions:

In [7]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np

In [8]:
CUTOFF_LENGTH = 100

def create_sequences(group: pd.DataFrame, criteria, labels, cutoff_length=None):
    """Build fixed-length sliding windows of row ids for one group of transactions.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single group (e.g. one card, as passed by ``groupby(...).apply``).
        Only the index is read, and rows are taken in the order given -- no sorting
        is performed here, so the caller must supply them in the desired order.
    criteria : callable
        Called as ``criteria(seq, labels)`` for every window; the window is kept
        only when the result is truthy.
    labels
        Passed through unchanged to ``criteria``.
    cutoff_length : int, optional
        Window length. Defaults to the module-level ``CUTOFF_LENGTH``, looked up
        at call time.

    Returns
    -------
    list of list
        Every accepted window, each a list of ``cutoff_length`` consecutive index
        values, in order of window start. Empty when the group has fewer than
        ``cutoff_length`` rows.

    Raises
    ------
    ValueError
        If ``cutoff_length`` is not positive.
    """
    if cutoff_length is None:
        cutoff_length = CUTOFF_LENGTH
    if cutoff_length <= 0:
        raise ValueError("cutoff_length must be a positive integer")

    group_ids = group.index.values.tolist()

    # A group of n rows has exactly n - cutoff_length + 1 full windows, the last
    # one ending on the final row. range() of a non-positive count is empty, so
    # short groups simply produce no sequences.
    n_windows = len(group_ids) - cutoff_length + 1

    sequences = []
    for start in range(n_windows):
        seq = group_ids[start:start + cutoff_length]
        if criteria(seq, labels):
            sequences.append(seq)

    return sequences

## 1. Training set

Load training set:

In [9]:
# Load the preprocessed training transactions, using the CSV's 'Unnamed: 0' column as the row index.
df = pd.read_csv(
    '../../../data/modified/modified_fraudTrain.csv',
    index_col='Unnamed: 0',
)
df.head()


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,unix_time,...,merch_long,is_fraud,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,2019-01-01 00:00:18,146,585,12,-0.408741,1,29.0,93.0,-0.282429,1325376018,...,25.0,0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,2019-01-01 00:00:44,51,105,2,0.233378,1,62.0,50.0,-0.293527,1325376044,...,79.0,0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,2019-01-01 00:00:51,860,366,7,0.942184,-1,69.0,95.0,-0.280243,1325376051,...,73.0,0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,2019-01-01 00:01:16,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,1325376076,...,72.0,0,52,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,2019-01-01 00:03:06,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,1325376186,...,93.0,0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


### 1.1. Creating sequences
A sequence is a sliding window of `CUTOFF_LENGTH` consecutive transactions that all belong to a single entity (one credit card).

Sequences are created by calling ```DataFrame.groupby()``` to group the transactions by card; within each group the transactions keep the chronological order of the file.

In [10]:
labels = df['is_fraud'].to_numpy()
groups = df.groupby('cc_num')

# Generate every window once. The fraud / non-fraud sets are a partition of this list,
# so filtering it avoids two further full passes over all cards. The name deliberately
# avoids `all`, which would shadow the Python builtin for the rest of the session.
all_sequences = groups.apply(create_sequences, criteria=lambda sequences, labels: True, labels=labels)
all_sequences = [seq for subseqs in all_sequences for seq in subseqs]

# A sequence is classified by the label of its last transaction.
fraud = [seq for seq in all_sequences if labels[seq[-1]] == 1]
non_fraud = [seq for seq in all_sequences if labels[seq[-1]] == 0]

np.save('../../../data/train/sequences/fraud_train_seq', fraud)
np.save('../../../data/train/sequences/non_fraud_train_seq', non_fraud)

# Release the large intermediate lists; kernel memory persists across cells.
del fraud
del non_fraud
del all_sequences


### 1.2. Transforming transactions to numpy arrays

In [11]:
# Drop the raw timestamp columns and the label column from the frame (in place).
df.drop(columns=['trans_date_trans_time', 'unix_time', 'is_fraud'], inplace=True)
df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,146,585,12,-0.408741,1,29.0,93.0,-0.282429,49.0,25.0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,51,105,2,0.233378,1,62.0,50.0,-0.293527,57.0,79.0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,860,366,7,0.942184,-1,69.0,95.0,-0.280243,63.0,73.0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,7.0,72.0,52,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,58.0,93.0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


In [12]:
# Export the remaining columns as one array, plus the per-transaction fraud labels.
np.save(file='../../../data/train/transactions', arr=df.to_numpy())
np.save(file='../../../data/train/all_transaction_labels', arr=labels)


## 2. Test set

For the test set, splitting the sequences into fraud and non-fraud groups is not necessary, since the model will only predict single inputs.

In [13]:
# Load the preprocessed test transactions. No index_col is given, so pandas assigns a
# default 0..n-1 row index.
df = pd.read_csv('../../../data/modified/modified_fraudTest.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,...,merch_long,is_fraud,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,1296675,2020-06-21 12:14:25,288,442,9,-0.42199,-1,63.0,72.0,0.812151,...,93.0,0,52,1.426876,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
1,1296676,2020-06-21 12:14:33,157,392,9,-0.252575,1,62.0,100.0,-0.293019,...,82.0,0,30,-0.10296,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
2,1296677,2020-06-21 12:14:53,95,468,10,-0.18074,1,62.0,67.0,-0.179602,...,62.0,0,49,-0.297127,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
3,1296678,2020-06-21 12:15:15,460,504,11,-0.062878,-1,37.0,72.0,-0.112365,...,67.0,0,32,0.987213,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
4,1296679,2020-06-21 12:15:17,165,649,14,-0.419918,-1,47.0,42.0,-0.290286,...,32.0,0,65,-0.406467,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575


In [14]:
# Class balance of the test set: number of transactions per is_fraud value.
df['is_fraud'].value_counts()

is_fraud
0    553574
1      2145
Name: count, dtype: int64

Create the sequences per card:

In [15]:
# Transactions grouped by card, and the per-transaction fraud flags as a plain array.
groups = df.groupby(by='cc_num')
labels = df['is_fraud'].to_numpy()

In [16]:
# Every sliding window of every card (the criteria accepts all windows).
sequences = groups.apply(create_sequences, criteria=lambda sequences, labels: True, labels=labels)

# Flatten the per-card lists into one (n_sequences, CUTOFF_LENGTH) integer array. The
# explicit dtype and reshape keep it 2-D and usable as an index even when no card
# produced a window.
sequences = np.asarray(
    [seq for subseqs in sequences for seq in subseqs], dtype=int
).reshape(-1, CUTOFF_LENGTH)

# A sequence takes the label of its last transaction; look them all up in one
# vectorized indexing step instead of one Python call per row.
sequence_labels = labels[sequences[:, -1]]

np.save('../../../data/test/all_test_seq', sequences)
np.save('../../../data/test/all_test_seq_labels', sequence_labels)

del sequences
del sequence_labels

In [17]:
# Drop the raw timestamp columns, the label column and the 'Unnamed: 0' column (in place),
# then export the remaining columns and the per-transaction fraud labels.
# NOTE(review): the df.head() preview of this file also lists an 'Unnamed: 0.1' column that
# is not dropped here -- confirm whether it is meant to remain in the exported matrix.
df.drop(columns=['trans_date_trans_time', 'unix_time', 'is_fraud', 'Unnamed: 0'], inplace=True)

np.save(file='../../../data/test/transactions', arr=df.to_numpy())
np.save(file='../../../data/test/all_transaction_labels', arr=labels)
