# Dataset Generation

Import statements & function definitions:

In [1]:
import sys

sys.path.append('../')
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

## 1. Training set

Load training set:

In [2]:
df = pd.concat([pd.read_csv('../../data/raw/modified_fraudTrain.csv'), pd.read_csv('../../data/raw/modified_fraudTest.csv')], ignore_index=True)
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,...,merch_long,is_fraud,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,0,2019-01-01 00:00:18,146,585,12,-0.408741,1,29.0,93.0,-0.282429,...,25.0,0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,1,2019-01-01 00:00:44,51,105,2,0.233378,1,62.0,50.0,-0.293527,...,79.0,0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,2,2019-01-01 00:00:51,860,366,7,0.942184,-1,69.0,95.0,-0.280243,...,73.0,0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,3,2019-01-01 00:01:16,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,...,72.0,0,52,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,4,2019-01-01 00:03:06,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,...,93.0,0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


In [3]:
df.shape

(1852394, 22)

In [4]:
train, test = np.split(df,  [int(.7*len(df))])

In [5]:
print(len(train), len(test))

1296675 555719


In [6]:
CUTOFF_LENGTH=100

In [7]:
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)

In [8]:
for fold, (train_index, val_index) in enumerate(tscv.split(train)):
    print(len(val_index))

216112
216112
216112
216112
216112


## Train Sequences

In [9]:
for fold, (train_index, val_index) in enumerate(tscv.split(train)):
    
    # NOTE Train fold
    train_split = train.iloc[train_index]
    counter = 0
    output = np.empty((0, CUTOFF_LENGTH))
    output_targets = np.empty((0,))
    for _, group in train_split.groupby('cc_num'):
        if group.shape[0] < CUTOFF_LENGTH:
            continue    # groups too small can't create sequences
        
        group_labels = group['is_fraud'].to_numpy()
        group = group.to_numpy()
        
        seqs = np.array([group[i:i+CUTOFF_LENGTH, 0] for i in range(group.shape[0]-CUTOFF_LENGTH-1)])
        targets = np.array([group_labels[i+CUTOFF_LENGTH] for i in range(group.shape[0]-CUTOFF_LENGTH-1)])

        if len(seqs) == 0: continue
        output = np.append(output, seqs, axis=0)
        output_targets = np.append(output_targets, targets, axis=0)

    print(output.shape)
    print(len(output_targets[output_targets == 1]))

    np.save(f'../../data/train/fold_{fold}/train_seq_ids', output.astype(int))
    np.save(f'../../data/train/fold_{fold}/train_seq_labels', output_targets.astype(int))


    # NOTE Validation fold
    val_split = train.iloc[val_index]
    counter = 0
    output = np.empty((0, CUTOFF_LENGTH))
    output_targets = np.empty((0,))
    for i, group in val_split.groupby('cc_num'):
        if group.shape[0] < CUTOFF_LENGTH:
            continue    # groups too small can't create sequences
        
        group_labels = group['is_fraud'].to_numpy()
        group = group.to_numpy()
        
        seqs = np.array([group[i:i+CUTOFF_LENGTH, 0] for i in range(group.shape[0]-CUTOFF_LENGTH-1)])
        targets = np.array([group_labels[i+CUTOFF_LENGTH] for i in range(group.shape[0]-CUTOFF_LENGTH-1)])

        if len(seqs) == 0: continue
        output = np.append(output, seqs, axis=0)
        output_targets = np.append(output_targets, targets, axis=0)

    print(output.shape)
    print(len(output_targets[output_targets == 1]))
    
    np.save(f'../../data/train/fold_{fold}/val_seq_ids', output.astype(int))
    np.save(f'../../data/train/fold_{fold}/val_seq_labels', output_targets.astype(int))
    print("--------")



(127741, 100)
615
(127990, 100)
544
--------
(340222, 100)
1609
(127580, 100)
443
--------
(556242, 100)
2546
(127765, 100)
438
--------
(772259, 100)
3545
(127611, 100)
586
--------
(988259, 100)
4794
(127487, 100)
516
--------


---
## Full Trainset

In [10]:
counter = 0
output = np.empty((0, CUTOFF_LENGTH))
output_targets = np.empty((0,))
for i, group in train.groupby('cc_num'):
    if group.shape[0] < CUTOFF_LENGTH:
        continue    # groups too small can't create sequences
    
    group_labels = group['is_fraud'].to_numpy()
    group = group.to_numpy()
    
    seqs = np.array([group[i:i+CUTOFF_LENGTH, 0] for i in range(group.shape[0]-CUTOFF_LENGTH-1)])
    targets = np.array([group_labels[i+CUTOFF_LENGTH] for i in range(group.shape[0]-CUTOFF_LENGTH-1)])
    output = np.append(output, seqs, axis=0)
    output_targets = np.append(output_targets, targets, axis=0)


output.shape

(1204226, 100)

In [11]:
np.save('../../data/train/all_seq_ids', output.astype(int))
np.save('../../data/train/all_seq_labels', output_targets.astype(int))

In [7]:
train_labels = train['is_fraud']
train_labels.to_numpy()

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
train.reset_index(inplace=True, drop=True)
train.drop(['trans_date_trans_time', 'is_fraud', 'unix_time', 'Unnamed: 0'],axis=1,inplace=True)
train.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,146,585,12,-0.408741,1,29.0,93.0,-0.282429,49.0,25.0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,51,105,2,0.233378,1,62.0,50.0,-0.293527,57.0,79.0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,860,366,7,0.942184,-1,69.0,95.0,-0.280243,63.0,73.0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,7.0,72.0,52,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,58.0,93.0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


In [9]:
np.save('../../data/train/all_transactions', train.to_numpy())
np.save('../../data/train/all_labels', train_labels.to_numpy())

--- 
## Test set

need to also save transactions for test production

In [12]:
test_labels = test['is_fraud']
test_labels.to_numpy()

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
test.reset_index(inplace=True, drop=True)
test.drop(['trans_date_trans_time', 'is_fraud', 'unix_time', 'Unnamed: 0'],axis=1,inplace=True)
test.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,288,442,9,-0.42199,-1,63.0,72.0,0.812151,69.0,93.0,52,1.426876,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
1,157,392,9,-0.252575,1,62.0,100.0,-0.293019,57.0,82.0,30,-0.10296,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
2,95,468,10,-0.18074,1,62.0,67.0,-0.179602,57.0,62.0,49,-0.297127,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
3,460,504,11,-0.062878,-1,37.0,72.0,-0.112365,58.0,67.0,32,0.987213,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575
4,165,649,14,-0.419918,-1,47.0,42.0,-0.290286,41.0,32.0,65,-0.406467,0.859281,0.428411,-1.032251,0.671264,0.197699,-1.392575


In [14]:
np.save('../../data/test/all_transactions', test.to_numpy())
np.save('../../data/test/all_labels', test_labels.to_numpy())

## Save all transactions

In [15]:
labels = df['is_fraud']
df.drop(['trans_date_trans_time', 'is_fraud', 'unix_time', 'Unnamed: 0'],axis=1,inplace=True)
df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,146,585,12,-0.408741,1,29.0,93.0,-0.282429,49.0,25.0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,51,105,2,0.233378,1,62.0,50.0,-0.293527,57.0,79.0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,860,366,7,0.942184,-1,69.0,95.0,-0.280243,63.0,73.0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,7.0,72.0,52,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,58.0,93.0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


In [16]:
np.save('../../data/all_transactions', df.to_numpy())
np.save('../../data/all_labels', labels.to_numpy())