# Transaction Sequencing & Batch creation

### Import statements and data loading

In [1]:
import sys
sys.path.append('../')

import pandas as pd
from itertools import dropwhile

import numpy as np

df = pd.read_csv('../../datasets/modified/modified_sparkov_dataframe.csv', index_col='Unnamed: 0')
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,unix_time,...,merch_long,is_fraud,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,2019-01-01 00:00:18,146,585,12,-0.408741,1,29.0,93.0,-0.282429,1325376018,...,25.0,0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,2019-01-01 00:00:44,51,105,2,0.233378,1,62.0,50.0,-0.293527,1325376044,...,79.0,0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,2019-01-01 00:00:51,860,366,7,0.942184,-1,69.0,95.0,-0.280243,1325376051,...,73.0,0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,2019-01-01 00:01:16,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,1325376076,...,72.0,0,51,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,2019-01-01 00:03:06,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,1325376186,...,93.0,0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


In [2]:
df.index

Int64Index([      0,       1,       2,       3,       4,       5,       6,
                  7,       8,       9,
            ...
            1852384, 1852385, 1852386, 1852387, 1852388, 1852389, 1852390,
            1852391, 1852392, 1852393],
           dtype='int64', length=1852394)

It is important things to note that the dataset is **by card and chronological order**

---

### Creating sequences

A sequence consists of ALL transactions belonging to a single entity, or credit card. 

To create sequences, it can be done by simply executing ```pandas.groupby()``` to group transactions by card, which in each group, it is sorted chronologically

In [3]:
CUTOFF_LENGTH = 100
def get_sequences(group: pd.DataFrame):
    # Get the transactions for the group

    # Order the transactions by 'unix_time'
    transaction_ids = group.index.values.tolist()

    # Generate sequences using a sliding window
    sequences = []
    for i in range(len(transaction_ids)-CUTOFF_LENGTH-1):
        sequence = [transaction_ids[j] for j in range(i, i+CUTOFF_LENGTH)]
        sequences.append(sequence)

    return sequences

In [4]:
groups = df.groupby('cc_num')
sequences = groups.apply(get_sequences)


In [5]:
sequences = [seq for subseqs in sequences for seq in subseqs]
len(sequences)


1759782

#### Convert dataset and sequence list to numpy arrays

In [6]:
df.drop(['trans_date_trans_time', 'unix_time'],axis=1,inplace=True)
df

Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,is_fraud,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,146,585,12,-0.408741,1,29.0,93.0,-0.282429,49.0,25.0,0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,51,105,2,0.233378,1,62.0,50.0,-0.293527,57.0,79.0,0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,860,366,7,0.942184,-1,69.0,95.0,-0.280243,63.0,73.0,0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,696,39,1,-0.157381,-1,37.0,66.0,-0.287590,7.0,72.0,0,51,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,195,521,11,-0.176470,-1,62.0,79.0,-0.293693,58.0,93.0,0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,371,476,10,-0.165105,-1,62.0,22.0,-0.292300,57.0,35.0,0,54,-0.259567,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664
1852390,55,207,5,0.262326,-1,37.0,51.0,-0.198697,53.0,66.0,0,21,-0.415022,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664
1852391,87,205,5,0.105595,1,37.0,46.0,-0.281802,7.0,82.0,0,39,-0.618601,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664
1852392,210,677,14,-0.389777,-1,42.0,80.0,-0.293593,51.0,83.0,0,55,-0.363512,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664


In [None]:
transaction_numpy = df.to_numpy()

np.savetxt('../../datasets/modified/modified_sparkov_numpy.csv', transaction_numpy, delimiter=',')
np.savetxt('../../datasets/modified/sparkov_sequences.csv', sequences, delimiter=',')