# Transaction Sequencing & Batch creation

### Import statements and data loading

In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np

df = pd.read_csv('../../datasets/modified/modified_sparkov.csv', index_col='Unnamed: 0')
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,unix_time,...,merch_long,is_fraud,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,2019-01-01 00:00:18,146,585,12,-0.408741,1,29.0,93.0,-0.282429,1325376018,...,25.0,0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,2019-01-01 00:00:44,51,105,2,0.233378,1,62.0,50.0,-0.293527,1325376044,...,79.0,0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,2019-01-01 00:00:51,860,366,7,0.942184,-1,69.0,95.0,-0.280243,1325376051,...,73.0,0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,2019-01-01 00:01:16,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,1325376076,...,72.0,0,51,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,2019-01-01 00:03:06,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,1325376186,...,93.0,0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


It is important things to note that the dataset is **by card and chronological order**

---

### Creating sequences

A sequence consists of ALL transactions belonging to a single entity, or credit card. 

To create sequences, it can be done by simply executing ```pandas.groupby()``` to group transactions by card, which in each group, it is sorted chronologically

In [2]:
CUTOFF_LENGTH = 100
def get_sequences(group: pd.DataFrame):
    # Get the transactions for the group

    # Order the transactions by 'unix_time'
    group_ids = group.index.values.tolist()

    # Generate sequences using a sliding window
    sequences = []
    for i in range(len(group_ids)-CUTOFF_LENGTH-1):
        sequence = [group_ids[j] for j in range(i, i+CUTOFF_LENGTH)]
        sequences.append(sequence)
    
    return sequences

groups = df.groupby('cc_num')
labels = df['is_fraud'].to_numpy()

sequences = groups.apply(get_sequences)
sequences = [seq for subseqs in sequences for seq in subseqs]
sequences



[[1195,
  1533,
  1805,
  2641,
  3029,
  4749,
  5685,
  7950,
  8093,
  8579,
  9069,
  9312,
  10182,
  10242,
  10247,
  10777,
  10911,
  11379,
  11548,
  11975,
  12322,
  13357,
  13639,
  13861,
  13966,
  14369,
  15113,
  15131,
  15436,
  15540,
  15820,
  16389,
  17861,
  18135,
  18593,
  18749,
  20087,
  20230,
  21417,
  21473,
  21508,
  21536,
  22644,
  22683,
  23362,
  25084,
  25251,
  25604,
  25845,
  26330,
  27565,
  28807,
  31971,
  32354,
  32707,
  33305,
  33320,
  33432,
  34557,
  35062,
  35366,
  35476,
  37031,
  37192,
  37366,
  37945,
  38092,
  38506,
  38772,
  38857,
  39046,
  39310,
  39603,
  40634,
  40751,
  43276,
  43466,
  43609,
  44864,
  44898,
  45019,
  45149,
  45775,
  46377,
  46427,
  46539,
  47023,
  47090,
  47211,
  47224,
  47669,
  48225,
  48873,
  49609,
  49761,
  50272,
  52287,
  52394,
  52852,
  53173],
 [1533,
  1805,
  2641,
  3029,
  4749,
  5685,
  7950,
  8093,
  8579,
  9069,
  9312,
  10182,
  10242,
  102

#### Matching sequences with labels

In [3]:

transaction_labels = df['is_fraud'].to_numpy()
def match_label(sequence):
    return transaction_labels[sequence[-1]]

labels = np.apply_along_axis(match_label, 1, sequences)
labels



array([0, 0, 0, ..., 0, 0, 0])

In [4]:
df['is_fraud'].value_counts()

0    1842743
1       9651
Name: is_fraud, dtype: int64

#### Convert dataset and sequence list to numpy arrays

In [5]:
df.drop(['trans_date_trans_time', 'unix_time', 'is_fraud'],axis=1,inplace=True)
df

Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,146,585,12,-0.408741,1,29.0,93.0,-0.282429,49.0,25.0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,51,105,2,0.233378,1,62.0,50.0,-0.293527,57.0,79.0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,860,366,7,0.942184,-1,69.0,95.0,-0.280243,63.0,73.0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,696,39,1,-0.157381,-1,37.0,66.0,-0.287590,7.0,72.0,51,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,195,521,11,-0.176470,-1,62.0,79.0,-0.293693,58.0,93.0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,371,476,10,-0.165105,-1,62.0,22.0,-0.292300,57.0,35.0,54,-0.259567,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664
1852390,55,207,5,0.262326,-1,37.0,51.0,-0.198697,53.0,66.0,21,-0.415022,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664
1852391,87,205,5,0.105595,1,37.0,46.0,-0.281802,7.0,82.0,39,-0.618601,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664
1852392,210,677,14,-0.389777,-1,42.0,80.0,-0.293593,51.0,83.0,55,-0.363512,-0.783235,-1.410651,0.742289,-1.477333,-0.175760,1.385664


In [6]:
df.dtypes

cc_num               int64
merchant             int64
category             int64
amt                float64
gender               int64
lat                float64
long               float64
city_pop           float64
merch_lat          float64
merch_long         float64
age                  int64
trans_timedelta    float64
trans_month_sin    float64
trans_month_cos    float64
trans_week_sin     float64
trans_week_cos     float64
trans_hour_sin     float64
trans_hour_cos     float64
dtype: object

In [7]:
transaction_numpy = df.to_numpy()

np.save('../../datasets/modified/modified_sparkov', transaction_numpy)
np.save('../../datasets/modified/sparkov_labels', labels)
np.save('../../datasets/modified/sparkov_sequences', sequences)