# Dataset Generation

### 1. Import statements and data loading

In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np

raw = pd.read_csv('../../../data/modified/transactions/modified_sparkov.csv', index_col='Unnamed: 0')

df = raw.copy(deep=True)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,unix_time,...,merch_long,is_fraud,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,2019-01-01 00:00:18,146,585,12,-0.408741,1,29.0,93.0,-0.282429,1325376018,...,25.0,0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,2019-01-01 00:00:44,51,105,2,0.233378,1,62.0,50.0,-0.293527,1325376044,...,79.0,0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,2019-01-01 00:00:51,860,366,7,0.942184,-1,69.0,95.0,-0.280243,1325376051,...,73.0,0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,2019-01-01 00:01:16,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,1325376076,...,72.0,0,52,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,2019-01-01 00:03:06,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,1325376186,...,93.0,0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


It is important things to note that the dataset is sorted **by card and chronological order**

---

### 2. Creating sequences

A sequence consists of ALL transactions belonging to a single entity, or credit card. 

To create sequences, it can be done by simply executing ```pandas.groupby()``` to group transactions by card, which in each group, it is sorted chronologically

In [2]:
CUTOFF_LENGTH = 100
labels = df['is_fraud'].to_numpy()

groups = df.groupby('cc_num')

Function definitions

In [3]:
def match_label(sequence):
    return labels[sequence[-1]]


def create_fraud_sequences(group: pd.DataFrame):
    group_ids = group.index.values.tolist()

    # Generate sequences using a sliding window
    sequences = []
    for i in range(len(group_ids)-CUTOFF_LENGTH-1):
        sequence = [group_ids[j] for j in range(i, i+CUTOFF_LENGTH)]

        if labels[sequence[-1]] == 1:
            sequences.append(sequence)
    return sequences
    
def create_non_fraud_sequences(group: pd.DataFrame):
    group_ids = group.index.values.tolist()

    # Generate sequences using a sliding window
    sequences = []
    for i in range(len(group_ids)-CUTOFF_LENGTH-1):
        sequence = [group_ids[j] for j in range(i, i+CUTOFF_LENGTH)]
        if labels[sequence[-1]] == 0:
            sequences.append(sequence)

    return sequences

def create_all_sequences(group: pd.DataFrame):
    # Order the transactions by 'unix_time'
    group_ids = group.index.values.tolist()

    # Generate sequences using a sliding window
    sequences = []
    for i in range(len(group_ids)-CUTOFF_LENGTH-1):
        sequence = [group_ids[j] for j in range(i, i+CUTOFF_LENGTH)]
        sequences.append(sequence)
    return sequences

#### 2.1. Fraudlent sequences

In [4]:
fraud = groups.apply(create_fraud_sequences)
fraud = [seq for subseqs in fraud for seq in subseqs]
np.save('../../../data/modified/sequences/fraud_sequences', fraud)
del fraud

#### 2.2. Non-fraudlent sequences

In [5]:
non_fraud = groups.apply(create_non_fraud_sequences)
non_fraud = [seq for subseqs in non_fraud for seq in subseqs]
np.save('../../../data/modified/sequences/non_fraud_sequences', non_fraud)
del non_fraud

#### 2.3. All sequences in a single file

In [6]:
all_sequences = groups.apply(create_all_sequences)
all_sequences = [seq for subseqs in all_sequences for seq in subseqs]
np.save('../../../data/modified/sequences/all_sequences', all_sequences)


match labels with all sequences:

In [7]:


sequence_labels = np.apply_along_axis(match_label, 1, all_sequences)
np.save('../../../data/modified/sequences/all_sequences_labels', sequence_labels)
del all_sequences
len(sequence_labels[sequence_labels == 1])

7885

---

### 3. Split transactions 

In [8]:
fraud_transactions = df[df['is_fraud'] == 1]
non_fraud_transactions = df[df['is_fraud'] == 0]

In [9]:
df.drop(['trans_date_trans_time', 'unix_time', 'is_fraud'],axis=1,inplace=True)
fraud_transactions.drop(['trans_date_trans_time', 'unix_time', 'is_fraud'],axis=1,inplace=True)
non_fraud_transactions.drop(['trans_date_trans_time', 'unix_time', 'is_fraud'],axis=1,inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fraud_transactions.drop(['trans_date_trans_time', 'unix_time', 'is_fraud'],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_fraud_transactions.drop(['trans_date_trans_time', 'unix_time', 'is_fraud'],axis=1,inplace=True)


Unnamed: 0,cc_num,merchant,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,age,trans_timedelta,trans_month_sin,trans_month_cos,trans_week_sin,trans_week_cos,trans_hour_sin,trans_hour_cos
0,146,585,12,-0.408741,1,29.0,93.0,-0.282429,49.0,25.0,30,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
1,51,105,2,0.233378,1,62.0,50.0,-0.293527,57.0,79.0,40,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
2,860,366,7,0.942184,-1,69.0,95.0,-0.280243,63.0,73.0,56,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
3,696,39,1,-0.157381,-1,37.0,66.0,-0.28759,7.0,72.0,52,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818
4,195,521,11,-0.17647,-1,62.0,79.0,-0.293693,58.0,93.0,32,-0.634966,-2.500818,1.529069,1.250178,0.671264,0.197699,1.433818


In [10]:
np.save('../../../data/modified/transactions/all_transactions', df.to_numpy())
np.save('../../../data/modified/transactions/fraud_transactions', fraud_transactions)
np.save('../../../data/modified/transactions/non_fraud_transactions', non_fraud_transactions)
np.save('../../../data/modified/transactions/all_transaction_labels', labels)
