# Small transaction dataset

In [1]:
import pandas as pd

# Relative paths of the four CSV inputs read by the cells below.
(
    account_filename,
    customer_filename,
    transaction_filename,
    labels_filename,
) = (
    'data/accountData.csv',
    'data/customerData.csv',
    'data/transactionData.csv',
    'data/labelledData_15.csv',
)

In [2]:
# Load the account table.
# NOTE(review): `accounts` is not referenced again in the visible notebook.
accounts = pd.read_csv(filepath_or_buffer=account_filename)

In [3]:
# Load the customer table; it is left-joined onto the sampled transactions later.
customers = pd.read_csv(filepath_or_buffer=customer_filename)

In [4]:
# Load the full transaction table (the frame that gets labelled and sampled).
transactions = pd.read_csv(filepath_or_buffer=transaction_filename)

In [5]:
# Load the label table; its 'Class' column becomes transactions['Label'].
labels = pd.read_csv(filepath_or_buffer=labels_filename)

In [6]:
# Number of rows in the label table.
n = labels.shape[0]

In [7]:
# One tenth of the label row count, rounded down.
# Floor division keeps the arithmetic in integers instead of going through
# a float and truncating it.
fraction = n // 10

In [8]:
# Show the label row count and the one-tenth size, one value per line.
print(n, fraction, sep='\n')

15772235
1577223


In [9]:
import random

In [10]:
# Attach the class label to every transaction.
# Rows are matched by index label, so this relies on both CSVs listing the
# transactions in the same order — TODO confirm.
# NOTE(review): nothing here checks that the two frames have the same length.
transactions['Label'] = labels['Class']

In [11]:
# Draw `fraction` distinct row positions out of 0..n-1 (unseeded).
# NOTE(review): `samples` is not referenced again in the visible notebook;
# the actual down-sampling is done by DataFrame.sample in the next cell.
samples = random.sample(population=range(n), k=fraction)

In [12]:
# Take a 10% random subset of the labelled transactions; the fixed
# random_state makes the draw repeatable.
transactions_small = transactions.sample(
    random_state=256,
    frac=0.1,
)

In [13]:
# Report how many rows ended up in the sample.
print(transactions_small.shape[0])

1577224


In [14]:
# Total of the Label column over the sample (presumably the number of
# positive rows if labels are 0/1 — TODO confirm).
# The vectorised Series.sum avoids iterating the column element by element in
# Python; skipna=False keeps the builtin sum()'s behaviour of propagating NaN
# instead of silently ignoring missing labels.
transactions_small['Label'].sum(skipna=False)

605

In [15]:
# Persist the sampled transactions before any cleaning is applied.
transactions_small.to_csv(path_or_buf='data/transactions_small.csv')

# Cleaned small transaction dataset

In [16]:
# Remove the transaction ID and both beneficiary ID columns from the sample.
del (
    transactions_small['TRANSACTION_ID'],
    transactions_small['BENEFICIARY_CUSTOMER_ID'],
    transactions_small['BENEFICIARY_ACCOUNT_ID'],
)

In [17]:
# Replace missing origin customer/account IDs with fixed placeholder values.
# NOTE(review): 2.5e+05 and 5.317910e+05 are presumably sentinel IDs chosen by
# the author — TODO confirm they cannot collide with a real customer/account.
#
# The filled column is assigned back explicitly. Calling
# fillna(inplace=True) on `frame[column]` acts on an intermediate object:
# pandas deprecates that pattern, and with Copy-on-Write enabled it leaves the
# frame unchanged.
transactions_small['ORIGIN_CUSTOMER_ID'] = (
    transactions_small['ORIGIN_CUSTOMER_ID'].fillna(2.5e+05)
)
transactions_small['ORIGIN_ACCOUNT_ID'] = (
    transactions_small['ORIGIN_ACCOUNT_ID'].fillna(5.317910e+05)
)

In [18]:
# Replace the source and type values with their integer category codes.
transactions_small['TRANSACTION_SOURCE'] = (
    transactions_small['TRANSACTION_SOURCE']
    .astype('category')
    .cat.codes
)
transactions_small['TRANSACTION_TYPE'] = (
    transactions_small['TRANSACTION_TYPE']
    .astype('category')
    .cat.codes
)

In [19]:
# Bring in the customer attributes for each transaction's originating
# customer; the left join keeps transactions with no matching customer row.
# NOTE(review): assumes CUSTOMER_ID is unique in `customers`; duplicates would
# multiply transaction rows — TODO confirm.
transactions_small = transactions_small.merge(
    customers,
    how='left',
    left_on='ORIGIN_CUSTOMER_ID',
    right_on='CUSTOMER_ID',
)

In [20]:
# Impute missing risk scores (e.g. transactions with no matching customer row
# after the left merge) with the mean of the observed scores.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection, which pandas deprecates and which has no effect on the
# frame under Copy-on-Write.
transactions_small['CUSTOMER_RISK_SCORE'] = (
    transactions_small['CUSTOMER_RISK_SCORE'].fillna(
        transactions_small['CUSTOMER_RISK_SCORE'].mean()
    )
)

In [21]:
# Give missing customer type / country an explicit 'unknown' value so they
# receive their own category when the columns are encoded.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection, which pandas deprecates and which has no effect on the
# frame under Copy-on-Write.
transactions_small['CUSTOMER_TYPE'] = (
    transactions_small['CUSTOMER_TYPE'].fillna('unknown')
)
transactions_small['COUNTRY'] = (
    transactions_small['COUNTRY'].fillna('unknown')
)

In [22]:
# Replace the customer type and country values with integer category codes.
transactions_small['CUSTOMER_TYPE'] = (
    transactions_small['CUSTOMER_TYPE']
    .astype('category')
    .cat.codes
)
transactions_small['COUNTRY'] = (
    transactions_small['COUNTRY']
    .astype('category')
    .cat.codes
)

In [23]:
# Parse the timestamp column into datetimes using an explicit format.
transactions_small['TRANSACTION_DATE_TIME'] = pd.to_datetime(
    transactions_small['TRANSACTION_DATE_TIME'],
    format='%Y-%m-%d %H:%M:%S',
)

In [24]:
# Derive day-of-month, hour and minute features as float64 columns.
# The .dt accessor fields give the same numbers as formatting each timestamp
# with strftime and parsing the text back, without the string round-trip.
transactions_small['Day'] = (
    transactions_small['TRANSACTION_DATE_TIME'].dt.day.astype('float64')
)
transactions_small['Hour'] = (
    transactions_small['TRANSACTION_DATE_TIME'].dt.hour.astype('float64')
)
transactions_small['Minute'] = (
    transactions_small['TRANSACTION_DATE_TIME'].dt.minute.astype('float64')
)

In [25]:
# The raw timestamp is dropped once Day/Hour/Minute have been derived from it.
del transactions_small['TRANSACTION_DATE_TIME']

In [26]:
# Drop the join key that the customer merge brought in.
del transactions_small['CUSTOMER_ID']

In [27]:
# Number of columns remaining after cleaning.
transactions_small.shape[1]

12

In [28]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Feature columns to rescale with MinMaxScaler's default settings.
# 'Label' is deliberately absent, so the target stays untouched.
columns_to_scale = [
    'TRANSACTION_AMOUNT',
    'TRANSACTION_TYPE',
    'TRANSACTION_SOURCE',
    'ORIGIN_CUSTOMER_ID',
    'ORIGIN_ACCOUNT_ID',
    'CUSTOMER_RISK_SCORE',
    'CUSTOMER_TYPE',
    'COUNTRY',
    'Day',
    'Hour',
    'Minute',
]

# The scaler is re-fitted on each column in turn, in the order listed, so the
# `scaler` object ends up fitted on the last column only.
# NOTE(review): the scaler is fitted on the whole sample; if a train/test
# split happens downstream, confirm that this is intended.
for column in columns_to_scale:
    transactions_small[[column]] = scaler.fit_transform(transactions_small[[column]])

In [29]:
# Persist the cleaned, encoded and scaled sample.
transactions_small.to_csv(path_or_buf='data/transactions_small_clean.csv')