# Small transaction dataset

In [1]:
import pandas as pd

# Locations of the input CSV files, given relative to the working directory.
account_filename = 'data/accountData.csv'
customer_filename = 'data/customerData.csv'
transaction_filename = 'data/transactionData.csv'
# Read further down for its 'Class' column, which is attached to the
# transactions as the 'Label' column.
labels_filename = 'data/labelledData_15.csv'

In [None]:
# Read the account table from disk.
# NOTE(review): none of the cells visible in this notebook use `accounts`.
accounts = pd.read_csv(filepath_or_buffer=account_filename)

In [13]:
# Read the customer table; it is joined onto the sampled transactions later
# via its CUSTOMER_ID column.
customers = pd.read_csv(filepath_or_buffer=customer_filename)

In [2]:
# Read the full transaction table from disk.
transactions = pd.read_csv(filepath_or_buffer=transaction_filename)

In [3]:
# Read the label table; only its 'Class' column is used below.
labels = pd.read_csv(filepath_or_buffer=labels_filename)

In [4]:
# Attach the 'Class' column of `labels` to the transactions as 'Label'.
# pandas aligns this assignment on the row index, not on a key column.
# NOTE(review): this assumes both CSVs list the same rows in the same order;
# any index present in `transactions` but not in `labels` gets NaN — confirm.
transactions['Label'] = labels['Class']

In [5]:
# Draw a reproducible 1% random subset of the labelled transactions.
sample_fraction = 0.01
sample_seed = 256
transactions_small = transactions.sample(frac=sample_fraction, random_state=sample_seed)

In [6]:
# Report how many rows ended up in the sample.
print(transactions_small.shape[0])

157722


In [7]:
# Total of the 'Label' column in the sample, shown as the cell output.
label_values = transactions_small['Label']
sum(label_values)

63

In [8]:
# Persist the raw (uncleaned) sample. With pandas' default settings the row
# index is written too, as the first, unnamed column of the CSV.
transactions_small.to_csv(path_or_buf='data/transactions_small_1p.csv')

# Cleaned small transaction dataset

In [9]:
# Remove the transaction identifier and both beneficiary ID columns.
columns_to_drop = [
    'TRANSACTION_ID',
    'BENEFICIARY_CUSTOMER_ID',
    'BENEFICIARY_ACCOUNT_ID',
]
transactions_small = transactions_small.drop(columns=columns_to_drop)

In [10]:
# Replace missing originator IDs with fixed placeholder values.
# NOTE(review): 2.5e+05 and 5.317910e+05 are hard-coded magic numbers; confirm
# they cannot collide with real customer/account IDs.
#
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on a column selection. That form is chained
# assignment through an inplace method: it raises a FutureWarning on pandas 2.x
# and silently does nothing once Copy-on-Write is active (pandas 3), which
# would leave the NaNs in place.
transactions_small['ORIGIN_CUSTOMER_ID'] = transactions_small['ORIGIN_CUSTOMER_ID'].fillna(2.5e+05)
transactions_small['ORIGIN_ACCOUNT_ID'] = transactions_small['ORIGIN_ACCOUNT_ID'].fillna(5.317910e+05)

In [11]:
# Replace each categorical transaction column by its integer category code.
# pandas assigns the code -1 to missing values.
for column in ('TRANSACTION_SOURCE', 'TRANSACTION_TYPE'):
    as_category = transactions_small[column].astype('category')
    transactions_small[column] = as_category.cat.codes

In [14]:
# Left-join the customer attributes onto each transaction through the
# originating customer's ID; transactions without a match keep NaN in the
# customer columns.
# NOTE(review): if CUSTOMER_ID is not unique in `customers`, this join
# duplicates transaction rows — confirm uniqueness.
transactions_small = transactions_small.merge(
    right=customers,
    how='left',
    left_on='ORIGIN_CUSTOMER_ID',
    right_on='CUSTOMER_ID',
)

In [15]:
# Impute missing customer risk scores with the mean of the observed scores.
# The result is assigned back rather than using `inplace=True` on a column
# selection: that chained-assignment form warns on pandas 2.x and is a silent
# no-op under Copy-on-Write (pandas 3), which would leave the NaNs in place.
mean_risk_score = transactions_small['CUSTOMER_RISK_SCORE'].mean()
transactions_small['CUSTOMER_RISK_SCORE'] = transactions_small['CUSTOMER_RISK_SCORE'].fillna(mean_risk_score)

In [16]:
# Give rows without customer information an explicit 'unknown' category, so
# the category encoding that follows sees a real value instead of NaN.
# The result is assigned back rather than using `inplace=True` on a column
# selection: that chained-assignment form warns on pandas 2.x and is a silent
# no-op under Copy-on-Write (pandas 3).
transactions_small['CUSTOMER_TYPE'] = transactions_small['CUSTOMER_TYPE'].fillna('unknown')
transactions_small['COUNTRY'] = transactions_small['COUNTRY'].fillna('unknown')

In [17]:
# Replace each categorical customer column by its integer category code.
for column in ('CUSTOMER_TYPE', 'COUNTRY'):
    as_category = transactions_small[column].astype('category')
    transactions_small[column] = as_category.cat.codes

In [18]:
# Parse the timestamp strings into datetime values using an explicit format.
timestamp_format = '%Y-%m-%d %H:%M:%S'
transactions_small['TRANSACTION_DATE_TIME'] = pd.to_datetime(
    transactions_small['TRANSACTION_DATE_TIME'],
    format=timestamp_format,
)

In [19]:
# Derive day-of-month, hour and minute features from the parsed timestamp.
# The datetime accessor's numeric fields are read directly instead of
# formatting every timestamp to a string and parsing it back into a float;
# the resulting float64 values are the same.
timestamp_fields = transactions_small['TRANSACTION_DATE_TIME'].dt
transactions_small['Day'] = timestamp_fields.day.astype('float64')
transactions_small['Hour'] = timestamp_fields.hour.astype('float64')
transactions_small['Minute'] = timestamp_fields.minute.astype('float64')

In [20]:
# The raw timestamp is no longer needed once Day/Hour/Minute are derived.
transactions_small = transactions_small.drop(columns=['TRANSACTION_DATE_TIME'])

In [21]:
# Remove the join key that the customer merge brought in.
transactions_small = transactions_small.drop(columns=['CUSTOMER_ID'])

In [22]:
# Number of columns remaining after cleaning, shown as the cell output.
transactions_small.shape[1]

12

In [23]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature column independently; 'Label' is not in the
# list and is therefore left as it is.
columns_to_scale = [
    'TRANSACTION_AMOUNT',
    'TRANSACTION_TYPE',
    'TRANSACTION_SOURCE',
    'ORIGIN_CUSTOMER_ID',
    'ORIGIN_ACCOUNT_ID',
    'CUSTOMER_RISK_SCORE',
    'CUSTOMER_TYPE',
    'COUNTRY',
    'Day',
    'Hour',
    'Minute',
]

scaler = MinMaxScaler()
for column in columns_to_scale:
    # The single scaler is re-fitted for each column, so after the loop it
    # only holds the parameters of the last column ('Minute').
    transactions_small[[column]] = scaler.fit_transform(transactions_small[[column]])

In [24]:
# Persist the cleaned, scaled sample. With pandas' default settings the row
# index is written too, as the first, unnamed column of the CSV.
# NOTE(review): confirm downstream readers expect that extra column.
transactions_small.to_csv(path_or_buf='data/transactions_small_clean_1p.csv')