In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%pylab inline

# Relative paths of the input CSV files read by the cells below.
account_filename = 'data/accountData.csv'
customer_filename = 'data/customerData.csv'
transaction_filename = 'data/transactionData.csv'
sanctions_filename = 'data/ctry_sanctions_list.csv'

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the account, customer and sanctions-list tables with pandas' default
# parsing options (no explicit dtypes or index column).
accounts = pd.read_csv(account_filename)
customers = pd.read_csv(customer_filename)
sanctions = pd.read_csv(sanctions_filename)

In [3]:
# Load the raw transactions; every later step in this notebook rebinds or
# mutates this `transactions` frame.
transactions = pd.read_csv(transaction_filename)

# Merge all three files

In [4]:
# Load the labelled data; its 'Class' column is used as the transaction label.
labels = pd.read_csv('data/labelledData_15.csv')

In [5]:
#del transactions['Unnamed: 0']
# Attach the class label to each transaction. pandas aligns the assigned
# Series on the row index, not on TRANSACTION_ID.
# NOTE(review): this assumes both files list the transactions in the same
# order with the same default index — confirm.
transactions['Label'] = labels['Class']

In [6]:
# Show the column names now that 'Label' has been added.
transactions.columns

Index(['TRANSACTION_ID', 'ORIGIN_CUSTOMER_ID', 'ORIGIN_ACCOUNT_ID',
       'TRANSACTION_DATE_TIME', 'TRANSACTION_TYPE', 'TRANSACTION_SOURCE',
       'TRANSACTION_AMOUNT', 'BENEFICIARY_CUSTOMER_ID',
       'BENEFICIARY_ACCOUNT_ID', 'Label'],
      dtype='object')

In [7]:
# transactions['TRANSACTION_TYPE'].hist()

In [8]:
# TODO: if the origin customer ID or account ID is NaN, fall back to the beneficiary IDs (not implemented yet).

In [9]:
# Remove both beneficiary identifier columns in a single step.
transactions = transactions.drop(
    columns=['BENEFICIARY_CUSTOMER_ID', 'BENEFICIARY_ACCOUNT_ID']
)

In [10]:
# Discard every row that still has a missing value in any column.
transactions = transactions.dropna()

In [11]:
# Attach the customer attributes to each transaction. This is an inner join,
# so transactions without a matching customer row are dropped.
transactions = pd.merge(
    transactions,
    customers,
    how='inner',
    left_on='ORIGIN_CUSTOMER_ID',
    right_on='CUSTOMER_ID',
)

In [None]:
# Preview the first row of the accounts table.
accounts.head(1)

In [12]:
# Attach the account attributes to each transaction (inner join on the
# originating account). Column names present on both sides receive pandas'
# default '_x' / '_y' suffixes.
transactions = pd.merge(
    transactions,
    accounts,
    how='inner',
    left_on='ORIGIN_ACCOUNT_ID',
    right_on='ACCOUNT_ID',
)

In [None]:
# Show the column names after both merges.
transactions.columns

In [13]:
# Join keys and timestamp columns that are removed before feature encoding.
columns_to_drop = [
    'ORIGIN_CUSTOMER_ID',
    'ORIGIN_ACCOUNT_ID',
    'TRANSACTION_DATE_TIME',
    'CUSTOMER_ID_x',
    'ACCOUNT_ID',
    'CUSTOMER_ID_y',
    'ACCOUNT_DATE_TIME',
]
transactions = transactions.drop(columns=columns_to_drop)

In [None]:
# Show the column names that remain after the drop.
transactions.columns

In [14]:
# Columns that are cast to pandas' 'category' dtype.
categorical_features = [
    'TRANSACTION_TYPE',
    'TRANSACTION_SOURCE',
    'CUSTOMER_TYPE',
    'COUNTRY',
    'ACCOUNT_TYPE',
]

In [15]:
# Cast every categorical column in a single astype call via a
# column -> dtype mapping.
transactions = transactions.astype(
    {column: 'category' for column in categorical_features}
)

In [16]:
# Numeric columns that receive a log transform before scaling.
skewed_numerical_features = [
    'TRANSACTION_AMOUNT',
    'ACCOUNT_BALANCE',
]

In [17]:
# Compress the skewed columns with log(1 + x). np.log1p is the numerically
# accurate form of np.log(x + 1) for values close to zero, and as a ufunc it
# is applied to the whole sub-frame at once instead of through a per-column
# lambda.
# NOTE(review): values <= -1 (e.g. a negative balance) still produce NaN/-inf,
# exactly as before — confirm the inputs are non-negative.
transactions[skewed_numerical_features] = np.log1p(transactions[skewed_numerical_features])

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns rescaled with MinMaxScaler's default settings.
numerical_features = ['TRANSACTION_AMOUNT',
                      'CUSTOMER_RISK_SCORE',
                      'ACCOUNT_BALANCE']

# Fit the scaler on the whole frame and write the scaled values back in place
# of the originals.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(transactions[numerical_features])
transactions[numerical_features] = scaled_values

In [19]:
# Plain Python list of the values in the 'Sanctions, Countries' column.
sanctions_list = sanctions['Sanctions, Countries'].tolist()

In [20]:
# Flag transactions whose customer country appears in the sanctions list
# (1 = listed, 0 = not listed). Series.isin performs the membership test
# vectorized, replacing a Python-level lambda that scanned the list once per
# element.
transactions['Sanction'] = transactions['COUNTRY'].isin(sanctions_list).astype(int)

In [21]:
# COUNTRY is removed once the 'Sanction' flag has been derived from it.
transactions = transactions.drop(columns='COUNTRY')

In [22]:
# Show the columns that go into one-hot encoding.
transactions.columns

Index(['TRANSACTION_ID', 'TRANSACTION_TYPE', 'TRANSACTION_SOURCE',
       'TRANSACTION_AMOUNT', 'Label', 'CUSTOMER_TYPE', 'CUSTOMER_RISK_SCORE',
       'ACCOUNT_TYPE', 'ACCOUNT_BALANCE', 'Sanction'],
      dtype='object')

In [23]:
# One-hot encode the frame. With no `columns` argument, get_dummies expands
# only object/category columns and passes the numeric ones through unchanged.
transactions = pd.get_dummies(transactions)

In [24]:
# Number of columns after one-hot encoding.
transactions.shape[1]

23

In [None]:
# Keep the label column as a standalone Series.
# NOTE(review): this rebinds `labels`, which earlier held the DataFrame read
# from 'data/labelledData_15.csv'; re-running the cell that reads
# labels['Class'] after this one would fail.
labels = transactions['Label']

In [25]:
# Persist the fully encoded feature table. With to_csv's defaults the row
# index is written as an extra, unnamed first column.
transactions.to_csv('data/transactions_all_features.csv')

# Data Exploration

In [None]:
# Show the encoded column names before exploring the data.
transactions.columns

In [None]:
# Draw a 10% random sample for the correlation analysis. The seed is fixed so
# that the sample, and the feature selection derived from it, is reproducible
# across kernel restarts.
small_set = transactions.sample(frac=0.1, random_state=42)

In [None]:
# Pairwise correlation matrix (DataFrame.corr's default method) over the
# columns of the sample, including 'Label' itself.
corrs = small_set.corr()

In [None]:
#pd.plotting.scatter_matrix(small_set, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
# Rank the columns by their correlation with 'Label'. sort_values returns a
# new frame rather than sorting in place, so the result must be bound to a
# name to have any effect.
corrs_by_label = corrs.sort_values('Label', ascending=False)
# Keep the columns that correlate positively with 'Label', strongest first.
# 'Label' itself is selected too whenever its self-correlation is defined.
reduced_features = list(corrs_by_label[corrs_by_label['Label'] > 0].index)
print(reduced_features)

In [None]:
# Materialise the column names once, report them, then pick the column at
# position 8 for plotting.
all_columns = list(transactions.columns)
print(len(all_columns))
print(all_columns)
feature = all_columns[8]
print(feature)

In [None]:
# transactions.groupby('Label')[feature].hist();
# Histogram of the selected feature, one panel per 'Label' value; the
# trailing ';' suppresses the axes repr in the cell output.
transactions.hist(feature, by='Label');

In [None]:
# Restrict the frame to the selected features. This rebinds `transactions`,
# so the full feature set is no longer available without re-running the
# earlier cells.
transactions = transactions[reduced_features]

In [None]:
# Preview the first row of the reduced frame.
transactions.head(1)

In [None]:
# Persist the reduced feature table (row index included, per to_csv defaults).
# NOTE(review): the '1p' suffix reads like "1 percent", but the sample taken
# above uses frac=0.1 and this cell writes every row — confirm the file name.
transactions.to_csv('data/reduced_transactions_1p.csv')

# Gaussian Mixture

In [None]:
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualizations code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [None]:
from sklearn.mixture import GaussianMixture

# 'Label' is the supervised target and is still a column of `transactions`
# here, so it is excluded from the model input to avoid leaking the label into
# the unsupervised fit. errors='ignore' keeps the cell working if the column
# has already been removed.
gmm_features = transactions.drop(columns=['Label'], errors='ignore')

# random_state pins the initialisation so that repeated runs give the same fit.
# NOTE(review): with n_components=1, predict() assigns every row to
# component 0 — confirm that a single component is intended.
clusterer = GaussianMixture(n_components=1, random_state=42).fit(gmm_features)

# Component index assigned to each transaction.
preds = clusterer.predict(gmm_features)

# Mean vector of each mixture component, shape (n_components, n_features).
centers = clusterer.means_