In [1]:
import re
import os
import time
import pickle
import sklearn
#import cx_Oracle
import numpy as np
import pandas as pd
import seaborn as sns

import scipy.sparse
from tqdm import tqdm, tqdm_notebook
from sklearn import datasets
from collections import defaultdict
from matplotlib import pyplot as plt

from scipy.sparse import csr_matrix, coo_matrix, csc_matrix


# oracle
#oracle_user = "ro_user"
#oracle_pass = "ro_user"
#oracle_scheme = "aml_evraz"
#oracle_host = "192.168.101.13/rnd"
#oracle_connection = cx_Oracle.connect(oracle_user, oracle_pass, oracle_host, encoding = "UTF-8", nencoding = "UTF-8")
#oracle_cursor = oracle_connection.cursor()

### PUT YOUR PATH HERE (my default is home/username/notebooks)
path_to_data = '../../shared_files/'

%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(12,8)})

In [2]:
class Timer:
    """Context manager that measures the time spent inside a ``with`` block.

    Parameters
    ----------
    msg : str
        Label used in the printed report.
    verbose : bool
        When true, ``'<msg> took <seconds>s'`` is printed on exit.

    After the block finishes, ``start``, ``end`` and ``interval`` (all in
    seconds) are available on the instance.
    """

    def __init__(self, msg='operation', verbose=True):
        self.msg = msg
        self.verbose = verbose

    def __enter__(self):
        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start
        if self.verbose:
            print('{} took {:.3f}s'.format(self.msg, self.interval), flush=True)


def fill_zeros_with_last(arr):
    """Forward-fill zeros: replace every 0 with the closest preceding value.

    Each position is mapped to the index of the latest non-zero element at or
    before it (index 0 when there is none), so leading zeros are replaced by
    ``arr[0]`` and therefore stay zero.

    Parameters
    ----------
    arr : numpy.ndarray
        One-dimensional array to fill.

    Returns
    -------
    numpy.ndarray
        New array of the same length and dtype as ``arr``.
    """
    # Keep the own index where the value is non-zero, 0 elsewhere ...
    own_index_or_zero = np.arange(len(arr)) * (arr != 0)
    # ... then a running maximum carries the last "real" index forward.
    source_index = np.maximum.accumulate(own_index_or_zero)
    return arr[source_index]

In [3]:
#%reset_selective -f graph_trans_df

# Alternative loader kept for reference (reads the CSV export instead):
#with Timer('read trans graph frame'):
    #graph_trans_df = pd.read_csv('graph_trans_df.csv')

# `pickle` is already imported in the first cell, so it is not re-imported here.
# SECURITY: pickle.load can execute arbitrary code stored in the file; only
# load 'graph_trans_df.pkl' if it was produced by a trusted source.
with open('graph_trans_df.pkl', 'rb') as handle:
    graph_trans_df = pickle.load(handle)

In [4]:
graph_trans_df.head(3)

P_CLIENTROLE,all,all,1,2,5,all,all,all,all
columns,ID,P_BASEAMOUNT,P_CLIENTID,P_CLIENTID,P_CLIENTID,P_EKNPCODE,P_OPERATIONDATETIME,seconds_from_start,target
0,0,151075.01,0.0,1.0,,423,2017-01-01,0.0,0
1,1,136206.83,2.0,1.0,,423,2017-01-01,0.0,0
2,2,59613.06,3.0,1.0,,421,2017-01-01,0.0,0


### Transactions are sparse in time, so we can aggregate them by checkpoints: one checkpoint per distinct second that has at least one transaction

In [5]:
graph_trans_df.columns

MultiIndex(levels=[[1, 2, 5, 'all'], ['ID', 'P_BANK', 'P_BANKCITY', 'P_BANKCOUNTRYCODE', 'P_BANKNAME', 'P_BANK_CLIENT', 'P_BASEAMOUNT', 'P_BRANCH', 'P_BS_OPER_TYPE', 'P_CLIENTID', 'P_CLIENT_TYPE', 'P_COUNTRYCODE', 'P_CURRENCYCODE', 'P_DOCCATEGORY', 'P_EKNPCODE', 'P_ISSUEDBID', 'P_KFM_OPER_REASON', 'P_OPERATIONDATETIME', 'P_ORGFORM', 'P_SDP', 'P_TOEXTRACTBOOL', 'P_WAS_SEND', 'acc_persistence', 'seconds_from_last_client_op', 'seconds_from_start', 'target']],
           labels=[[3, 3, 0, 1, 2, 3, 3, 3, 3], [0, 6, 9, 9, 9, 14, 17, 24, 25]],
           names=['P_CLIENTROLE', 'columns'])

In [6]:
# Give every distinct transaction second a dense checkpoint id (0, 1, 2, ...)
# in ascending order, and keep the inverse mapping (checkpoint id -> second).
sorted_seconds_uniqs = np.sort(
    graph_trans_df['all', 'seconds_from_start'].unique(),
    kind='mergesort',
).astype(int)

# The factory returns the dict's current size, so the first lookup of a new
# key stores the next free id under it.
seconds_passed_counter = defaultdict(lambda: len(seconds_passed_counter))
for el in sorted_seconds_uniqs:
    seconds_passed_counter[int(el)]

inv_seconds_passed_counter = dict(
    (checkpoint_id, second)
    for second, checkpoint_id in seconds_passed_counter.items()
)

n_checkpoints = len(seconds_passed_counter)
last_second = int(graph_trans_df['all', 'seconds_from_start'].max())
print('There are {} checkpoints for {} seconds'.format(n_checkpoints, last_second))

There are 1188974 checkpoints for 7776000 seconds


In [7]:
# Dense lookup table second -> checkpoint id, covering 0..max(second).
max_second = int(graph_trans_df['all', 'seconds_from_start'].max())
sec_to_id_map_array = np.zeros(max_second + 1)
# Seconds that have transactions receive their checkpoint id ...
sec_to_id_map_array[sorted_seconds_uniqs] = np.arange(len(seconds_passed_counter))
# ... and the remaining seconds inherit the id of the latest earlier checkpoint.
sec_to_id_map_array = fill_zeros_with_last(sec_to_id_map_array).astype(int)

In [8]:
# Lookup table second -> position of the first transaction recorded at that
# second. The table is indexed by *second*, so it has to span 0..max(second).
# It used to be sized by the number of rows, which only works while the frame
# has more rows than the largest second offset (otherwise the assignment by
# second raises IndexError).
seconds_int = graph_trans_df['all', 'seconds_from_start'].values.astype(int)
n_seconds = int(seconds_int.max()) + 1 if seconds_int.size else 0
sec_to_op_index = np.zeros(n_seconds, dtype=int)
# return_index yields, for each distinct second, the position of its first
# occurrence in the frame.
sec_uniqs, sec_op_ids = np.unique(seconds_int, return_index=True)

In [9]:
# Store, for every second that has transactions, the position of its first row.
sec_to_op_index[sec_uniqs] = sec_op_ids
# Seconds without transactions still hold 0; give them the value of the most
# recent earlier second (entries before the first non-zero one stay 0).
sec_to_op_index = fill_zeros_with_last(sec_to_op_index).astype(int)

### Find number of clients

In [10]:
# Client ids of the three roles (1, 2, 5) side by side: one row per
# transaction, one column per role.
client_ids = np.stack(
    [graph_trans_df[role, 'P_CLIENTID'].values for role in (1, 2, 5)],
    axis=1,
)
# Missing ids are NaN; drop them before taking the maximum. Ids are used as
# 0-based column indices further on, hence the "+ 1".
flat_client_ids = client_ids.reshape(-1)
present_client_ids = flat_client_ids[np.isfinite(flat_client_ids)]
n_clients = int(present_client_ids.max()) + 1
print('number of clients = {}'.format(n_clients))

number of clients = 1016828


In [11]:
# Two-level column key (role, column name) of the EKNP code column that the
# following cells clean, encode and enumerate.
role_to_slice, column_to_slice = 'all', 'P_EKNPCODE'

In [12]:
# Normalise the EKNP code column to integers: stringify the value, replace the
# Cyrillic letter 'З' (a look-alike of the digit three) with '3', then parse
# through float to int.
graph_trans_df.loc[:, (role_to_slice, column_to_slice)] = (
    graph_trans_df[role_to_slice, column_to_slice]
    .map(lambda raw_code: int(float(str(raw_code).replace('З', '3'))))
)

In [13]:
# Encode every distinct EKNP code as a positive integer id (1, 2, ...) in
# order of first appearance. enumerate() numbers the codes directly; the
# previous version allocated an arange as long as the whole frame for this.
# NOTE(review): ids start at 1, presumably so that 0 can stand for "no entry"
# in the sparse matrices built from these ids - confirm.
eknp_encoder = {
    code: code_id
    for code_id, code in enumerate(
        graph_trans_df[role_to_slice, column_to_slice].unique(), start=1)
}
epnp_decoder = {code_id: code for code, code_id in eknp_encoder.items()}
# 'epnp_decoder' is a misspelling kept for existing references; the alias
# below offers the intended name.
eknp_decoder = epnp_decoder

In [14]:
# Distinct EKNP codes in ascending order (np.unique already returns its
# result sorted, so no separate sort is required).
eknp_uniqs = np.unique(graph_trans_df[role_to_slice, column_to_slice])
eknp_uniqs

array([-1000,     0,     2,     3,     4,     8,     9,    10,    11,
          12,    13,    14,    15,    16,    17,    19,    20,    21,
          22,    26,    27,    30,    31,    32,    33,    34,    35,
          36,    37,    38,    40,    42,    44,    46,    48,    51,
          53,    54,    56,    57,    58,    59,    67,    68,    71,
          72,    73,    74,    75,    77,    81,    84,    87,    90,
          91,    92,    95,    96,    99,   111,   112,   119,   120,
         122,   131,   132,   140,   150,   160,   171,   172,   180,
         181,   182,   190,   211,   213,   219,   221,   223,   230,
         290,   311,   312,   314,   315,   316,   317,   318,   319,
         321,   322,   324,   325,   326,   329,   331,   332,   333,
         334,   341,   342,   343,   344,   345,   346,   350,   361,
         362,   390,   411,   413,   419,   420,   421,   423,   424,
         429,   430,   490,   510,   520,   522,   529,   539,   541,
         545,   549,

In [15]:
def _build_codes_matrix(role):
    """Build the [n_transactions, n_clients] matrix of encoded EKNP codes for one role.

    Only transactions that have a ``(role, 'P_CLIENTID')`` value are used. The
    entry at ``[ID, client id]`` holds the encoded EKNP code of that
    transaction.

    Parameters
    ----------
    role : int
        First-level column key of the client role (1 or 2 in this notebook).

    Returns
    -------
    (pandas.DataFrame, scipy.sparse.coo_matrix)
        The filtered transactions and the matrix built from them.
    """
    role_trans = graph_trans_df[~graph_trans_df[role, 'P_CLIENTID'].isnull()]
    encoded_codes = role_trans['all', 'P_EKNPCODE'].map(eknp_encoder.__getitem__)
    row_ids = role_trans['all', 'ID']
    col_ids = role_trans[role, 'P_CLIENTID'].values.astype(int)
    # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
    codes_matrix = coo_matrix((encoded_codes, (row_ids, col_ids)),
                              shape=(len(graph_trans_df), n_clients),
                              dtype=int)
    return role_trans, codes_matrix


# Role 2 client ids feed the "income" matrix, role 1 ids the "outcome" matrix.
income_trans, income_codes_matrix = _build_codes_matrix(2)
income_codes_csr, income_codes_csc = income_codes_matrix.tocsr(), income_codes_matrix.tocsc()

outcome_trans, outcome_codes_matrix = _build_codes_matrix(1)
outcome_codes_csr, outcome_codes_csc = outcome_codes_matrix.tocsr(), outcome_codes_matrix.tocsc()

In [16]:
def _count_window_codes(codes_csc, second_clientid_opid, backward):
    """Count encoded EKNP codes of one client inside a look-back window.

    The window covers matrix rows ``[i_from, op_id)``, where ``i_from`` is
    ``sec_to_op_index[max(second - backward, 0)]``; the row of the current
    operation itself is excluded.

    Parameters
    ----------
    codes_csc : scipy sparse matrix (CSC)
        Matrix holding encoded EKNP codes; sliced by row range and client column.
    second_clientid_opid : sequence of three ints
        ``(second, client_id, op_id)`` of the operation being described.
    backward : int
        Window length in seconds.

    Returns
    -------
    numpy.ndarray
        Integer vector of length ``len(eknp_uniqs) + 1``: element 0 is
        ``op_id`` and element ``k`` is how many window entries carry the
        encoded code ``k``.
    """
    second, client_id, op_id = second_clientid_opid
    window_start_second = max(second - backward, 0)
    first_row = sec_to_op_index[window_start_second]

    counts = np.zeros(len(eknp_uniqs) + 1, dtype=int)
    counts[0] = op_id
    if first_row > op_id:
        return counts

    window = codes_csc[first_row:op_id, client_id]
    if window.nnz == 0:
        return counts

    # Encoded codes index the vector directly, so all counts are written at once.
    codes, occurrences = np.unique(window.data, return_counts=True)
    counts[codes] = occurrences
    return counts


def get_incoming_window_links_by_eknp_code(second_clientid_opid, backward=86400*7):
    """Per-code counts from ``income_codes_csc`` for one operation.

    ``second_clientid_opid`` is ``(second, client_id, op_id)``; ``backward`` is
    the window length in seconds (default: seven days). Returns the vector
    described in ``_count_window_codes``.
    """
    return _count_window_codes(income_codes_csc, second_clientid_opid, backward)


def get_outcoming_window_links_by_eknp_code(second_clientid_opid, backward=86400*7):
    """Per-code counts from ``outcome_codes_csc`` for one operation.

    ``second_clientid_opid`` is ``(second, client_id, op_id)``; ``backward`` is
    the window length in seconds (default: seven days). Returns the vector
    described in ``_count_window_codes``.
    """
    return _count_window_codes(outcome_codes_csc, second_clientid_opid, backward)

In [17]:
from multiprocessing import Pool
#raw_id_source_role
#raw_id_target_role
# Number of worker processes passed to the Pool(...) calls in the next cells.
n_proc = 32

In [18]:
# Transactions that have a role-2 client id.
if_income = ~graph_trans_df[2, 'P_CLIENTID'].isnull()
tqdm_max_ = if_income.sum()

seconds_input = graph_trans_df['all', 'seconds_from_start'][if_income].values.astype(int)
indices_input = graph_trans_df[2, 'P_CLIENTID'][if_income].values.astype(int)
op_indices_input = graph_trans_df['all', 'ID'][if_income].values.astype(int)

# One task row per transaction: (second, client id, operation id).
income_tasks = np.c_[seconds_input, indices_input, op_indices_input]
with Pool(processes=n_proc) as p:
    unordered_rows = p.imap_unordered(get_incoming_window_links_by_eknp_code, income_tasks)
    ress_input_target = np.array(list(tqdm_notebook(unordered_rows, total=tqdm_max_)), dtype=int)

# Workers finish in arbitrary order; column 0 carries the operation id, so a
# stable sort on it restores operation order.
row_order = ress_input_target[:, 0].argsort(kind='mergesort')
ress_input_target = ress_input_target[row_order]




In [19]:
# Same role-2 task rows as in the previous cell, now evaluated against the
# outcome matrix.
income_tasks = np.c_[seconds_input, indices_input, op_indices_input]
with Pool(processes=n_proc) as p:
    unordered_rows = p.imap_unordered(get_outcoming_window_links_by_eknp_code, income_tasks)
    ress_output_target = np.array(list(tqdm_notebook(unordered_rows, total=tqdm_max_)), dtype=int)

# Restore operation order (operation id is stored in column 0).
row_order = ress_output_target[:, 0].argsort(kind='mergesort')
ress_output_target = ress_output_target[row_order]




In [20]:
# Transactions that have a role-1 client id.
if_outcome = ~graph_trans_df[1, 'P_CLIENTID'].isnull()
tqdm_max_ = if_outcome.sum()

seconds_output = graph_trans_df['all', 'seconds_from_start'][if_outcome].values.astype(int)
indices_output = graph_trans_df[1, 'P_CLIENTID'][if_outcome].values.astype(int)
op_indices_output = graph_trans_df['all', 'ID'][if_outcome].values.astype(int)

# One task row per transaction: (second, client id, operation id).
outcome_tasks = np.c_[seconds_output, indices_output, op_indices_output]
with Pool(processes=n_proc) as p:
    unordered_rows = p.imap_unordered(get_incoming_window_links_by_eknp_code, outcome_tasks)
    ress_input_source = np.array(list(tqdm_notebook(unordered_rows, total=tqdm_max_)), dtype=int)

# Restore operation order (operation id is stored in column 0).
row_order = ress_input_source[:, 0].argsort(kind='mergesort')
ress_input_source = ress_input_source[row_order]




In [21]:
# Same role-1 task rows as in the previous cell, now evaluated against the
# outcome matrix.
outcome_tasks = np.c_[seconds_output, indices_output, op_indices_output]
with Pool(processes=n_proc) as p:
    unordered_rows = p.imap_unordered(get_outcoming_window_links_by_eknp_code, outcome_tasks)
    ress_output_source = np.array(list(tqdm_notebook(unordered_rows, total=tqdm_max_)), dtype=int)

# Restore operation order (operation id is stored in column 0).
row_order = ress_output_source[:, 0].argsort(kind='mergesort')
ress_output_source = ress_output_source[row_order]




In [53]:
# Bundle the four result arrays under their own names so that they can be
# written out together.
ress_to_pickle = dict(
    ress_input_target=ress_input_target,
    ress_output_target=ress_output_target,
    ress_input_source=ress_input_source,
    ress_output_source=ress_output_source,
)

In [None]:
# Write the bundled result arrays to 'ress_dict.pkl' using the newest pickle
# protocol supported by the running interpreter.
with open('ress_dict.pkl', 'wb') as handler:
    pickle.dump(ress_to_pickle, handler, protocol=pickle.HIGHEST_PROTOCOL)

1


In [27]:
print(1)

1


In [106]:
# Scratch check: rows 0..86399 of client column 1. Note that rows of this
# matrix are addressed by operation id when it is sliced elsewhere, so 86400
# here is a row count, not a number of seconds.
kek = income_codes_csc[0:86400, 1]

In [108]:
np.unique(kek.data, return_counts=True)

(array([ 1,  2, 19]), array([26, 13,  3]))

In [112]:
# Scratch check: np.unique on an empty array gives empty values/counts, so
# this loop body never runs and nothing is printed.
for i, j in zip(*np.unique(np.array([]), return_counts=True)):
    print(i, j)

In [95]:
income_codes_csc[:, 1]

<8216585x1 sparse matrix of type '<class 'numpy.float128'>'
	with 1319825 stored elements in Compressed Sparse Column format>

In [11]:
%reset_selective -f income_trans
%reset_selective -f income_amount_matrix
%reset_selective -f income_amount_csr
%reset_selective -f income_amount_csc

%reset_selective -f outcome_trans
%reset_selective -f outcome_amount_matrix
%reset_selective -f outcome_amount_csr
%reset_selective -f outcome_amount_csc

income_trans = graph_trans_df[~graph_trans_df.id_receiver.isnull()]

income_amount_matrix = coo_matrix((income_trans.P_BASEAMOUNT,
                            (sec_to_id_map_array[income_trans.seconds_from_start.values.astype(int)],
                             income_trans.id_receiver.values.astype(int))),
                           shape=(len(seconds_passed_counter), n_clients),
                           dtype=np.float128)

income_amount_csr, income_amount_csc = income_amount_matrix.tocsr(), income_amount_matrix.tocsc()

outcome_trans = graph_trans_df[~graph_trans_df.id_sender.isnull()]

outcome_amount_matrix = coo_matrix((outcome_trans.P_BASEAMOUNT,
                            (sec_to_id_map_array[outcome_trans.seconds_from_start.values.astype(int)],
                             outcome_trans.id_sender.values.astype(int))),
                            shape=(len(seconds_passed_counter), n_clients),
                            dtype=np.float128)

outcome_amount_csr, outcome_amount_csc = outcome_amount_matrix.tocsr(), outcome_amount_matrix.tocsc()

### Build Income and outcome matrices of shape [n_trans, n_clients], from which we can find time-aggregated graph properties

In [12]:
%reset_selective -f income_trans
%reset_selective -f income_indices_matrix
%reset_selective -f income_indices_csr
%reset_selective -f income_indices_csc

%reset_selective -f outcome_trans
%reset_selective -f outcome_indices_matrix
%reset_selective -f outcome_indices_csr
%reset_selective -f outcome_indices_csc

income_trans = graph_trans_df[~graph_trans_df.id_receiver.isnull()]

income_indices_matrix = coo_matrix((income_trans.id_sender.values.astype(int) + 1,
                                    (income_trans.ID, income_trans.id_receiver.values.astype(int))),
                                   shape=(len(graph_trans_df), n_clients),
                                   dtype=np.float128)

income_indices_csr, income_indices_csc = income_indices_matrix.tocsr(), income_indices_matrix.tocsc()

outcome_trans = graph_trans_df[~graph_trans_df.id_sender.isnull()]

outcome_indices_matrix = coo_matrix((outcome_trans.id_receiver.values.astype(int) + 1,
                                    (outcome_trans.ID, outcome_trans.id_sender.values.astype(int))),
                                   shape=(len(graph_trans_df), n_clients),
                                   dtype=np.float128)

outcome_indices_csr, outcome_indices_csc = outcome_indices_matrix.tocsr(), outcome_indices_matrix.tocsc()


In [13]:
def get_incoming_window_links(second_clientid_opid, backward=86400*7):
    """Count distinct values stored for one client in a look-back window of ``income_indices_csc``.

    Parameters
    ----------
    second_clientid_opid : sequence of three ints
        ``(second, client_id, op_id)`` of the operation being described.
    backward : int
        Window length in seconds (default: seven days).

    Returns
    -------
    tuple
        ``(op_id, n_unique)``, where ``n_unique`` is the number of distinct
        stored values in rows ``[i_from, op_id)`` of the client's column and
        ``i_from = sec_to_op_index[max(second - backward, 0)]``. It is 0 when
        ``i_from`` lies beyond ``op_id``.
    """
    second, client_id, op_id = second_clientid_opid
    window_start_second = max(second - backward, 0)
    first_row = sec_to_op_index[window_start_second]
    if first_row <= op_id:
        # The row of the current operation itself is excluded from the window.
        window = income_indices_csc[first_row:op_id, client_id]
        n_unique = len(np.unique(window.data))
    else:
        n_unique = 0
    return op_id, n_unique

def get_outcoming_window_links(second_clientid_opid, backward=86400*7):
    """Count distinct values stored for one client in a look-back window of ``outcome_indices_csc``.

    Parameters
    ----------
    second_clientid_opid : sequence of three ints
        ``(second, client_id, op_id)`` of the operation being described.
    backward : int
        Window length in seconds (default: seven days).

    Returns
    -------
    tuple
        ``(op_id, n_unique)``, where ``n_unique`` is the number of distinct
        stored values in rows ``[i_from, op_id)`` of the client's column and
        ``i_from = sec_to_op_index[max(second - backward, 0)]``. It is 0 when
        ``i_from`` lies beyond ``op_id``.
    """
    second, client_id, op_id = second_clientid_opid
    window_start_second = max(second - backward, 0)
    first_row = sec_to_op_index[window_start_second]
    if first_row <= op_id:
        # The row of the current operation itself is excluded from the window.
        window = outcome_indices_csc[first_row:op_id, client_id]
        n_unique = len(np.unique(window.data))
    else:
        n_unique = 0
    return op_id, n_unique

In [14]:
from multiprocessing import Pool
#raw_id_source_role
#raw_id_target_role

In [18]:
# NOTE(review): this cell uses flat column names (id_receiver, ID, ...) and
# reuses the names if_income / ress_input_target that an earlier cell assigns
# for the per-EKNP-code counts - confirm which variant is meant to survive.
# Transactions that have a receiver.
if_income  = ~graph_trans_df.id_receiver.isnull()
tqdm_max_ = if_income.sum()

# Task columns: second, receiver id and operation id of each such transaction.
seconds_input = graph_trans_df.seconds_from_start[if_income].values.astype(int)
indices_input = graph_trans_df.id_receiver[if_income].values.astype(int)
op_indices_input = graph_trans_df[if_income].ID.values.astype(int)

# Each worker call returns (op_id, count); results arrive in arbitrary order.
with Pool(processes=24) as p:
    ress_input_target = np.array(list(tqdm_notebook(p.imap_unordered(get_incoming_window_links, np.c_[seconds_input, indices_input, op_indices_input]), total=tqdm_max_)), dtype=int)
# Stable sort on column 0 (op_id) restores operation order.
ress_input_target = ress_input_target[ress_input_target[:, 0].argsort(kind='mergesort')]




In [19]:
%reset_selective -f ress_output_target

# Same receiver-side task rows as in the previous cell, evaluated with
# get_outcoming_window_links; each call returns (op_id, count).
with Pool(processes=24) as p:
    ress_output_target = np.array(list(tqdm_notebook(p.imap_unordered(get_outcoming_window_links, np.c_[seconds_input, indices_input, op_indices_input]), total=tqdm_max_)), dtype=int)
# Stable sort on column 0 (op_id) restores operation order.
ress_output_target = ress_output_target[ress_output_target[:, 0].argsort(kind='mergesort')]




In [20]:
%reset_selective -f if_outcome
%reset_selective -f seconds_output
%reset_selective -f indices_output
%reset_selective -f op_indices_output
# NOTE(review): the line below resets 'ress_input_sender', but the result
# assigned in this cell is named 'ress_input_source' - confirm the intended name.
%reset_selective -f ress_input_sender

# Transactions that have a sender.
if_outcome  = ~graph_trans_df.id_sender.isnull()
tqdm_max_ = if_outcome.sum()

# Task columns: second, sender id and operation id of each such transaction.
seconds_output = graph_trans_df.seconds_from_start[if_outcome].values.astype(int)
indices_output = graph_trans_df.id_sender[if_outcome].values.astype(int)
op_indices_output = graph_trans_df[if_outcome].ID.values.astype(int)

# Each worker call returns (op_id, count); results arrive in arbitrary order.
with Pool(processes=24) as p:
    ress_input_source = np.array(list(tqdm_notebook(p.imap_unordered(get_incoming_window_links, np.c_[seconds_output, indices_output, op_indices_output]), total=tqdm_max_)), dtype=int)
# Stable sort on column 0 (op_id) restores operation order.
ress_input_source = ress_input_source[ress_input_source[:, 0].argsort(kind='mergesort')]

100%|██████████| 7126911/7126911 [16:51<00:00, 7046.74it/s]


In [21]:
# Same sender-side task rows as in the previous cell, evaluated with
# get_outcoming_window_links; each call returns (op_id, count).
with Pool(processes=24) as p:
    ress_output_source = np.array(list(tqdm_notebook(p.imap_unordered(get_outcoming_window_links, np.c_[seconds_output, indices_output, op_indices_output]), total=tqdm_max_)), dtype=int)
# Stable sort on column 0 (op_id) restores operation order.
ress_output_source = ress_output_source[ress_output_source[:, 0].argsort(kind='mergesort')]

100%|██████████| 7126911/7126911 [09:56<00:00, 11947.02it/s]


In [22]:
# One row per transaction: its ID plus four link-count columns that start at
# zero and are filled in by a later cell.
link_count_columns = (
    'links_input_receiver',
    'links_output_receiver',
    'links_input_sender',
    'links_output_sender',
)
links_week_columns = {'ID': graph_trans_df.ID}
for column_name in link_count_columns:
    links_week_columns[column_name] = np.zeros(len(graph_trans_df))
links_week_info = pd.DataFrame.from_dict(links_week_columns)

In [25]:
# Column 0 of every result array is the operation id and column 1 the count.
# NOTE(review): .loc selects rows by index label, so this relies on the index
# labels of links_week_info coinciding with operation ids - TODO confirm.
links_week_info.loc[ress_input_target[:, 0], 'links_input_receiver'] = ress_input_target[:, 1]
links_week_info.loc[ress_output_target[:, 0], 'links_output_receiver'] = ress_output_target[:, 1]
links_week_info.loc[ress_input_source[:, 0], 'links_input_sender'] = ress_input_source[:, 1]
links_week_info.loc[ress_output_source[:, 0], 'links_output_sender'] = ress_output_source[:, 1]

In [30]:
#links_week_info.to_csv('links_week_info.csv', index=False)

In [31]:
graph_trans_df.columns

Index(['ID', 'P_BASEAMOUNT', 'id_sender', 'id_receiver', 'id_benef',
       'P_EKNPCODE', 'P_OPERATIONDATETIME', 'seconds_from_start', 'target'],
      dtype='object')

In [36]:
(links_week_info.links_output_sender == 0).mean()

0.29086840822555843

In [40]:
kek = coo_matrix((np.array([]), (np.array([]), np.array([]))), shape=(100, 100))

In [41]:
kek

<100x100 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in COOrdinate format>

In [42]:
# Preview only the first rows instead of rendering the whole frame.
graph_trans_df.head(10)

Unnamed: 0,ID,P_BASEAMOUNT,id_sender,id_receiver,id_benef,P_EKNPCODE,P_OPERATIONDATETIME,seconds_from_start,target
0,0,151075.01,0.0,1.0,,423,2017-01-01 00:00:00,0.0,0
1,1,136206.83,2.0,1.0,,423,2017-01-01 00:00:00,0.0,0
2,2,59613.06,3.0,1.0,,421,2017-01-01 00:00:00,0.0,0
3,3,29838.62,4.0,1.0,,423,2017-01-01 00:00:00,0.0,0
4,4,32000.00,,5.0,,-1000,2017-01-01 02:10:00,7800.0,-1
5,5,100000.00,,6.0,,-1000,2017-01-01 10:58:00,39480.0,-1
6,6,100000.00,,7.0,,-1000,2017-01-01 11:05:00,39900.0,-1
7,7,200000.00,,8.0,,-1000,2017-01-01 11:56:00,42960.0,-1
8,8,60000.00,,8.0,,-1000,2017-01-01 11:58:00,43080.0,-1
9,9,10000.00,,8.0,,-1000,2017-01-01 12:02:00,43320.0,-1


In [None]:
# Column names for a follow-up step that is not part of the visible notebook;
# presumably amounts (target_feature) are to be broken down by EKNP code
# (slice_feature) - TODO confirm.
slice_feature = 'P_EKNPCODE'
target_feature = 'P_BASEAMOUNT'