In [2]:
import re
import os
import time
import pickle
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns

import scipy.sparse
from tqdm import tqdm, tqdm_notebook
from collections import defaultdict, OrderedDict
from matplotlib import pyplot as plt

from scipy.sparse import csr_matrix, coo_matrix, csc_matrix

### Root directory that holds the input CSV files; change it to match your environment
### (the author's default working directory was home/username/notebooks).
path_to_data = '/home/shared_files/'

# Render matplotlib figures inline and use a white seaborn style with 12x8 figures.
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(12,8)})

### Helper functions

In [3]:
class Timer:
    """Context manager that measures, and optionally prints, how long a block took.

    Parameters
    ----------
    msg : str
        Label printed in front of the elapsed time.
    verbose : bool
        When true, ``__exit__`` prints ``'<msg> took <seconds>s'``.

    After the ``with`` block finishes, ``start`` and ``end`` hold the raw
    ``time.perf_counter()`` readings and ``interval`` holds ``end - start``
    in seconds.
    """

    def __init__(self, msg='operation', verbose=True):
        self.msg = msg
        self.verbose = verbose

    def __enter__(self):
        # time.clock() was removed in Python 3.8. perf_counter() is the
        # monotonic, high-resolution replacement and measures wall-clock time.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start
        if self.verbose:
            print('{} took {:.3f}s'.format(self.msg, self.interval), flush=True)


def df_categorical_variables_stat(df, max_cardinality=200):
    """Print a cardinality report for every column of ``df``.

    For each column the number of distinct values is printed. Columns with
    fewer than ``max_cardinality`` distinct values also get their full
    ``value_counts`` (NaN included); the others are flagged as high
    cardinality. Nothing is returned.
    """
    separator = (('-' * 80) + '\n') * 3
    for column in df.columns:
        values = df[column]
        print('next_column: {}'.format(column))
        n_distinct = len(values.unique())
        print("# of uniqs: {}".format(n_distinct))
        if n_distinct >= max_cardinality:
            print('High cardinality/Non categorical')
        else:
            print('Viable Categorical. Value counts:')
            print(values.value_counts(dropna=False))
        print(separator)


def process_client_indices(client_indices, trivial_to_nontrivial=None, default_null=-1000):
    """Convert raw client identifiers to integers.

    Each value is stringified and stripped. Values that start with an
    (optionally negative) run of digits are converted to that integer.
    All other values ("trivial" ids) are assigned synthetic negative ids
    ``-2, -3, ...`` in order of first appearance.

    Parameters
    ----------
    client_indices : pandas.Series
        Raw identifiers; may contain nulls and arbitrary strings.
    trivial_to_nontrivial : dict, optional
        Existing mapping from non-numeric identifier to synthetic id. It is
        extended in place with any new identifiers. A fresh dict is created
        when omitted.
    default_null : int
        Value substituted for null identifiers before conversion.

    Returns
    -------
    (pandas.Series, dict)
        The integer identifiers (same index as the input) and the mapping
        that was used or extended.
    """
    id_pattern = re.compile(r'^(\d+|-\d+)')
    mapping = {} if trivial_to_nontrivial is None else trivial_to_nontrivial

    # Work on an object-typed copy so the null sentinel can be written
    # whatever the input dtype is.
    cleaned = client_indices.astype(object)
    cleaned[cleaned.isnull()] = default_null
    cleaned = cleaned.map(str).map(str.strip)

    is_trivial = cleaned.map(lambda s: id_pattern.match(s) is None).astype(bool)
    if is_trivial.any():
        for raw_id in cleaned[is_trivial]:
            if raw_id not in mapping:
                # Synthetic ids start at -2 and decrease by one per new id.
                mapping[raw_id] = -len(mapping) - 2
        cleaned[is_trivial] = cleaned[is_trivial].map(lambda s: str(mapping[s]))

    numeric_ids = cleaned.map(lambda s: int(id_pattern.match(s).group(1)))
    return numeric_ids, mapping


def flatten_df_by_column(df, to_flatten, flatten_by, group_by, names_flatten_by=None):
    """Pivot rows of ``df`` into one wide row per ``group_by`` value.

    For every distinct value of ``df[flatten_by]`` the matching rows are
    selected and indexed by ``group_by``. Their ``to_flatten`` columns are
    placed under a two-level column index ``(value, column)``. The frames
    are then outer-joined on the index.

    Parameters
    ----------
    df : pandas.DataFrame
    to_flatten : label or list/tuple/pandas.Index of labels
        Column(s) to replicate under every ``flatten_by`` value.
    flatten_by : label
        Column whose distinct values become the first column level.
    group_by : label
        Column that becomes the index of the result.
    names_flatten_by : dict, optional
        Maps a ``flatten_by`` value to the label used for it in the first
        column level. Values that are not in the dict keep their own value
        as the label.

    Returns
    -------
    pandas.DataFrame
        Columns are a MultiIndex with level names ``[flatten_by, 'columns']``.
    """
    # Normalise to a real list so that `[group_by] + ...` also works for
    # tuples and pandas.Index inputs.
    if isinstance(to_flatten, (list, tuple, pd.Index)):
        columns_to_flatten = list(to_flatten)
    else:
        columns_to_flatten = [to_flatten]

    if names_flatten_by is None:
        names_flatten_by = {}

    selected_columns = [group_by] + columns_to_flatten
    frames_to_join = []
    for uniq_val in df[flatten_by].unique():
        # .loc plus set_index returns a new frame, so the caller's data is
        # never modified through a slice.
        frame = df.loc[df[flatten_by] == uniq_val, selected_columns].set_index(group_by)
        top_label = names_flatten_by.get(uniq_val, uniq_val)
        frame.columns = pd.MultiIndex.from_product([[top_label], frame.columns],
                                                   names=[flatten_by, 'columns'])
        frames_to_join.append(frame)

    return frames_to_join[0].join(frames_to_join[1:], how='outer')


def join_ops_with_flatten_members(ops, flatten_ops_with_members, id_colname='ID', ops_columns_level_name=None):
    """Left-join the flattened member table onto the operations table.

    ``ops`` is indexed by ``id_colname`` (the column itself is kept) and its
    columns are lifted into a two-level index under ``ops_columns_level_name``
    so that they line up with the columns of ``flatten_ops_with_members``.
    When no level name is given, the string form of the member table's
    first-level values is used. The result has a fresh 0-based row index.
    """
    top_level = ops_columns_level_name
    if top_level is None:
        top_level = str(list(flatten_ops_with_members.columns.levels[0]))

    indexed_ops = ops.set_index(id_colname, drop=False)
    indexed_ops.columns = pd.MultiIndex.from_product(
        [[top_level], indexed_ops.columns],
        names=flatten_ops_with_members.columns.names,
    )
    return indexed_ops.join(flatten_ops_with_members, how='left').reset_index(drop=True)

### Read data csv and drop some columns and rows from off_members

In [4]:
# Load the four raw tables from `path_to_data`; every read is timed separately.
with Timer('reading susp_ops', True):
    susp_ops = pd.read_csv(os.path.join(path_to_data, 'susp_ops.csv'))
with Timer('reading susp_members', True):
    susp_members = pd.read_csv(os.path.join(path_to_data, 'susp_members.csv'))
with Timer('reading off_ops', True):
    off_ops = pd.read_csv(os.path.join(path_to_data, 'off_ops.csv'))
# The label now names the table that is actually read here.
with Timer('reading off_members', True):
    off_members = pd.read_csv(os.path.join(path_to_data, 'off_members.csv'))

with Timer('drop messy off_members columns'):
    off_members.drop(['P_DATE_INSERT', 'P_DATE_UPDATE', 'CHANGEDATE'], axis=1, inplace=True)

# Keep only members whose operation exists in off_ops ...
with Timer('drop off_members with OPERATIONID that are not in off_ops'):
    off_members = off_members[off_members.P_OPERATIONID.isin(off_ops.ID)].copy()
    off_members.reset_index(drop=True, inplace=True)

# ... and only operations that still have at least one member row.
with Timer('drop off_ops with OPERATIONID that are not in off_members'):
    off_ops = off_ops[off_ops.ID.isin(off_members.P_OPERATIONID)].copy()
    off_ops.reset_index(drop=True, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


reading susp_ops took 4.106s


  interactivity=interactivity, compiler=compiler, result=result)


reading susp_members took 4.383s


  interactivity=interactivity, compiler=compiler, result=result)


reading off_ops took 50.854s


  interactivity=interactivity, compiler=compiler, result=result)


reading off_ops took 46.074s
drop messy off_members columns took 3.027s
drop off_members with OPERATIONID that are not in off_ops took 10.476s
drop off_ops with OPERATIONID that are not in off_ops took 7.792s


### Clean up P_CLIENTID and the columns containing NaNs, drop duplicate rows, convert the operation time to datetime and sort operations by time


In [5]:
# Sentinel values written into missing cells of off_members, per column.
fill_values_for_off_members = {
    'P_BSCLIENTID': -1000,
    'P_REGOPENDATE': '0000-00-00 00:00:00',
    'P_BSACCOUNT': -1000,
    'P_BANK': 'UNKNOWN',
    'P_SDP': -1000,
    'P_ORGFORM': -1000,
    'P_BANKCITY': 'UNKNOWN'
}

with Timer('filling NaNs in off_members'):
    off_members.fillna(value=fill_values_for_off_members, inplace=True)

with Timer('off_members client_ids processing'):
    # Assign with df[col] = ... rather than df.loc[:, col] = ...: since
    # pandas 2.0 the .loc form writes into the existing column and keeps its
    # old dtype, which would leave the integer ids stored as objects.
    off_members['P_CLIENTID'], trivial_ids_to_nontrivial = process_client_indices(off_members.P_CLIENTID)

with Timer('dropping duplicate rows from off_members'):
    # Rows count as duplicates when they agree on every column except these three.
    off_members.drop_duplicates(off_members.columns.drop(['ID', 'P_BSCLIENTID', 'P_BSACCOUNT']), inplace=True)
    off_members.reset_index(drop=True, inplace=True)

with Timer('processing off_ops', True):
    # Same reason as above: the column must really become datetime64 so that
    # later `.dt` accessors work.
    off_ops['P_OPERATIONDATETIME'] = pd.to_datetime(off_ops['P_OPERATIONDATETIME'])
    # Stable sort keeps the original relative order of equal timestamps.
    off_ops.sort_values(by='P_OPERATIONDATETIME', kind='mergesort', inplace=True)
    off_ops.reset_index(drop=True, inplace=True)

filling NaNs in off_members took 2.321s
off_members client_ids processing took 43.630s
dropping duplicate rows from off_members took 27.133s
processing off_ops took 6.542s


### Map operation ID to 0-based one (assuming our ops already sorted)

In [6]:
# Build a mapping from each original operation ID to its 0-based rank of
# first appearance in off_ops, together with the inverse mapping.
with Timer('building counters for unique operation ID'):
    sorted_unique_ids, first_positions = np.unique(off_ops.ID.values, return_index=True)
    # np.unique sorts by value; reorder by first position to get appearance order.
    ids_by_appearance = sorted_unique_ids[np.argsort(first_positions)]
    operationid_counter = dict(zip(ids_by_appearance, range(len(ids_by_appearance))))
    operationid_inv_counter = dict(enumerate(ids_by_appearance))

# Release the temporaries; only the two dictionaries are kept.
del sorted_unique_ids
del first_positions
del ids_by_appearance

with Timer('mapping original ID to 0-based for off_ops'):
    off_ops.loc[:, 'ID'] = off_ops['ID'].map(operationid_counter.get)

with Timer('mapping original P_OPERATIONID to 0-based for off_members'):
    off_members.loc[:, 'P_OPERATIONID'] = off_members['P_OPERATIONID'].map(operationid_counter.get)

building counters for unique operation ID took 5.977s
mapping original ID to 0-based for off_ops took 4.651s
mapping original P_OPERATIONID to 0-based for off_members took 7.811s


### Map P_CLIENTID to 0-based indices (to be able to work with sparse client matrices)


In [7]:
# Stable-sort members by the 0-based operation id so that client ids are
# visited in operation order below.
with Timer('sort off-members by time-sorted operation ID'):
    off_members.sort_values('P_OPERATIONID', kind='mergesort', inplace=True)

clientids_sorted = off_members['P_CLIENTID'].values

with Timer('build fast uniqs'):
    # np.unique returns value-sorted uniques plus the index of each first
    # occurrence; reordering by that index yields first-appearance order.
    # Non-finite entries (NaN/inf) are excluded before taking uniques.
    clientids_uniqs, clientids_indices = np.unique(clientids_sorted[np.isfinite(clientids_sorted)], return_index=True)
    clientids_uniqs = clientids_uniqs[clientids_indices.argsort()]

with Timer('build clientid_counter'):
    # client id -> 0-based index, and the inverse lookup.
    clientid_counter = {u: i for i, u in enumerate(clientids_uniqs)}
    clientid_inv_counter = {v: k for k, v in clientid_counter.items()}

with Timer('mapping original P_CLIENTID to 0-based for off_members'):
    # dict.get returns None for ids missing from the counter.
    off_members.loc[:, 'P_CLIENTID'] = off_members['P_CLIENTID'].map(clientid_counter.get)

sort off-members by time-sorted operation ID took 6.929s
build fast uniqs took 5.244s
build clientid_counter took 0.404s
mapping original P_CLIENTID to 0-based for off_members took 8.340s


### Sort off_members by time, compute seconds_from_start and seconds_from_last_client_op

In [8]:
# Seconds elapsed since the earliest operation in off_ops.
with Timer('calculating seconds from start for off_ops'):
    off_ops['seconds_from_start'] = (
        off_ops.P_OPERATIONDATETIME - off_ops.P_OPERATIONDATETIME.min()
    ).dt.total_seconds()

off_members.reset_index(drop=True, inplace=True)
off_ops.reset_index(drop=True, inplace=True)

# Attach seconds_from_start to every member row. Both tables have an 'ID'
# column, so the merge suffixes them; keep the member's own ID.
with Timer('joining off_members and seconds_from_start from off_ops'):
    off_members = (
        off_members
        .merge(off_ops[['ID', 'seconds_from_start']],
               how='inner', left_on='P_OPERATIONID', right_on='ID')
        .drop(['ID_y'], axis=1)
        .rename(columns={'ID_x': 'ID'})
    )

with Timer('stable sorting off_members by P_CLIENTID inplace'):
    off_members.sort_values('P_CLIENTID', kind='mergesort', inplace=True)

# Per-client difference between consecutive rows; the first row of every
# client has no predecessor and receives the sentinel -100000.
with Timer('retrieve deltas between client current and last operation'):
    off_members['seconds_from_last_client_op'] = (
        off_members.groupby('P_CLIENTID')['seconds_from_start']
        .diff()
        .fillna(-100000)
        .sort_index(kind='mergesort')
    )

with Timer('sort back off_members by P_OPERATIONID'):
    off_members.sort_values('P_OPERATIONID', kind='mergesort', inplace=True)

calculating seconds from start for off_ops took 1.490s
joining off_members and seconds_from_start from off_ops took 35.163s
stable sorting off_members by P_CLIENTID inplace took 22.342s
retrieve deltas between client current and last operation took 107.782s
sort back off_members by P_OPERATIONID took 14.859s


### Compute acc persistence in days

In [9]:
# Look up the operation timestamp of every member row by label.
# NOTE(review): this treats P_OPERATIONID values as labels of off_ops' row
# index - confirm that off_ops is still indexed 0..n-1 in ID order here.
with Timer('retrieving member operationdatetime for each row from off_members'):
    member_operationdatetime = off_ops.P_OPERATIONDATETIME[off_members.P_OPERATIONID]

# Days between the registration/open date and the operation; rows whose
# date cannot be parsed (NaT) receive the sentinel -100000.
with Timer('computing acc_persistence for off_members'):
    regopendate = pd.to_datetime(off_members['P_REGOPENDATE'],
                                 format='%Y-%m-%d %H:%M:%S',
                                 errors='coerce')
    # Both sides are re-indexed 0..n-1, so the subtraction is positional.
    acc_persistence = (
        member_operationdatetime.reset_index(drop=True) - regopendate.reset_index(drop=True)
    ).dt.days.fillna(-100000)
    off_members['acc_persistence'] = acc_persistence.values
    off_members.reset_index(drop=True, inplace=True)

retrieving member operationdatetime for each row from off_members took 103.967s
computing acc_persistence for off_members took 29.033s


### select columns from off_ops and mark cat/numeric ones
### Link information for the past week should also be added for every operation, but that step was broken because of problems with off_members and needs to be recalculated.

In [10]:
# Columns of off_ops that are carried into the modelling table.
# NOTE(review): 'P_BRANCH' is selected here but appears in neither the
# categorical nor the numeric list below - confirm that is intended.
selected_off_ops_columns = [
    'ID',
    'P_OPERATIONDATETIME',
    'P_ISSUEDBID',
    'P_BRANCH',
    'P_CURRENCYCODE',
    'P_EKNPCODE',
    'P_DOCCATEGORY',
    'P_TOEXTRACTBOOL',
    'P_KFM_OPER_REASON',
    'P_BS_OPER_TYPE',
    'P_WAS_SEND',
    'P_BASEAMOUNT',
    'seconds_from_start',
]

# Subset treated as categorical (one-hot encoded further down the notebook).
selected_off_ops_ohe_columns = [
    'P_ISSUEDBID',
    'P_CURRENCYCODE',
    'P_EKNPCODE',
    'P_DOCCATEGORY',
    'P_TOEXTRACTBOOL',
    'P_KFM_OPER_REASON',
    'P_BS_OPER_TYPE',
    'P_WAS_SEND'
]

# Subset treated as numeric.
selected_off_ops_numeric_columns = [
    'P_BASEAMOUNT'
]


with Timer('taking selected columns from off_ops'):
    selected_off_ops = off_ops[selected_off_ops_columns].copy()

# Sentinel written into missing cells of these two code columns.
off_ops_fillna_dict = {
    'P_EKNPCODE': -1000,
    'P_KFM_OPER_REASON': -1000
}

with Timer('filling nans in selected_off_ops'):
    selected_off_ops.fillna(value=off_ops_fillna_dict, inplace=True)

taking selected columns from off_ops took 1.757s
filling nans in selected_off_ops took 0.905s


### Same column selection for off_members + build joined table

In [11]:
# Columns of off_members that are carried into the modelling table.
selected_off_members_columns = [
    'P_CLIENTID',
    'P_OPERATIONID',
    'P_COUNTRYCODE',
    'P_BANK',
    'P_BANKCITY',
    'P_SDP',
    'P_BANK_CLIENT',
    'P_CLIENT_TYPE',
    'P_CLIENTROLE',
    'P_BANKCOUNTRYCODE',
    'P_BANKNAME',
    'P_ORGFORM',
    'seconds_from_last_client_op',
    'acc_persistence'
]

# Subset treated as categorical (one-hot encoded further down the notebook).
selected_off_members_ohe_columns = [
    'P_COUNTRYCODE',
    'P_BANK',
    'P_BANKCITY',
    'P_SDP',
    'P_BANK_CLIENT',
    'P_CLIENT_TYPE',
    'P_BANKCOUNTRYCODE',
    'P_BANKNAME',
    'P_ORGFORM',
]

# Subset treated as numeric.
selected_off_members_numeric_columns = [
    'seconds_from_last_client_op',
    'acc_persistence'
]


with Timer('taking selected columns from off_members'):
    selected_off_members = off_members[selected_off_members_columns].copy()

# One wide row per operation: every member column is repeated under each
# distinct P_CLIENTROLE value (first column level).
with Timer('building flatten selected members'):
    selected_members_flatten = flatten_df_by_column(
        selected_off_members,
        to_flatten=list(selected_off_members.columns.drop(['P_OPERATIONID', 'P_CLIENTROLE'])),
        flatten_by='P_CLIENTROLE',
        group_by='P_OPERATIONID')

# Operation-level columns are placed under the first-level label 'all'.
with Timer('join offline operations with selected member columns'):
    joined_off_ops = join_ops_with_flatten_members(selected_off_ops, selected_members_flatten, ops_columns_level_name='all')

taking selected columns from off_members took 3.272s
building flatten selected members took 15.786s
join offline operations with selected member columns took 6.691s


### Build target variable and set joined_off_ops.target with it

In [12]:
# Keep only suspicious operations that reference an offline operation, and
# normalise a missing operation list to the empty string.
susp_ops = susp_ops[~susp_ops.P_OFFLINEOPERATIONID.isnull()].copy()
susp_ops.loc[susp_ops.P_OPERATION_LIST.isnull(), 'P_OPERATION_LIST'] = ''
error_count = 0  # number of suspicious rows whose operation is not in off_ops

with Timer('setting proper labels for learning'):
    # Label per 0-based operation id:
    #   1  - at least one suspicious row with P_SENDTOKFMBOOL != 2
    #   0  - only rows with P_SENDTOKFMBOOL == 2
    #  -1  - default for operations with no suspicious row at all
    target_mapped = defaultdict(lambda: -1)

    for i, row in tqdm(susp_ops[['P_OFFLINEOPERATIONID', 'P_SENDTOKFMBOOL']].iterrows(), total=len(susp_ops)):
        try:
            off_op_id = operationid_counter[int(row.P_OFFLINEOPERATIONID)]
        except KeyError:
            # The referenced operation is no longer in off_ops. Skip the row
            # instead of falling through with values from a previous
            # iteration (or undefined names on the first iteration).
            error_count += 1
            continue
        to_kfm = row.P_SENDTOKFMBOOL

        if to_kfm != 2:
            # Any such row makes the operation positive, whatever came before.
            target_mapped[off_op_id] = 1
        elif off_op_id not in target_mapped:
            target_mapped[off_op_id] = 0

with Timer('setting susp target for every operation'):
    # defaultdict.__getitem__ yields -1 for operations without a label.
    joined_off_ops['all', 'target'] = joined_off_ops['all', 'ID'].map(target_mapped.__getitem__)

100%|██████████| 401500/401500 [00:16<00:00, 24445.37it/s]

setting proper labels for learning took 16.497s





setting susp target for every operation took 4.238s


In [13]:
# Inspect the two-level (role, column) layout of the joined table.
joined_off_ops.columns

MultiIndex(levels=[[1, 2, 5, 'all'], ['ID', 'P_BANK', 'P_BANKCITY', 'P_BANKCOUNTRYCODE', 'P_BANKNAME', 'P_BANK_CLIENT', 'P_BASEAMOUNT', 'P_BRANCH', 'P_BS_OPER_TYPE', 'P_CLIENTID', 'P_CLIENT_TYPE', 'P_COUNTRYCODE', 'P_CURRENCYCODE', 'P_DOCCATEGORY', 'P_EKNPCODE', 'P_ISSUEDBID', 'P_KFM_OPER_REASON', 'P_OPERATIONDATETIME', 'P_ORGFORM', 'P_SDP', 'P_TOEXTRACTBOOL', 'P_WAS_SEND', 'acc_persistence', 'seconds_from_last_client_op', 'seconds_from_start', 'target']],
           labels=[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3], [0, 17, 15, 7, 12, 14, 13, 20, 16, 8, 21, 6, 24, 9, 11, 1, 2, 19, 5, 10, 3, 4, 18, 23, 22, 9, 11, 1, 2, 19, 5, 10, 3, 4, 18, 23, 22, 9, 11, 1, 2, 19, 5, 10, 3, 4, 18, 23, 22, 25]],
           names=['P_CLIENTROLE', 'columns'])

In [14]:
# Share of missing values in every column of the joined table.
joined_off_ops.isnull().mean(0)

P_CLIENTROLE  columns                    
all           ID                             0.000000
              P_OPERATIONDATETIME            0.000000
              P_ISSUEDBID                    0.000000
              P_BRANCH                       0.000000
              P_CURRENCYCODE                 0.000000
              P_EKNPCODE                     0.000000
              P_DOCCATEGORY                  0.000000
              P_TOEXTRACTBOOL                0.000000
              P_KFM_OPER_REASON              0.000000
              P_BS_OPER_TYPE                 0.000000
              P_WAS_SEND                     0.000000
              P_BASEAMOUNT                   0.000000
              seconds_from_start             0.000000
1             P_CLIENTID                     0.132619
              P_COUNTRYCODE                  0.132619
              P_BANK                         0.132619
              P_BANKCITY                     0.132619
              P_SDP                     

In [12]:
# (role, column) pairs selected for the transaction-graph export, in output
# order. Built with from_tuples because the `labels=` keyword of the
# MultiIndex constructor was renamed to `codes=` and later removed; this
# form works on both old and new pandas.
graph_multi_index = pd.MultiIndex.from_tuples(
    [
        ('all', 'ID'),
        ('all', 'P_BASEAMOUNT'),
        (1, 'P_CLIENTID'),
        (2, 'P_CLIENTID'),
        (5, 'P_CLIENTID'),
        ('all', 'P_EKNPCODE'),
        ('all', 'P_OPERATIONDATETIME'),
        ('all', 'seconds_from_start'),
        ('all', 'target'),
    ],
    names=['P_CLIENTROLE', 'columns'])

In [13]:
# Independent copy of just the graph columns, in the order given by graph_multi_index.
graph_trans_df = joined_off_ops[graph_multi_index].copy()

In [14]:
# NOTE(review): pickle is already imported in the first cell; this repeat is redundant.
import pickle

# Persist the frame (still with its two-level columns) in the working directory.
with open('graph_trans_df.pkl', 'wb') as handle:
    pickle.dump(graph_trans_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Flat column names for the exported graph table, positionally matching the
# nine columns selected by graph_multi_index (roles 1 / 2 / 5 become
# id_sender / id_receiver / id_benef). pd.Index accepts neither `labels`
# nor `names`; a single-level index takes `name`.
graph_index = pd.Index(data=[
    'ID',
    'P_BASEAMOUNT',
    'id_sender',
    'id_receiver',
    'id_benef',
    'P_EKNPCODE',
    'P_OPERATIONDATETIME',
    'seconds_from_start',
    'target'], name='columns')

In [None]:
# Replace the two-level column index with the flat names (positional match).
# This mutates graph_trans_df in place.
graph_trans_df.columns = graph_index

In [None]:
# Write the graph table as CSV (without the row index) to the working directory.
with Timer('writing graph_trans_df to disk'):
    graph_trans_df.to_csv('graph_trans_df.csv', index=False)

In [None]:
def get_second_level_columns(df, columns, copy=False):
    """Select the columns of ``df`` whose second-level label is in ``columns``.

    ``df`` must have column labels that can be indexed with ``[1]`` (e.g. a
    two-level MultiIndex). With ``copy=True`` an independent copy of the
    selection is returned, otherwise the plain ``.loc`` result.
    """
    selected = df.loc[:, df.columns.map(lambda label: label[1] in columns)]
    return selected.copy() if copy else selected

# Categorical part of the joined table: every (role, column) whose column
# name is in either one-hot list.
joined_off_ops_cat = get_second_level_columns(joined_off_ops, 
                                              set(selected_off_ops_ohe_columns).union(set(selected_off_members_ohe_columns)))

# Numeric part, selected the same way from the two numeric lists.
joined_off_ops_num = get_second_level_columns(joined_off_ops,
                                              set(selected_off_ops_numeric_columns).union(set(selected_off_members_numeric_columns)))

### Select only operations with existing clients with roles 1 and 2 (transfers)

In [None]:
# Keep only operations where both the role-1 and the role-2 client id are present.
joined_direct_trans = joined_off_ops.loc[
    joined_off_ops[1, 'P_CLIENTID'].notnull() & joined_off_ops[2, 'P_CLIENTID'].notnull(),
    :,
]

### Drop columns with client role 5 (beneficiary)

In [None]:
# Remove every column whose first-level label is 5.
joined_direct_transfers = joined_direct_trans.drop(5, axis=1, level=0)

In [None]:
# Split the transfer table into categorical columns, numeric columns and the target.
joined_direct_transfers_cat = get_second_level_columns(joined_direct_transfers, 
                                              set(selected_off_ops_ohe_columns).union(set(selected_off_members_ohe_columns)))

joined_direct_transfers_num = get_second_level_columns(joined_direct_transfers,
                                              set(selected_off_ops_numeric_columns).union(set(selected_off_members_numeric_columns)))

joined_direct_transfers_target = joined_direct_transfers['all', 'target']

In [16]:
# Show which (role, column) pairs ended up in the numeric block.
print(joined_direct_transfers_num.columns)

MultiIndex(levels=[[1, 2, 5, 'all'], ['ID', 'P_BANK', 'P_BANKCITY', 'P_BANKCOUNTRYCODE', 'P_BANKNAME', 'P_BANK_CLIENT', 'P_BASEAMOUNT', 'P_BRANCH', 'P_BS_OPER_TYPE', 'P_CLIENTID', 'P_CLIENT_TYPE', 'P_COUNTRYCODE', 'P_CURRENCYCODE', 'P_DOCCATEGORY', 'P_EKNPCODE', 'P_ISSUEDBID', 'P_KFM_OPER_REASON', 'P_OPERATIONDATETIME', 'P_ORGFORM', 'P_SDP', 'P_TOEXTRACTBOOL', 'P_WAS_SEND', 'acc_persistence', 'seconds_from_last_client_op', 'seconds_from_start', 'target']],
           labels=[[3, 0, 0, 1, 1], [6, 23, 22, 23, 22]],
           names=['P_CLIENTROLE', 'columns'])


In [17]:
# Share of missing values per numeric column.
joined_direct_transfers_num.isnull().mean(0)

P_CLIENTROLE  columns                    
all           P_BASEAMOUNT                   0.0
1             seconds_from_last_client_op    0.0
              acc_persistence                0.0
2             seconds_from_last_client_op    0.0
              acc_persistence                0.0
dtype: float64

In [18]:
# Peek at the first numeric column as a raw array.
joined_direct_transfers_num.values[:, 0]

array([151075.01, 136206.83,  59613.06, ...,  10053.23, 265901.94,
        29665.83])

In [None]:
from sklearn.decomposition import PCA

### Here we use n_components = max(1, int(log1p(number of one-hot columns))) for each categorical column

In [17]:
# Number of distinct values per categorical column.
joined_direct_transfers_cat.nunique()

P_CLIENTROLE  columns          
all           P_ISSUEDBID             5
              P_CURRENCYCODE         29
              P_EKNPCODE            287
              P_DOCCATEGORY          13
              P_TOEXTRACTBOOL         2
              P_KFM_OPER_REASON      11
              P_BS_OPER_TYPE        164
              P_WAS_SEND              3
1             P_COUNTRYCODE          64
              P_BANK                564
              P_BANKCITY             88
              P_SDP                   2
              P_BANK_CLIENT           2
              P_CLIENT_TYPE           3
              P_BANKCOUNTRYCODE      73
              P_BANKNAME            536
              P_ORGFORM              15
2             P_COUNTRYCODE         101
              P_BANK               1357
              P_BANKCITY            250
              P_SDP                   2
              P_BANK_CLIENT           2
              P_CLIENT_TYPE           3
              P_BANKCOUNTRYCODE      86
        

In [None]:
# One dummy-encoded frame per (role, column) categorical feature.
with Timer('building one-hot representation for every categorical column'):
    joined_direct_transfers_ohe_dict = {
        (clientrole, colname): pd.get_dummies(joined_direct_transfers_cat[clientrole][[colname]],
                                columns=[colname],
                                prefix=[colname])
        for clientrole, colname in joined_direct_transfers_cat.columns
    }

# One PCA per feature, keeping max(1, int(log1p(#dummy columns))) components.
# random_state is fixed so that a randomized SVD solver, if PCA selects one,
# gives the same projection on every run.
with Timer('PCA initializating for every column'):
    pca_ohe = {
        (clientrole, colname): PCA(n_components=max(1, int(np.log1p(len(ohe_df.columns)))),
                                   random_state=0)
        for (clientrole, colname), ohe_df in joined_direct_transfers_ohe_dict.items()
    }

In [19]:
# Positional split: the first 70% of the rows form the training part.
train_rate = 0.7
train_size = int(train_rate * len(joined_direct_transfers))

In [22]:
# PCA projections of every one-hot block, keyed by (role, column).
transformed_train_ohe = {}
transformed_test_ohe = {}

for role_colname in tqdm(joined_direct_transfers_cat.columns):
    vals = joined_direct_transfers_ohe_dict[role_colname].values.astype(np.float32)
    # Fit on the training rows only, then apply the fitted PCA to the rest.
    transformed_train_ohe[role_colname] = pca_ohe[role_colname].fit_transform(vals[:train_size])
    transformed_test_ohe[role_colname] = pca_ohe[role_colname].transform(vals[train_size:])
    # Drop the float32 copy before the next column is materialised.
    del vals

100%|██████████| 26/26 [19:45<00:00, 45.59s/it]


In [23]:
# Put the per-feature PCA blocks side by side, in categorical column order.
train_ohe_pca = np.hstack([transformed_train_ohe[key] for key in joined_direct_transfers_cat.columns])
test_ohe_pca = np.hstack([transformed_test_ohe[key] for key in joined_direct_transfers_cat.columns])

In [35]:
# Earlier, disabled attempt at listing the numeric (role, column) pairs explicitly:
#non_ohe_role_columns = (list([('all', colname) for colname in selected_off_ops_numeric_columns])
#                        + list(pd.MultiIndex.from_product([[1, 2, 5], selected_off_members_numeric_columns])))
# Numeric features as float32, split at the same position as the one-hot blocks.
vals_non_ohe = joined_direct_transfers_num.values.astype(np.float32)
train_non_ohe = vals_non_ohe[:train_size]
test_non_ohe = vals_non_ohe[train_size:]

In [37]:
# Final design matrices: PCA-compressed categorical features followed by the numeric ones.
xtrain, xtest = np.concatenate([train_ohe_pca, train_non_ohe], axis=1), np.concatenate([test_ohe_pca, test_non_ohe], axis=1)
# Targets split at the same row position.
ytrain, ytest = joined_direct_transfers_target.values[:train_size], joined_direct_transfers_target.values[train_size:]

In [40]:
from sklearn.ensemble import IsolationForest

In [43]:
# Contamination is set to the share of training rows labelled 1.
# NOTE(review): no random_state is passed, so the fitted forest may differ between runs - confirm whether that matters.
isof = IsolationForest(n_estimators=1000, contamination = (ytrain == 1).mean(), n_jobs=-1)

In [44]:
# NOTE(review): the recorded run of this cell ended with MemoryError.
isof.fit(xtrain)

MemoryError: 

In [47]:
# Re-assemble the full feature matrix and target vector (train rows first).
X, y = np.concatenate([xtrain, xtest], 0), np.concatenate([ytrain, ytest], 0)

In [48]:
#np.save('xtrain.npy', xtrain)
#np.save('xtest.npy', xtest)
#np.save('ytrain.npy', ytrain)
#np.save('ytest.npy', ytest)

In [49]:
# NOTE(review): the np.save calls in the preceding cell are commented out, so
# on a fresh run this load depends on an xtest.npy that already exists on disk.
kek = np.load('xtest.npy')

In [22]:
# Share of P_REGOPENDATE values that do not parse with the expected format (coerced to NaT).
pd.to_datetime(off_members.P_REGOPENDATE, errors='coerce', format='%Y-%m-%d %H:%M:%S').isnull().mean()

0.7487276525538096

In [25]:
# Operation timestamp for every member row, looked up by label in off_ops' index.
# NOTE(review): relies on off_ops being indexed by its 0-based ID - confirm.
op_datetime = off_ops.P_OPERATIONDATETIME[off_members.P_OPERATIONID.values]

In [None]:
# The original cell was an unparseable paste fragment; this is the complete
# expression it was cut from. Values that do not match the format become NaT.
pd.to_datetime(off_members['P_REGOPENDATE'],
               errors='coerce',
               format='%Y-%m-%d %H:%M:%S')

In [None]:
def fill_unfreq(series, min_freq=100, filler='UNFREQ'):
    """Return a copy of ``series`` with rare values replaced by ``filler``.

    A value is rare when it occurs fewer than ``min_freq`` times in
    ``series``. Missing values are not counted by ``value_counts`` and are
    therefore left untouched. The input series is not modified.

    Parameters
    ----------
    series : pandas.Series
    min_freq : int
        Minimum number of occurrences for a value to be kept.
    filler : object
        Replacement for rare values.

    Returns
    -------
    pandas.Series
        Same index as ``series``.
    """
    counts = series.value_counts()
    # Membership must be tested against the rare *values* (the index of the
    # filtered counts), not against the counts themselves.
    rare_values = counts.index[counts < min_freq]
    # where() keeps entries for which the condition holds and upcasts if needed.
    return series.where(~series.isin(rare_values), filler)

# off_members columns fed to the per-column processing pipelines below.
# The first entry is handled separately (date parsing).
useful_off_members_solumns = [
    'P_REGOPENDATE',
    'P_COUNTRYCODE',
    'P_BANKCITY',
    'P_SDP',
    'P_BANK_CLIENT',
    'P_CLIENT_TYPE',
    'P_CLIENTROLE',
    'P_BANKCOUNTRYCODE',
    'P_BANKNAME',
    'P_ORGFORM',
    'seconds_from_start',
    'seconds_from_last_client_op'
]



    
# Ordered mapping from column name to a zero-argument callable that builds the
# processed column from off_members when it is called (nothing runs at definition time).
selected_off_members_columns_processing_pipeline = OrderedDict([
    # Unparseable dates become NaT.
    ('P_REGOPENDATE', lambda:  pd.to_datetime(off_members['P_REGOPENDATE'], errors='coerce', format='%Y-%m-%d %H:%M:%S')),
    # Categorical columns passed through fill_unfreq (default min_freq unless given).
    ('P_COUNTRYCODE', lambda: fill_unfreq(off_members['P_COUNTRYCODE'])),
    ('P_BANKCITY', lambda: fill_unfreq(off_members['P_BANKCITY'])),
    # Columns taken as plain copies.
    ('P_SDP', lambda: off_members['P_SDP'].copy()),
    ('P_BANK_CLIENT', lambda: off_members['P_BANK_CLIENT'].copy()),
    ('P_CLIENT_TYPE', lambda: off_members['P_CLIENT_TYPE'].copy()),
    ('P_CLIENTROLE', lambda: off_members['P_CLIENTROLE'].copy()),
    ('P_BANKCOUNTRYCODE', lambda: fill_unfreq(off_members['P_BANKCOUNTRYCODE'], min_freq=75)),
    ('P_BANKNAME', lambda: fill_unfreq(off_members['P_BANKNAME'], min_freq=75)),
    ('P_ORGFORM', lambda: off_members['P_ORGFORM'].copy()),
    ('seconds_from_start', lambda: off_members['seconds_from_start'].copy()),
    ('seconds_from_last_client_op', lambda: off_members['seconds_from_last_client_op'].copy()),
])


# Simplified pipeline: parse the date column, copy every other column as is.
simple_off_members_columns_pipeline = OrderedDict(
    [('P_REGOPENDATE', lambda:  pd.to_datetime(off_members['P_REGOPENDATE'],
                                               errors='coerce',
                                               format='%Y-%m-%d %H:%M:%S'))]
    # Each (name, callable) pair must be parenthesised inside the comprehension.
    # `colname=colname` binds the current name to each lambda; without it every
    # lambda would read the loop variable's final value.
    + [(colname, lambda colname=colname: off_members[colname].copy())
       for colname in useful_off_members_solumns[1:]])


# Hand-written version of the per-column processing: every result is stored
# in its own p_* variable.
# NOTE(review): the rarity threshold here is `<= 100`, whereas fill_unfreq
# uses `< min_freq` - confirm which boundary is intended.
with Timer('process selected fields from off_members'):
    # There are some weird dates in P_REGOPENDATE; errors='coerce' turns them into NaT
    p_regopendate = pd.to_datetime(off_members.P_REGOPENDATE, errors='coerce', format='%Y-%m-%d %H:%M:%S')

    # Rare country codes are set to 'UNFREQ', NaNs to 'UNKNOWN'
    p_countrycode = off_members.P_COUNTRYCODE.copy()
    countrycode_counts = p_countrycode.value_counts()
    p_countrycode[p_countrycode.isin(countrycode_counts[countrycode_counts <= 100].index)] = 'UNFREQ'
    p_countrycode[p_countrycode.isnull()] = 'UNKNOWN'

    # Same for bank city
    p_bankcity = off_members.P_BANKCITY.copy()
    bankcity_counts = p_bankcity.value_counts()
    p_bankcity[p_bankcity.isin(bankcity_counts[bankcity_counts <= 100].index)] = 'UNFREQ'
    p_bankcity[p_bankcity.isnull()] = 'UNKNOWN'

    # For bank there are only a few NaN values, so they are set to 'UNFREQ' as well
    p_bank = off_members.P_BANK.copy()
    bank_counts = p_bank.value_counts()
    p_bank[p_bank.isin(bank_counts[bank_counts <= 100].index)] = 'UNFREQ'
    p_bank[p_bank.isnull()] = 'UNFREQ'

    # SDP has only NaN/13.0 values, so simply encode NaN as -1000
    p_sdp = off_members.P_SDP.copy()
    p_sdp[p_sdp.isnull()] = -1000

    # No missing values
    p_bank_client = off_members.P_BANK_CLIENT.copy()

    # No missing values
    p_client_type = off_members.P_CLIENT_TYPE.copy()

    # No missing values
    p_clientrole = off_members.P_CLIENTROLE.copy()
    p_seconds_from_start = off_members.seconds_from_start.copy()
    p_seconds_from_last_client_op = off_members.seconds_from_last_client_op.copy()

In [11]:
# Share of missing values in every off_members column.
off_members.isnull().mean(0)

ID                             0.0
P_OPERATIONID                  0.0
P_CLIENTID                     0.0
P_BSCLIENTID                   0.0
P_BANK_CLIENT                  0.0
P_REGOPENDATE                  0.0
P_COUNTRYCODE                  0.0
P_CLIENT_TYPE                  0.0
P_CLIENTROLE                   0.0
P_CLIENTKIND                   0.0
P_ACCOUNT                      0.0
P_BSACCOUNT                    0.0
P_BANK                         0.0
P_BANKCOUNTRYCODE              0.0
P_BANKNAME                     0.0
P_IPDL                         0.0
P_USERNAME                     0.0
P_SDP                          0.0
P_ORGFORM                      0.0
P_BANKCITY                     0.0
P_OPER_DATE                    0.0
P_OPERATIONDATE                0.0
seconds_from_start             0.0
seconds_from_last_client_op    0.0
dtype: float64