In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder

In [3]:
data_folder = '../../data/raif/'

In [4]:
data = pd.read_csv(os.path.join(data_folder, 'transactions_last_2.csv'), delimiter=';')

In [5]:
data.head()

Unnamed: 0,purchdate,amount,mcc,mrchcity,mrchname,cnum
0,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
1,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
2,2019-10-25 00:00:00,7716900.0,5691,MOSCOW,HXL4K TL4EEXD,ELETCO
3,2019-11-29 00:00:00,5100000.0,6513,SANKT-PETERBU,7X 1E7HXD H1DKW1IKCL,EL0KSF
4,2019-12-03 00:00:00,5023956.6,5944,MOSKVA,73 7HJXK7 (W7X),MPKIJJ


In [6]:
data.shape

(104161408, 6)

In [7]:
data['cnum'].nunique()

1355925

In [8]:
status =  pd.read_csv(os.path.join(data_folder, 'clients_last_2_fixed.csv'), delimiter=';')

In [9]:
status.head()

Unnamed: 0,cnum_,categorycode,gender,age,married_,residenttype
0,0CCCDO,81,M,32,not_married,R
1,0CCCFO,70,F,42,not_married,R
2,0CCCGC,50,F,33,married,R
3,0CCCGG,70,M,58,not_married,R
4,0CCCGO,60,M,38,married,R


In [10]:
status['cnum_'].nunique()

1355925

In [11]:
status['married_'].unique()

array(['not_married', 'married'], dtype=object)

### Balancing number of married / not_married id

In [12]:
status['married_'].value_counts() / len(status) * 100

not_married    64.433579
married        35.566421
Name: married_, dtype: float64

In [13]:
status.groupby('gender')['married_'].value_counts() / len(status) * 100

gender  married_   
F       not_married    32.564117
        married        14.673599
M       not_married    31.869462
        married        20.892822
Name: married_, dtype: float64

### Composing transaction histories for each id

In [14]:
data.head()

Unnamed: 0,purchdate,amount,mcc,mrchcity,mrchname,cnum
0,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
1,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
2,2019-10-25 00:00:00,7716900.0,5691,MOSCOW,HXL4K TL4EEXD,ELETCO
3,2019-11-29 00:00:00,5100000.0,6513,SANKT-PETERBU,7X 1E7HXD H1DKW1IKCL,EL0KSF
4,2019-12-03 00:00:00,5023956.6,5944,MOSKVA,73 7HJXK7 (W7X),MPKIJJ


In [15]:
data['mcc'].nunique()

394

In [16]:
data.shape

(104161408, 6)

In [17]:
data.head()

Unnamed: 0,purchdate,amount,mcc,mrchcity,mrchname,cnum
0,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
1,2019-11-02 00:00:00,9000000.0,5691,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
2,2019-10-25 00:00:00,7716900.0,5691,MOSCOW,HXL4K TL4EEXD,ELETCO
3,2019-11-29 00:00:00,5100000.0,6513,SANKT-PETERBU,7X 1E7HXD H1DKW1IKCL,EL0KSF
4,2019-12-03 00:00:00,5023956.6,5944,MOSKVA,73 7HJXK7 (W7X),MPKIJJ


In [18]:
encoder = LabelEncoder()
data['mcc'] = encoder.fit_transform(data['mcc'])

In [20]:
len(np.unique(data['mcc']))

394

In [21]:
np.max(data['mcc'])

393

In [185]:
data['purchdate'] = pd.to_datetime(data['purchdate'], format='%Y-%m-%d %H:%M:%S')

In [186]:
data.head()

Unnamed: 0,purchdate,amount,mcc,mrchcity,mrchname,cnum
0,2019-11-02,9000000.0,214,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
1,2019-11-02,9000000.0,214,MOSCOW,IA7D1H Z1D1T7,MJNLMQ
2,2019-10-25,7716900.0,214,MOSCOW,HXL4K TL4EEXD,ELETCO
3,2019-11-29,5100000.0,286,SANKT-PETERBU,7X 1E7HXD H1DKW1IKCL,EL0KSF
4,2019-12-03,5023956.6,247,MOSKVA,73 7HJXK7 (W7X),MPKIJJ


In [187]:
data.sort_values(by='purchdate', inplace=True)

In [188]:
data.head()

Unnamed: 0,purchdate,amount,mcc,mrchcity,mrchname,cnum
46308876,2019-09-01,428.2,236,N.NOVGOROD,7WE1IADOO WLD3E MBV5,EEUMYJ
99913237,2019-09-01,30.0,231,KIROV,Trwhmwc-Talunl,ELNJV5
99913235,2019-09-01,30.0,231,KIROV,Trwhmwc-Talunl,ELNJV5
24234483,2019-09-01,1140.0,130,SANKT-PETERBU,ZHX07H 7TEX,EE01QH
99913230,2019-09-01,30.0,231,KIROV,Trwhmwc-Talunl,ELNJV5


In [189]:
df = data.groupby('cnum')['mcc'].agg(lambda x: list(x))

In [190]:
df = pd.DataFrame(df)
df.reset_index(inplace=True)

In [191]:
df.head()

Unnamed: 0,cnum,mcc
0,0CCCDO,"[269, 187, 166, 282, 212, 282, 208, 269, 187, ..."
1,0CCCFO,"[281, 281, 281, 281]"
2,0CCCGC,"[187, 192, 187, 198, 127, 231, 231, 229, 187, ..."
3,0CCCGG,"[187, 198, 187, 198, 127, 187, 187, 198, 187, ..."
4,0CCCGO,"[187, 187, 130, 130, 236, 187, 231, 231, 187, ..."


In [192]:
df = df.merge(status, how='inner', left_on='cnum', right_on='cnum_')

In [193]:
df = df[['cnum', 'mcc', 'married_']]

In [194]:
df.head()

Unnamed: 0,cnum,mcc,married_
0,0CCCDO,"[269, 187, 166, 282, 212, 282, 208, 269, 187, ...",not_married
1,0CCCFO,"[281, 281, 281, 281]",not_married
2,0CCCGC,"[187, 192, 187, 198, 127, 231, 231, 229, 187, ...",married
3,0CCCGG,"[187, 198, 187, 198, 127, 187, 187, 198, 187, ...",not_married
4,0CCCGO,"[187, 187, 130, 130, 236, 187, 231, 231, 187, ...",married


In [195]:
df['married_'] = df['married_'].map({'not_married': 0, 'married': 1})

In [196]:
df.head()

Unnamed: 0,cnum,mcc,married_
0,0CCCDO,"[269, 187, 166, 282, 212, 282, 208, 269, 187, ...",0
1,0CCCFO,"[281, 281, 281, 281]",0
2,0CCCGC,"[187, 192, 187, 198, 127, 231, 231, 229, 187, ...",1
3,0CCCGG,"[187, 198, 187, 198, 127, 187, 187, 198, 187, ...",0
4,0CCCGO,"[187, 187, 130, 130, 236, 187, 231, 231, 187, ...",1


In [197]:
df.shape

(1355925, 3)

In [198]:
all_l = []
for i in range(len(df)):
    all_l.append(len(df.loc[i, 'mcc']))

In [199]:
np.median(all_l)

36.0

In [200]:
np.max(all_l)

2690

In [201]:
np.quantile(all_l, 0.3)

12.0

In [202]:
df = df.loc[df['mcc'].apply(lambda x: len(x)) >= 10]

In [203]:
df.shape

(993528, 3)

In [204]:
df.head()

Unnamed: 0,cnum,mcc,married_
0,0CCCDO,"[269, 187, 166, 282, 212, 282, 208, 269, 187, ...",0
2,0CCCGC,"[187, 192, 187, 198, 127, 231, 231, 229, 187, ...",1
3,0CCCGG,"[187, 198, 187, 198, 127, 187, 187, 198, 187, ...",0
4,0CCCGO,"[187, 187, 130, 130, 236, 187, 231, 231, 187, ...",1
5,0CCCGS,"[187, 187, 187, 130, 325, 229, 359, 187, 127, ...",0


In [205]:
df['mcc'] = df['mcc'].apply(lambda x: x[-200:])

In [206]:
df.head()

Unnamed: 0,cnum,mcc,married_
0,0CCCDO,"[269, 187, 166, 282, 212, 282, 208, 269, 187, ...",0
2,0CCCGC,"[187, 192, 187, 198, 127, 231, 231, 229, 187, ...",1
3,0CCCGG,"[187, 198, 187, 198, 127, 187, 187, 198, 187, ...",0
4,0CCCGO,"[131, 231, 187, 231, 231, 231, 187, 231, 231, ...",1
5,0CCCGS,"[289, 229, 325, 128, 130, 130, 130, 187, 192, ...",0


In [207]:
df.rename({'cnum': 'id', 'married_': 'target'}, axis=1, inplace=True)

In [208]:
df.head()

Unnamed: 0,id,mcc,target
0,0CCCDO,"[269, 187, 166, 282, 212, 282, 208, 269, 187, ...",0
2,0CCCGC,"[187, 192, 187, 198, 127, 231, 231, 229, 187, ...",1
3,0CCCGG,"[187, 198, 187, 198, 127, 187, 187, 198, 187, ...",0
4,0CCCGO,"[131, 231, 187, 231, 231, 231, 187, 231, 231, ...",1
5,0CCCGS,"[289, 229, 325, 128, 130, 130, 130, 187, 192, ...",0


In [209]:
df['target'].value_counts()

0    632738
1    360790
Name: target, dtype: int64

In [210]:
zero_id = df.loc[df['target']==0]['id']

In [211]:
one_id = df.loc[df['target']==1]['id']

In [212]:
reduced_zero_id = np.random.choice(zero_id, size=len(df.loc[df['target']==1]), replace=False)

In [213]:
df = df.loc[df['id'].isin(list(one_id)+list(reduced_zero_id))]

In [214]:
df.head()

Unnamed: 0,id,mcc,target
0,0CCCDO,"[269, 187, 166, 282, 212, 282, 208, 269, 187, ...",0
2,0CCCGC,"[187, 192, 187, 198, 127, 231, 231, 229, 187, ...",1
4,0CCCGO,"[131, 231, 187, 231, 231, 231, 187, 231, 231, ...",1
5,0CCCGS,"[289, 229, 325, 128, 130, 130, 130, 187, 192, ...",0
6,0CCCHG,"[370, 214, 187, 187, 187, 229, 187, 187, 130, ...",1


In [215]:
df.reset_index(inplace=True, drop=True)

In [216]:
df['target'].sum() / len(df) * 100

50.0

In [217]:
df = df.sample(frac=1).reset_index(drop=True)

In [218]:
df.shape

(721580, 3)

In [219]:
df.head()

Unnamed: 0,id,mcc,target
0,EXCSSG,"[184, 185, 187, 236, 175, 187, 187, 236, 236, ...",0
1,8E4SUG,"[187, 231, 187, 187, 185, 231, 187, 231, 187, ...",0
2,RTPLNJ,"[198, 192, 231, 282, 198, 282, 236, 231, 231, ...",1
3,TMKMKL,"[281, 281, 187, 187, 279, 187, 187, 187, 187, ...",1
4,4HDXOX,"[187, 192, 223, 365, 365, 192, 192, 187, 187, ...",0


In [220]:
df['target'].sum() / len(df)

0.5

### Train-valid-test split

In [221]:
all_id = df['id'].values

In [222]:
train_id = all_id[:int(0.7 * len(all_id))]
valid_id = all_id[int(0.7 * len(all_id)):int(0.8 * len(all_id))]
test_id = all_id[int(0.8 * len(all_id)):]

In [223]:
train_df = df.loc[df['id'].isin(train_id)]
valid_df = df.loc[df['id'].isin(valid_id)]
test_df = df.loc[df['id'].isin(test_id)]

In [228]:
train_df['target'].sum() / len(train_df), valid_df['target'].sum() / len(valid_df), \
test_df['target'].sum() / len(test_df)

(0.4998030112550856, 0.49845480120289914, 0.5014620693478201)

In [224]:
len(train_id) / len(df), len(valid_id) / len(df), len(test_id) / len(df)

(0.6999986141522769, 0.10000138584772306, 0.2)

In [225]:
train_df.reset_index(inplace=True, drop=True)
valid_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

In [226]:
!pwd

/notebook/poison/poisonattack/DataPreprocessing


In [227]:
train_df.to_csv('../../data/processed_raif/train.csv')
valid_df.to_csv('../../data/processed_raif/valid.csv')
test_df.to_csv('../../data/processed_raif/test.csv')