In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder

In [113]:
# Directory the raw input CSVs are read from (relative path, resolved against
# the notebook's working directory).
data_folder = '../../data/age/'

In [114]:
# Load the raw per-transaction table from the data folder.
transactions_path = os.path.join(data_folder, 'transactions_train.csv')
data = pd.read_csv(transactions_path, sep=',')

In [115]:
# Preview the first rows of the transactions table as loaded.
data.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [116]:
# Re-map the raw small_group codes to integer labels; the fitted encoder is
# kept in `enc` so the mapping can be inspected or inverted later.
enc = LabelEncoder()
enc.fit(data['small_group'])
data['small_group'] = enc.transform(data['small_group'])

In [117]:
# Order transactions chronologically. A stable sort is requested explicitly:
# the default quicksort gives no ordering guarantee for rows that share the
# same trans_date, so same-day transactions could be shuffled relative to the
# input file before they are collected into per-client sequences.
# NOTE(review): assumes the file order within a day is meaningful - confirm.
data.sort_values(by='trans_date', axis=0, kind='mergesort', inplace=True)

In [118]:
# Replace the index left over from sorting with a fresh 0..n-1 index,
# discarding the old one.
data = data.reset_index(drop=True)

In [119]:
# Preview the table after label encoding and sorting by trans_date.
data.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,44379,0,52,62.535
1,43594,0,125,10.524
2,43594,0,36,86.255
3,5882,0,12,5.132
4,5882,0,18,11.678


In [120]:
# Number of distinct (label-encoded) small_group values.
data['small_group'].nunique()

202

In [121]:
# Collect every client's small_group codes into one Python list per client,
# keeping the rows in their current (date-sorted) order.
small_group_by_client = data.groupby('client_id')['small_group']
df = small_group_by_client.agg(lambda codes: list(codes))

In [122]:
# Turn the grouped Series into a DataFrame and move client_id out of the
# index into a regular column.
df = pd.DataFrame(df).reset_index()

In [123]:
# Preview the per-client code sequences.
df.head()

Unnamed: 0,client_id,small_group
0,4,"[1, 3, 1, 1, 1, 4, 1, 1, 36, 15, 1, 1, 34, 1, ..."
1,6,"[15, 3, 1, 3, 1, 15, 15, 3, 37, 11, 3, 1, 3, 1..."
2,7,"[3, 19, 1, 4, 11, 25, 3, 1, 18, 1, 1, 25, 9, 2..."
3,10,"[9, 19, 1, 0, 11, 1, 18, 24, 1, 15, 1, 3, 1, 1..."
4,11,"[3, 25, 1, 1, 11, 1, 25, 1, 1, 11, 3, 1, 22, 2..."


In [124]:
# Load the per-client target table from the data folder.
target_path = os.path.join(data_folder, 'train_target.csv')
target = pd.read_csv(target_path, sep=',')

In [125]:
# Preview the target table.
target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [126]:
# Attach the target to each client's sequence; the inner join keeps only
# clients present in both tables.
df = pd.merge(df, target, on='client_id', how='inner')

In [127]:
# Preview the sequences together with their target bin.
df.head()

Unnamed: 0,client_id,small_group,bins
0,4,"[1, 3, 1, 1, 1, 4, 1, 1, 36, 15, 1, 1, 34, 1, ...",1
1,6,"[15, 3, 1, 3, 1, 15, 15, 3, 37, 11, 3, 1, 3, 1...",1
2,7,"[3, 19, 1, 4, 11, 25, 3, 1, 18, 1, 1, 25, 9, 2...",0
3,10,"[9, 19, 1, 0, 11, 1, 18, 24, 1, 15, 1, 3, 1, 1...",3
4,11,"[3, 25, 1, 1, 11, 1, 25, 1, 1, 11, 3, 1, 22, 2...",3


In [128]:
# Class distribution of the original bins before they are collapsed.
df['bins'].value_counts()

2    7560
3    7509
1    7497
0    7434
Name: bins, dtype: int64

In [129]:
# Collapse the bins into a binary label: bins 0 and 1 become 0, every other
# value becomes 1.
# NOTE: this cell is not idempotent - re-running it on the already binarized
# column maps every row to 0, because both 0 and 1 fall in the "0" group.
is_low_bin = df['bins'].isin([0, 1])
df['bins'] = (~is_low_bin).astype('int64')

In [130]:
# Preview the frame after the bins were collapsed to a binary label.
df.head()

Unnamed: 0,client_id,small_group,bins
0,4,"[1, 3, 1, 1, 1, 4, 1, 1, 36, 15, 1, 1, 34, 1, ...",0
1,6,"[15, 3, 1, 3, 1, 15, 15, 3, 37, 11, 3, 1, 3, 1...",0
2,7,"[3, 19, 1, 4, 11, 25, 3, 1, 18, 1, 1, 25, 9, 2...",0
3,10,"[9, 19, 1, 0, 11, 1, 18, 24, 1, 15, 1, 3, 1, 1...",1
4,11,"[3, 25, 1, 1, 11, 1, 25, 1, 1, 11, 3, 1, 22, 2...",1


In [131]:
# Class distribution of the binary label.
df['bins'].value_counts()

1    15069
0    14931
Name: bins, dtype: int64

In [132]:
# Switch to the column names used from here on: id / mcc / target.
df = df.rename(columns={'client_id': 'id', 'small_group': 'mcc', 'bins': 'target'})

In [133]:
# Preview the frame with the renamed columns.
df.head()

Unnamed: 0,id,mcc,target
0,4,"[1, 3, 1, 1, 1, 4, 1, 1, 36, 15, 1, 1, 34, 1, ...",0
1,6,"[15, 3, 1, 3, 1, 15, 15, 3, 37, 11, 3, 1, 3, 1...",0
2,7,"[3, 19, 1, 4, 11, 25, 3, 1, 18, 1, 1, 25, 9, 2...",0
3,10,"[9, 19, 1, 0, 11, 1, 18, 24, 1, 15, 1, 3, 1, 1...",1
4,11,"[3, 25, 1, 1, 11, 1, 25, 1, 1, 11, 3, 1, 22, 2...",1


In [134]:
# (rows, columns) of the per-client frame.
df.shape

(30000, 3)

In [135]:
# Length of every client's mcc sequence, as a plain list of ints.
# Computed with a vectorized map instead of a Python loop over
# df.loc[i, 'mcc']: the loop did one scalar label lookup per row and only
# worked while the index was exactly 0..len(df)-1.
all_l = df['mcc'].map(len).tolist()

In [136]:
# Median sequence length.
np.median(all_l)

863.0

In [137]:
# 10th percentile of the sequence lengths.
np.quantile(all_l, 0.1)

728.0

In [138]:
# 60th percentile of the sequence lengths.
np.quantile(all_l, 0.6)

904.0

In [139]:
# Longest sequence length.
np.max(all_l)

1150

In [140]:
# Cap every sequence at its last MAX_SEQ_LEN entries; shorter sequences are
# left unchanged. Because rows were sorted by trans_date before grouping, the
# tail of a list holds the latest transactions.
MAX_SEQ_LEN = 900
df['mcc'] = df['mcc'].apply(lambda seq: seq[-MAX_SEQ_LEN:])

In [141]:
# Class distribution of the target before balancing.
df['target'].value_counts()

1    15069
0    14931
Name: target, dtype: int64

In [142]:
# Balance the classes by randomly downsampling to the size of the smaller
# class. A dedicated, seeded generator makes the selection repeatable across
# kernel restarts; the previous version drew from the unseeded global NumPy
# RNG, so every run kept a different subset of clients.
rng = np.random.RandomState(42)

zero_id = df.loc[df['target'] == 0, 'id']
one_id = df.loc[df['target'] == 1, 'id']

# Sampling both classes down to the minority size also covers the case where
# class 1 is the smaller one; drawing len(zero_id) ids from it without
# replacement would raise in that situation.
n_per_class = min(len(zero_id), len(one_id))
reduced_zero_id = rng.choice(zero_id.values, size=n_per_class, replace=False)
reduced_one_id = rng.choice(one_id.values, size=n_per_class, replace=False)

kept_id = np.concatenate([reduced_zero_id, reduced_one_id])
df = df.loc[df['id'].isin(kept_id)]
df.reset_index(inplace=True, drop=True)

# Share of positive labels in percent (50.0 when the classes are balanced).
df['target'].sum() / len(df) * 100

50.0

In [143]:
# Shuffle the rows and renumber the index. The train/valid/test split below
# slices the ids in this order, so the shuffle is seeded to make the split
# reproducible from a fresh kernel.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### Train-valid-test split

In [144]:
# Client ids as a NumPy array, in the current (shuffled) row order.
all_id = df['id'].to_numpy()

In [145]:
# Cut the id array into consecutive 70% / 10% / 20% slices
# (train / valid / test); the boundaries are truncated to whole positions.
n_ids = len(all_id)
train_end = int(0.7 * n_ids)
valid_end = int(0.8 * n_ids)

train_id = all_id[:train_end]
valid_id = all_id[train_end:valid_end]
test_id = all_id[valid_end:]

In [146]:
# Build each split by keeping the rows whose id belongs to that split.
train_df = df[df['id'].isin(train_id)]
valid_df = df[df['id'].isin(valid_id)]
test_df = df[df['id'].isin(test_id)]

In [147]:
# Fraction of positive labels in each split (train, valid, test) - a sanity
# check that the splits stay close to balanced.
train_df['target'].sum() / len(train_df), valid_df['target'].sum() / len(valid_df), \
test_df['target'].sum() / len(test_df)

(0.502368081136679, 0.4939718687206966, 0.49472626820693116)

In [148]:
# Share of all ids that landed in each split (train, valid, test).
len(train_id) / len(df), len(valid_id) / len(df), len(test_id) / len(df)

(0.6999866050498962, 0.09999330252494809, 0.20002009242515573)

In [149]:
# Give every split its own fresh 0..n-1 index, dropping the row labels
# inherited from df.
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [150]:
# Persist the three splits as CSV. The output directory is created first so
# the writes do not fail when it does not exist yet (e.g. on a fresh checkout).
# NOTE: the default index=True is kept, so each file starts with an unnamed
# index column, and the list-valued 'mcc' column is written as its string
# form - readers have to parse it back into a list.
out_dir = '../../data/processed_age/'
os.makedirs(out_dir, exist_ok=True)

train_df.to_csv(out_dir + 'train.csv')
valid_df.to_csv(out_dir + 'valid.csv')
test_df.to_csv(out_dir + 'test.csv')