In [1]:
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)
import os
import sys
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import get_categorical_features, get_numeric_features, reduce_mem_usage
import datetime

from tqdm import tqdm
import time
import sys

os.listdir('../input/')
# Column that identifies a card and is used as the join key throughout the notebook.
key = 'card_id'
# Name of the prediction target column.
target = 'target'
# Columns collected under the name `ignore_list`.
# Fixed typo: 'first_avtive_month' -> 'first_active_month' so the entry matches
# the real column name used elsewhere in this notebook.
ignore_list = [key, target, 'merchant_id', 'first_active_month']

### Data Load

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that can hold their range.

    NOTE: this definition shadows the ``reduce_mem_usage`` imported from
    ``utils`` at the top of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this very object, so the
        caller's frame is modified as well as returned.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.

    Notes
    -----
    * Only columns whose dtype is int16/32/64 or float16/32/64 are touched.
    * Range checks are inclusive, so a column whose extreme value sits exactly
      on a dtype limit (e.g. 127 for int8) still gets the narrower dtype.
    * Float columns are chosen by range only; float16 keeps roughly three
      significant decimal digits, so downcast values may lose precision.
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Take the first (narrowest) integer type whose range covers the data.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (also reached when min/max are NaN or inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Read the pickled train / test tables and index each of them by the card key.
df_train = utils.read_df_pkl('../input/train0*').set_index(key)
df_test = utils.read_df_pkl('../input/test0*').set_index(key)

# Read the two transaction tables (historical and new-merchant).
df_hist = utils.read_df_pkl('../input/histori*0*')
df_new = utils.read_df_pkl('../input/new_mer*0*')

# Downcast every loaded frame, in the same order as they were read.
df_train, df_test, df_hist, df_new = (
    reduce_mem_usage(loaded) for loaded in (df_train, df_test, df_hist, df_new)
)

100%|██████████| 3/3 [00:00<00:00, 96.84it/s]
100%|██████████| 3/3 [00:00<00:00, 192.13it/s]
100%|██████████| 3/3 [00:26<00:00,  8.67s/it]
100%|██████████| 3/3 [00:00<00:00,  3.61it/s]


Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)
Mem. usage decreased to 1971.22 Mb (40.8% reduction)
Mem. usage decreased to 140.41 Mb (37.5% reduction)


In [3]:
# Impute missing first_active_month values.
# Disabled alternative: take each card's earliest purchase month from df_hist.
# df_first = df_hist[[key, 'purchase_date']].groupby(key)['purchase_date'].min()
# df_first.name = 'first_date'
# df_first =  df_first.map(lambda x: str(x)[:7])

# df_train = df_train.join(df_first)
# df_train['first_active_month'] =  df_train[['first_active_month', 'first_date']].apply(lambda x:x[1]  if x[0]!=x[0] else x[0], axis=1)
# df_test = df_test.join(df_first)
# df_test['first_active_month'] =  df_test[['first_active_month', 'first_date']].apply(lambda x:x[1]  if x[0]!=x[0] else x[0], axis=1)

# fillna returns a new Series; the result has to be assigned back, otherwise the
# missing values stay in df_test and propagate into the merges below.
df_test['first_active_month'] = df_test['first_active_month'].fillna('2017-03')

# Attach first_active_month to every transaction row via the card key.
# The inner join drops transactions whose card is in neither train nor test.
train_test = pd.concat([df_train, df_test], axis=0).reset_index()
first_active = train_test[[key, 'first_active_month']]
df_hist = df_hist.merge(first_active, how='inner', on=key)
df_new = df_new.merge(first_active, how='inner', on=key)

In [4]:
# Basic cleaning and date-derived columns, applied in place to both transaction tables.
# NOTE: this cell is not idempotent -- re-running it re-encodes the already
# numeric flags (everything becomes 1) and shifts purchase_amount again.
for df in [df_hist, df_new]:
    # Encode the flags as integers: 'N' -> 0, anything else -> 1.
    for flag_col in ('category_1', 'authorized_flag'):
        df[flag_col] = (df[flag_col] != 'N').astype('int64')

    # Shift amounts so that the smallest value in this frame becomes 1e-5.
    # NOTE(review): the shift uses each frame's own minimum, so df_hist and
    # df_new may be offset by different amounts -- confirm this is intended.
    df['purchase_amount'] = df['purchase_amount'] - df['purchase_amount'].min() + 1.0e-5

    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    # Days between first_active_month and the fixed reference date 2018-05-01.
    df['all_term'] = (pd.to_datetime('2018-05-01') - df['first_active_month']).dt.days

    # assumes purchase_date has no missing values -- TODO confirm
    purchase_dt = df['purchase_date'].dt
    df['yyyymm'] = purchase_dt.strftime('%Y-%m')
    # Previously str(x)[:11], which kept the separator after the day and left
    # a trailing space in every value.
    df['yyyymmdd'] = purchase_dt.strftime('%Y-%m-%d')
    # ISO 8601 week number. (.dt.weekofyear is kept for the pandas version this
    # notebook targets; newer releases expose it as .dt.isocalendar().week.)
    df['weekofyear'] = purchase_dt.weekofyear

    # Pair the ISO week with the ISO year rather than the calendar year:
    # early-January days can belong to week 52/53 of the previous year and
    # late-December days to week 1 of the next one. With the calendar year,
    # e.g. 2017-01-01 (ISO 2016-W52) would be labelled '2017-52'.
    belongs_to_prev_year = (purchase_dt.month == 1) & (df['weekofyear'] >= 52)
    belongs_to_next_year = (purchase_dt.month == 12) & (df['weekofyear'] == 1)
    iso_year = (
        purchase_dt.year
        - belongs_to_prev_year.astype('int64')
        + belongs_to_next_year.astype('int64')
    )
    df['yyyy_week'] = iso_year.astype(str) + '-' + df['weekofyear'].astype(str).str.zfill(2)

# Combined view of both tables, taken before the frames are downcast below.
df_trans = pd.concat([df_hist, df_new], axis=0)
df_hist  = reduce_mem_usage(df_hist )
df_new   = reduce_mem_usage(df_new  )

Mem. usage decreased to 2554.26 Mb (25.2% reduction)
Mem. usage decreased to 179.72 Mb (22.0% reduction)


In [35]:
### Cap purchase_amount at 2 (original note: values of 3 or more are roughly 1% of hist).
# The part above the cap is kept in `purchase_amount_over`:
#   - the original amount where 2 < amount < 300,
#   - 0 everywhere else (not above the cap, 300 or more, or missing).
# The same rule is applied to both transaction tables. Results are assigned
# back to the frame instead of calling fillna(inplace=True) on a column
# selection, which is not guaranteed to write through to the parent frame.
# NOTE: not idempotent -- once purchase_amount is capped, re-running this cell
# recomputes purchase_amount_over as all zeros.
for trans in (df_hist, df_new):
    amount = trans['purchase_amount']
    over = amount.where(amount > 2)
    over = over.where(over < 300)
    trans['purchase_amount_over'] = over.fillna(0)
    trans['purchase_amount'] = amount.where(amount < 2, 2)

### In hist, installments values of -1 and 999 are treated as missing (NaN).
installments = df_hist['installments']
df_hist['installments'] = installments.where((installments >= 0) & (installments < 100))
df_hist['installments'].value_counts()

### Original note: regression-based imputation looks like a good fit for installments.

In [121]:
# Fill the missing df_hist installments with values predicted by a stacked model.
# Load the saved stacking result that carries a 'prediction' column.
tmp = utils.read_pkl_gzip('../stack/0114_105_stack_lgb_lr0.1_17feats_1seed_31leaves_iter10000_OUT0_CV0-0005918784915601258_LB.gz')
# Keep only the rows whose installments value is missing, i.e. the rows that were predicted.
tmp2 = tmp[tmp['installments'].isnull()]
# Fetch the original prediction test set so that its (card_id, purchase_date) pairs can serve as the index.
test_set = utils.read_df_pkl('../input/pred_installments_test_set0*')
# Predictions are attached positionally via .values, bypassing index alignment.
# assumes tmp2 and test_set have the same length and row order -- TODO confirm
# NOTE(review): if 'prediction' is a float, astype('int8') drops the fractional part rather than rounding -- confirm intended.
test_set['installments'] = tmp2['prediction'].astype('int8').values
test_set['purchase_date'] = pd.to_datetime( test_set['purchase_date'])
test_set = test_set[[key, 'purchase_date', 'installments']]
test_set.set_index([key, 'purchase_date'], inplace=True)

# Re-index df_hist by the same (card_id, purchase_date) pair so the assignment below can align on labels.
# NOTE(review): reset_index() without drop=True also moves the current index into a column -- confirm that extra column is wanted.
df_hist.reset_index(inplace=True)
df_hist['purchase_date'] = pd.to_datetime( df_hist['purchase_date'])
df_hist.set_index([key, 'purchase_date'], inplace=True)
# Only rows with missing installments are overwritten; values come from test_set matched by index label.
# NOTE(review): presumably (card_id, purchase_date) is unique in both frames; duplicated pairs would make this alignment ambiguous -- verify.
df_hist.loc[df_hist['installments'].isnull(), 'installments'] = test_set['installments']
# Restore card_id and purchase_date as ordinary columns.
df_hist.reset_index(inplace=True)


  0%|          | 0/3 [00:00<?, ?it/s][A
100%|██████████| 3/3 [00:00<00:00, 41.37it/s][A

Unnamed: 0_level_0,Unnamed: 1_level_0,installments
card_id,purchase_date,Unnamed: 2_level_1
C_ID_fc8e41b9cf,2018-01-23 12:05:13,0
C_ID_fc8e41b9cf,2018-02-21 07:24:59,0
C_ID_fc8e41b9cf,2017-11-13 00:00:00,0
C_ID_fc8e41b9cf,2018-01-09 14:15:12,0
C_ID_fc8e41b9cf,2018-01-23 14:04:39,0


In [5]:
# Persist the cleaned transaction tables under ../input/.
# Note: reduce_mem_usage is not re-applied to the frames before saving.
for cleaned, out_name in (
    (df_hist, 'hist_exist_null_clean_rdm'),
    (df_new, 'new_exist_null_clean_rdm'),
):
    utils.to_df_pkl(df=cleaned, path='../input/', fname=out_name)

0it [00:00, ?it/s]

shape: (29112361, 20)


3it [00:31, 10.70s/it]
0it [00:00, ?it/s]

shape: (1963031, 20)


3it [00:02,  1.45it/s]


In [133]:
# Build a frame that holds every month_lag value for every card.
col_month = 'month_lag'

# Start from the distinct (card, month_lag) pairs that actually occur ...
observed_pairs = df_trans[[key, col_month]].drop_duplicates().assign(value=1)
# ... pivot to a card x month_lag grid, fill the unobserved cells, and stack
# back to long form so that each card carries every month_lag column value.
month_grid = observed_pairs.pivot_table(index=key, columns=col_month, values='value')
df_month = month_grid.fillna(1).stack().reset_index().drop(columns=[0])

# Rebuild the combined transaction table from the current hist / new frames and
# hang the transactions onto the grid; grid rows without a match keep NaN values.
df_trans = pd.concat([df_hist, df_new], axis=0)
df_trans_all_m = pd.merge(df_month, df_trans, how='left', on=[key, col_month])

In [134]:
# Slice the month-completed transactions into consecutive 3-month windows of month_lag.
month_lag = df_trans_all_m[col_month]

# month_lag 0, 1, 2
df_last1 = df_trans_all_m[month_lag >= 0]
# month_lag -3, -2, -1
df_last2 = df_trans_all_m[month_lag.between(-3, -1)]
# month_lag -6, -5, -4
df_last3 = df_trans_all_m[month_lag.between(-6, -4)]
# month_lag -9, -8, -7
df_last4 = df_trans_all_m[month_lag.between(-9, -7)]

In [135]:
# Write each month_lag window to ../input/ under its own file name.
for window, out_name in (
    (df_last1, 'trans_last1'),
    (df_last2, 'trans_last2'),
    (df_last3, 'trans_last3'),
    (df_last4, 'trans_last4'),
):
    utils.to_df_pkl(df=window, path='../input/', fname=out_name)


0it [00:00, ?it/s][A

shape: (5637062, 21)



1it [00:04,  4.34s/it][A
2it [00:08,  4.34s/it][A
3it [00:13,  4.34s/it][A
[A
0it [00:00, ?it/s][A

shape: (10703337, 21)



1it [00:09,  9.33s/it][A
2it [00:18,  9.32s/it][A
3it [00:27,  9.30s/it][A
[A
0it [00:00, ?it/s][A

shape: (7397061, 21)



1it [00:05,  5.85s/it][A
2it [00:11,  5.85s/it][A
3it [00:17,  5.84s/it][A
[A
0it [00:00, ?it/s][A

shape: (4981096, 21)



1it [00:03,  3.41s/it][A
2it [00:06,  3.40s/it][A
3it [00:10,  3.40s/it][A
[A