In [1]:
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
# Use the fully-qualified option names. The bare 'max_columns' / 'max_rows'
# patterns are resolved by regex and, on newer pandas, also match the
# 'styler.render.*' options, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
import os
import sys
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import get_categorical_features, get_numeric_features
import eda

from tqdm import tqdm

# NOTE(review): this listing is not the cell's last expression, so its value is
# computed and discarded; it has no visible effect here.
os.listdir('../input/')
# Column identifying a card; used as the groupby / join key in every later cell.
key = 'card_id'
# Column merged in from `train`; rows where it is null are treated as test rows.
target = 'target'
# Passed to target_encoding() as its `ignore_list` argument.
ignore_list = [key, target, 'merchant_id']

# Prefix embedded in the file names of the features written under ../features/.
feat_no = '108_his_'

In [2]:
import dask.dataframe as dd
# Read the raw competition CSVs with dask.
# NOTE(review): `train`, `test` and `history` are re-assigned from pickles by the
# next code cell, so these dask frames look unused — confirm and drop this cell.
train = dd.read_csv('../input/train.csv')
test = dd.read_csv('../input/test.csv')
history = dd.read_csv('../input/historical_transactions.csv')

### Data Load

In [None]:
def elo_load_data(filename=''):
    """Load pickled input data from ``../input/`` via ``utils.read_df_pkl``.

    Parameters
    ----------
    filename : str, optional
        File-name prefix. When non-empty, the pattern ``../input/{filename}*.p``
        is handed to ``utils.read_df_pkl`` and its result is returned as-is.
        When empty, every path matching ``../input/*.p`` is scanned instead.

    Returns
    -------
    object or dict
        With a prefix: whatever ``utils.read_df_pkl`` returns (presumably a
        DataFrame — verify against ``utils``).
        Without a prefix: a dict mapping each data-set name extracted from the
        paths to its loaded object. The original loop threw every loaded frame
        away and returned ``None``.
    """
    # Local import: `glob` is used here but not imported at the top of the notebook.
    import glob

    if len(filename):
        return utils.read_df_pkl(path=f'../input/{filename}*.p')

    loaded = {}
    for path in glob.glob("../input/*.p"):
        match = re.search(r'/([^/.]*).gz', path)
        if match is None:
            # Path does not follow the expected "<name>.gz" naming; skip it
            # instead of crashing on `.group` of None.
            continue
        name = match.group(1)
        if name not in loaded:
            # Several paths can share one name; load each data set only once.
            loaded[name] = utils.read_df_pkl(path=f'../input/{name}*.p')
    return loaded
# Load the train / test tables from their pickles (pattern '../input/train*.p' etc.).
train = elo_load_data('train')
test = elo_load_data('test')
# card_id values of each table, in table order; used later to order saved features.
train_id = train[key].values
test_id = test[key].values
# Train rows followed by test rows, re-indexed 0..n-1.
train_test = pd.concat([train, test], axis=0).reset_index(drop=True)
# Loaded through the '../input/historical*.p' pattern.
history = elo_load_data('historical')

100%|██████████| 3/3 [00:11<00:00,  3.56s/it]


#### New Transactionsの最小粒度は下記

In [118]:
# NOTE(review): `new` is not defined in any visible cell (hidden kernel state);
# it must be loaded before this cell can run on a fresh kernel.
print(new.shape)
# Candidate column set for the finest granularity of a transaction row.
cols = [key, 'purchase_date', 'purchase_amount', 'merchant_id', 'merchant_category_id', 'installments']
# If the de-duplicated row count equals the total row count printed above,
# these columns uniquely identify a row.
print(new[cols].drop_duplicates().shape)

(1963031, 14)
(1963031, 6)


SystemExit: 

### New Transactionsは一度の購買で数商品買っていると同じ時間で複数レコードが入る。そこの集計方法によりfeatureのあり方が変わる

### 直近購買日とのdiffと、それらの比率を特徴にする.

In [None]:
from joblib import Parallel, delayed
from dateutil.parser import parse

# ========================================================================
# Time Aggregate Function
# ========================================================================
# def diff_of_days(day1, day2):
def diff_of_days(args):
    """Return the whole-day difference between two date strings for one card.

    Parameters
    ----------
    args : sequence
        ``(uid, day1, day2)``: the card id followed by two date strings that
        ``dateutil.parser.parse`` can read.

    Returns
    -------
    list
        ``[uid, days, day1]`` with ``days = (parse(day1) - parse(day2)).days``.
        When either date is an integer instead of a date string, ``days`` is NaN.
        The result always has three fields; the original returned only two in
        that case, which breaks the three-column DataFrame built from the results.
    """
    uid, day1, day2 = args[0], args[1], args[2]
    # Explicit type check instead of searching for 'int' in the type's repr.
    if isinstance(day1, (int, np.integer)) or isinstance(day2, (int, np.integer)):
        return [uid, float('nan'), day1]
    days = (parse(day1) - parse(day2)).days
    return [uid, days, day1]

# 前回購買日との差分
def diff_shift_date(df, lag):
    """Days between each purchase timestamp and the one `lag` purchases earlier.

    Works on the distinct ``(key, 'purchase_date')`` pairs of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``key`` column and ``'purchase_date'``.
    lag : int
        Number of distinct purchase timestamps to look back within a card.
        A positive shift moves values downwards, i.e. to later rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``[key, 'diff_days_lag{lag}@', 'purchase_date']``, one row per
        distinct pair that has a timestamp `lag` steps earlier. Pairs without
        one are omitted, so a left merge leaves them NaN.
    """
    col_name = f'diff_days_lag{lag}@'

    # Sort within each card before shifting: without it the "previous" purchase
    # is simply the neighbouring row in file order, not the earlier one in time.
    dates = (
        df[[key, 'purchase_date']]
        .drop_duplicates()
        .sort_values([key, 'purchase_date'])
    )
    dates['shift1_date'] = dates.groupby(key)['purchase_date'].shift(lag)
    dates = dates[[key, 'purchase_date', 'shift1_date']].dropna()

    # Vectorized datetime arithmetic replaces the per-row joblib + dateutil
    # parsing; `.dt.days` floors to whole days exactly like `timedelta.days`.
    current = pd.to_datetime(dates['purchase_date'])
    previous = pd.to_datetime(dates['shift1_date'])
    dates[col_name] = (current - previous).dt.days

    return dates[[key, col_name, 'purchase_date']].reset_index(drop=True)
    
# Attach the lag-1..3 day differences to every transaction row.
for i in tqdm(range(1, 4, 1)):
    history = history.merge(
        diff_shift_date(history, i),
        how='left',
        on=[key, 'purchase_date'],
    )

# Cast every '@'-marked feature column to float.
marked_cols = [name for name in history.columns if name.count('@')]
for col in marked_cols:
    history[col] = history[col].astype('float')

#========================================================================
# Ratios between the lag differences
#========================================================================
ratio_specs = {
    'ratio_days_lag1_2@': ('diff_days_lag1@', 'diff_days_lag2@'),
    'ratio_days_lag1_3@': ('diff_days_lag1@', 'diff_days_lag3@'),
    'ratio_days_lag2_3@': ('diff_days_lag2@', 'diff_days_lag3@'),
}
for ratio_col, (numer_col, denom_col) in ratio_specs.items():
    history[ratio_col] = history[numer_col] / history[denom_col]

  0%|          | 0/3 [00:00<?, ?it/s]

purchase_dateには重複がある。同じ日時で複数の購買がある場合の集計を考える  
日付ベースで集計をしたとしても、その後にそれもまとめる必要がある。  
make 2pattern feature  
1. date level aggregation -> id level aggregation (日時単位で集計した後に、id単位で集計する)  
2. id level aggregation のみ -> 単純集計は101で実施済

### 1. date level aggregation  
同じ時間帯に購買のあるidは約1,800で全体の1%.なのでそこまで作りこむ必要はなさそう

In [None]:
key_date = [key, 'purchase_date']
# Number of transaction rows per (card, purchase timestamp).
cnt = history.groupby(key_date).size()


def _card_stats_of_datetime_mean(df, value_col):
    """Average `value_col` per (card, timestamp), then summarise per card.

    Returns a frame indexed by `key` with the columns
    ``{value_col}_{max,min,mean,std,skew}@`` and ``{value_col}_max-min@``.
    """
    per_datetime = df.groupby(key_date)[value_col].mean().reset_index()
    stat_names = ['max', 'min', 'mean', 'std', 'skew']
    # A list of function names plus an explicit rename works on every pandas
    # version; the dict-renaming form `.agg({'new_name': 'func'})` on a
    # SeriesGroupBy raises SpecificationError since pandas 1.0.
    stats = per_datetime.groupby(key)[value_col].agg(stat_names)
    stats.columns = [f'{value_col}_{stat}@' for stat in stat_names]
    stats[f'{value_col}_max-min@'] = stats[f'{value_col}_max@'] - stats[f'{value_col}_min@']
    return stats


# Aggregation on top of the date-level aggregation.
ins_mean = _card_stats_of_datetime_mean(history, 'installments')
amt_mean = _card_stats_of_datetime_mean(history, 'purchase_amount')

print(ins_mean.shape)
print(amt_mean.shape)

### date_agg->key_agg featureのJOIN&Save

In [57]:
# Base table indexed by card_id, with both aggregate tables attached.
base = (
    utils.read_df_pkl('../input/base0*')
    .set_index(key)
    .join(ins_mean, how='left')
    .join(amt_mean, how='left')
)
print(base.shape)

train_id = train[key].values
test_id = test[key].values

# Write each '@' feature twice: once ordered by train ids, once by test ids.
feature_cols = [name for name in base.columns if name.count('@')]
for col in feature_cols:
    train_values = base.loc[train_id, col].values
    utils.to_pkl_gzip(obj=train_values, path=f'../features/1_first_valid/{feat_no}train_{col}')
    test_values = base.loc[test_id, col].values
    utils.to_pkl_gzip(obj=test_values, path=f'../features/1_first_valid/{feat_no}test_{col}')

100%|██████████| 3/3 [00:00<00:00, 85.74it/s]


(325540, 13)


### dateによる時系列考慮のfeature

まずは土台を作る. row_noをつける

In [7]:
# NOTE(review): autoreload is already loaded in the first cell (see the message
# printed below this cell); these two magics and the re-import are redundant.
%load_ext autoreload
%autoreload 2
import utils

# Card ids split into 10 chunks for the parallel groupby that is commented out below.
# NOTE(review): `new` is not defined in any visible cell (hidden kernel state).
# `idx`, `idx_list`, `level`, `feature` and `method` are only referenced by the
# commented-out code.
idx = new[key].values
idx_list = np.array_split(idx, 10)

# Arguments of the commented-out groupby; `feature` is immediately overwritten with ''.
level = key_date
feature = 'installments'
feature = ''
method = 'max'
df_latest = new.copy()

# Disabled experiment: groupby run in parallel over the id chunks.
# def parallel_groupby(args):
#     if len(feature):
#         tmp = df.set_index(level).loc[args, :].reset_index().groupby(level)[feature].agg({f'{feature}_{method}@':f'{method}'})
#     else:
#         tmp = df.set_index(level).loc[args, :].reset_index().groupby(level).agg({f'{method}@':f'{method}'})
#     return tmp
# p_list = Parallel(n_jobs=-1)([delayed(parallel_groupby)(args) for args in idx_list])
# new2 = pd.concat(p_list, axis=0).reset_index()
# Sort descending so that, within a card, the most recent purchase comes first.
df_latest.sort_values(by=[key, 'purchase_date', 'merchant_id'], ascending=False, inplace=True)
# Adds the `row_no` column seen in the output below (1, 2, 3, ... within each card).
df_latest = utils.row_number(df=df_latest, level=key)
df_latest.head()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,diff_days_lag1@,diff_days_lag2@,diff_days_lag3@,ratio_days_lag1_2@,ratio_days_lag1_3@,ratio_days_lag2_3@,row_no
0,Y,C_ID_fffffd5772,20,N,1.0,B,367,M_ID_1208576963,1,-0.665765,2018-03-27 13:45:10,3.0,19,16,24.0,24.0,,1.0,,,1
1,Y,C_ID_fffffd5772,20,N,1.0,B,360,M_ID_c613085a2f,1,-0.72572,2018-03-02 18:14:05,3.0,19,34,0.0,,,,,,2
2,Y,C_ID_fffffd5772,20,N,1.0,B,705,M_ID_bbc64a7bd8,1,-0.733985,2018-03-02 16:23:47,3.0,19,33,,,,,,,3
3,Y,C_ID_ffff828181,126,N,3.0,C,884,M_ID_86c11f8b9c,2,-0.468166,2018-04-29 18:59:29,4.0,4,27,0.0,1.0,10.0,0.0,0.0,0.1,1
4,Y,C_ID_ffff828181,-1,N,,,705,M_ID_f0fab17fd6,2,-0.591383,2018-04-29 16:19:25,,-1,33,1.0,10.0,24.0,0.1,0.041667,0.416667,2


### Weighted Mean

In [8]:
base = utils.read_df_pkl('../input/base0*').set_index(key)
# Per-day decay factor: a purchase d days before the card's latest one gets weight**d.
weight = 0.95

# Features to average with recency weights.
feat_list = [col for col in df_latest.columns if col.count('@')]
num_cols = ['installments', 'purchase_amount'] + feat_list

max_date = df_latest.groupby(key)['purchase_date'].max().reset_index().rename(columns={'purchase_date':'latest_date'})
df_w = df_latest.merge(max_date, how='inner', on=key)

# Whole days between each purchase and the card's latest purchase. Vectorized
# datetime arithmetic replaces the per-row dateutil parsing.
df_w['diff_days'] = (pd.to_datetime(df_w['latest_date']) - pd.to_datetime(df_w['purchase_date'])).dt.days

# The weight depends only on diff_days, so compute it once outside the loop.
df_w['W'] = np.power(weight, df_w['diff_days'])

for col in num_cols:
    col_name = f'W{weight}_{col}@'
    df_w[col_name] = df_w[col] * df_w['W']
    # Only rows where the feature is present may contribute to the denominator.
    # Summing all weights (as before) pulls the weighted mean toward zero
    # whenever the feature has NaNs. A card with no valid value now yields NaN.
    valid_w = df_w['W'].where(df_w[col].notnull())
    tmp = df_w.groupby(key)[col_name].sum() / valid_w.groupby(df_w[key]).sum()
    tmp.name = col_name
    base = base.join(tmp, how='left')

train_id = train[key].values
test_id = test[key].values
for col in base.columns:
    if col.count('@'):
        utils.to_pkl_gzip(obj = base.loc[train_id, col].values, path=f'../features/1_first_valid/{feat_no}train_{col}')
        utils.to_pkl_gzip(obj = base.loc[test_id, col].values, path=f'../features/1_first_valid/{feat_no}test_{col}')
del df_w
gc.collect()

100%|██████████| 3/3 [00:00<00:00, 91.85it/s]


363

### New_Transaction内でTarget Encoding

In [9]:
%load_ext autoreload
%autoreload 2
from feature_engineering import target_encoding
new_target = df_latest.merge(train.reset_index()[[key, target]], how='left', on=key)
del df_latest
gc.collect()
try:
    if logger:
        pass
except NameError:
    logger=utils.logger_func()
    
# Target Encoding
tmp_train = new_target[~new_target[target].isnull()]
tmp_test = new_target[new_target[target].isnull()]
cat_col_list = ['city_id', 'category_1', 'category_2', 'category_3', 'merchant_category_id', 'state_id', 'subsector_id']
for cat_col in cat_col_list:
    train_te, test_te = target_encoding(logger=logger, train=tmp_train, test=tmp_test, key=key, target=target, level=cat_col, fold_type='group', group_col_name=key, ignore_list=ignore_list, return_df=False)
    new_target[f"TE1208_{cat_col}@"] = np.hstack((train_te, test_te))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2018-12-02 09:33:10,644 utils 366 [INFO]    [logger_func] start 
2018-12-02 09:33:11,527 utils 278 [INFO]    [target_encoding] Base Train Shape: (1219685, 2) 
2018-12-02 09:33:14,445 utils 305 [INFO]    [target_encoding] 
# COMPLETE TARGET ENCODING!!
# FEATURE     : TE_target@['city_id']
# BEFORE LEN  : Train1219685 / Test743346
# AFTER LEN   : Train1219685 / Test743346
2018-12-02 09:33:14,598 utils 278 [INFO]    [target_encoding] Base Train Shape: (1219685, 2) 
2018-12-02 09:33:17,613 utils 305 [INFO]    [target_encoding] 
# COMPLETE TARGET ENCODING!!
# FEATURE     : TE_target@['category_1']
# BEFORE LEN  : Train1219685 / Test743346
# AFTER LEN   : Train1219685 / Test743346
2018-12-02 09:33:17,773 utils 278 [INFO]    [target_encoding] Base Train Shape: (1219685, 2) 
2018-12-02 09:33:21,044 utils 305 [INFO]    [target_encoding] 
# COMPLETE TARGET ENCODING!!
# FEATURE     : TE_target@['category_2']
# BEFORE LEN  : Train1219685 / Test743346
# AFTER LEN   : Train1219685 / Test743346
2018-

In [10]:
# Preview the transaction rows with `target` and the TE1208_* columns attached.
new_target.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,diff_days_lag1@,diff_days_lag2@,diff_days_lag3@,ratio_days_lag1_2@,ratio_days_lag1_3@,ratio_days_lag2_3@,row_no,target,TE1208_city_id@,TE1208_category_1@,TE1208_category_2@,TE1208_category_3@,TE1208_merchant_category_id@,TE1208_state_id@,TE1208_subsector_id@
0,Y,C_ID_fffffd5772,20,N,1.0,B,367,M_ID_1208576963,1,-0.665765,2018-03-27 13:45:10,3.0,19,16,24.0,24.0,,1.0,,,1,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.500642,-0.553285,-0.504297
1,Y,C_ID_fffffd5772,20,N,1.0,B,360,M_ID_c613085a2f,1,-0.72572,2018-03-02 18:14:05,3.0,19,34,0.0,,,,,,2,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.570714,-0.553285,-0.529351
2,Y,C_ID_fffffd5772,20,N,1.0,B,705,M_ID_bbc64a7bd8,1,-0.733985,2018-03-02 16:23:47,3.0,19,33,,,,,,,3,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.574634,-0.553285,-0.579019
3,Y,C_ID_ffff828181,126,N,3.0,C,884,M_ID_86c11f8b9c,2,-0.468166,2018-04-29 18:59:29,4.0,4,27,0.0,1.0,10.0,0.0,0.0,0.1,1,-0.600063,-0.603525,-0.581304,-0.619456,-0.765396,-0.639078,-0.686108,-0.682186
4,Y,C_ID_ffff828181,-1,N,,,705,M_ID_f0fab17fd6,2,-0.591383,2018-04-29 16:19:25,,-1,33,1.0,10.0,24.0,0.1,0.041667,0.416667,2,-0.600063,-0.786583,-0.581304,-0.760234,-0.605882,-0.557002,-0.760234,-0.560775


### Target EncodingのWeighted Mean

In [11]:
base = utils.read_df_pkl('../input/base0*').set_index(key)
df = new_target
# Per-day decay factor: a purchase d days before the card's latest one gets weight**d.
weight = 0.95

# Features to average with recency weights (now including the TE1208_* columns).
feat_list = [col for col in df.columns if col.count('@')]
num_cols = ['installments', 'purchase_amount'] + feat_list

max_date = df.groupby(key)['purchase_date'].max().reset_index().rename(columns={'purchase_date':'latest_date'})
df_w = df.merge(max_date, how='inner', on=key)

# Whole days between each purchase and the card's latest purchase. Vectorized
# datetime arithmetic replaces the per-row dateutil parsing.
df_w['diff_days'] = (pd.to_datetime(df_w['latest_date']) - pd.to_datetime(df_w['purchase_date'])).dt.days

# The weight depends only on diff_days, so compute it once outside the loop.
df_w['W'] = np.power(weight, df_w['diff_days'])

for col in num_cols:
    col_name = f'W{weight}_{col}@'
    df_w[col_name] = df_w[col] * df_w['W']
    # Only rows where the feature is present may contribute to the denominator.
    # Summing all weights (as before) pulls the weighted mean toward zero
    # whenever the feature has NaNs. A card with no valid value now yields NaN.
    valid_w = df_w['W'].where(df_w[col].notnull())
    tmp = df_w.groupby(key)[col_name].sum() / valid_w.groupby(df_w[key]).sum()
    tmp.name = col_name
    base = base.join(tmp, how='left')

train_id = train[key].values
test_id = test[key].values
for col in base.columns:
    if col.count('@'):
        utils.to_pkl_gzip(obj = base.loc[train_id, col].values, path=f'../features/1_first_valid/{feat_no}train_{col}')
        utils.to_pkl_gzip(obj = base.loc[test_id, col].values, path=f'../features/1_first_valid/{feat_no}test_{col}')

100%|██████████| 3/3 [00:00<00:00, 97.42it/s]


### Latest Feature
row_no==1のfeature

In [64]:
# Overrides the '108_his_' prefix set in the first cell for the features saved below.
# NOTE(review): this cell's execution count (64) is higher than that of the cells
# that follow it, so it was run out of order — on a fresh top-to-bottom run the
# prefix switches at this point.
feat_no = '103_new_'

In [24]:
num_list = ['purchase_amount', 'installments']
prefix = 'latest1_'

# Rows numbered 1 within each card, joined onto the base table by card id.
tmp = new_target.query("row_no==1")
latest_rows = tmp.set_index(key).drop(target, axis=1)
base = utils.read_df_pkl('../input/base0*').set_index(key)
base = base.join(latest_rows, how='left')

train_id = train[key].values
test_id = test[key].values

# Save every '@' feature plus the raw numeric columns.
save_cols = [name for name in base.columns if name.count('@') or name in num_list]
for col in save_cols:
    train_values = base.loc[train_id, col].values
    utils.to_pkl_gzip(obj=train_values, path=f'../features/1_first_valid/{feat_no}train_{prefix}{col}')
    test_values = base.loc[test_id, col].values
    utils.to_pkl_gzip(obj=test_values, path=f'../features/1_first_valid/{feat_no}test_{prefix}{col}')

100%|██████████| 3/3 [00:00<00:00, 86.77it/s]


row_no<=3のfeature

In [58]:
# Rows numbered 1..3 within each card.
tmp = new_target.query("row_no<=3")
num_list = ['purchase_amount', 'installments']
feat_cols = [name for name in tmp.columns if name.count('@')] + num_list

# One groupby object, four per-card statistics.
per_card = tmp[[key] + feat_cols].groupby(key)
agg_mean = per_card.mean()
agg_std = per_card.std()
agg_max = per_card.max()
agg_min = per_card.min()
suffixed = (
    (agg_mean, '_mean@'),
    (agg_std, '_std@'),
    (agg_max, '_max@'),
    (agg_min, '_min@'),
)
for frame, suffix in suffixed:
    frame.columns = [f"{name}{suffix}" for name in frame.columns]

base = utils.read_df_pkl('../input/base0*').set_index(key)
for tmp in (agg_mean, agg_std, agg_max, agg_min):
    base = base.join(tmp, how='left')
# Range (max - min) of every aggregated feature.
for col in feat_cols:
    base[f"{col}_max-min@"] = base[f"{col}_max@"] - base[f"{col}_min@"]
prefix = 'latest3_'

train_id = train[key].values
test_id = test[key].values
save_cols = [name for name in base.columns if name.count('@')]
for col in save_cols:
    train_values = base.loc[train_id, col].values
    utils.to_pkl_gzip(obj=train_values, path=f'../features/1_first_valid/{feat_no}train_{prefix}{col}')
    test_values = base.loc[test_id, col].values
    utils.to_pkl_gzip(obj=test_values, path=f'../features/1_first_valid/{feat_no}test_{prefix}{col}')

100%|██████████| 3/3 [00:00<00:00, 81.48it/s]


### row_noを逆転させ、初回の特徴、初回と2回目のdiffを特徴にする

In [19]:
# Re-number rows in ascending date order so that row_no 1 is the first purchase.
df_first = (
    new_target
    .drop('row_no', axis=1)
    .sort_values(by=[key, 'purchase_date', 'merchant_id'], ascending=True)
)
df_first = utils.row_number(df=df_first, level=key)
df_first.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,diff_days_lag1@,diff_days_lag2@,diff_days_lag3@,ratio_days_lag1_2@,ratio_days_lag1_3@,ratio_days_lag2_3@,target,TE1208_city_id@,TE1208_category_1@,TE1208_category_2@,TE1208_category_3@,TE1208_merchant_category_id@,TE1208_state_id@,TE1208_subsector_id@,row_no
0,Y,C_ID_00007093c1,76,N,1.0,B,222,M_ID_08f01305af,2,-0.671775,2018-04-03 11:13:35,3,2,21,,,,,,,0.134077,-0.544011,-0.583303,-0.591474,-0.527694,-0.574022,-0.581858,-0.575213,1
1,Y,C_ID_00007093c1,69,N,1.0,B,879,M_ID_00a6ca8a8a,2,-0.656749,2018-04-09 16:23:59,1,9,29,6.0,,,,,,0.134077,-0.544011,-0.583303,-0.591474,-0.527694,-0.502899,-0.581858,-0.505548,2
2,Y,C_ID_0001238066,314,N,1.0,B,307,M_ID_7d8102bb34,1,-0.732783,2018-03-01 16:48:27,1,9,19,,,,,,,,-0.609823,-0.583303,-0.591474,-0.527694,-0.575209,-0.581858,-0.575213,1
3,Y,C_ID_0001238066,314,N,1.0,B,367,M_ID_235e546dcc,1,-0.672136,2018-03-03 22:44:57,1,9,16,2.0,,,,,,,-0.582444,-0.583303,-0.591474,-0.765577,-0.620861,-0.581858,-0.631858,2
4,Y,C_ID_0001238066,333,N,1.0,B,783,M_ID_a88790a464,1,-0.641722,2018-03-04 13:05:16,1,9,19,0.0,2.0,,0.0,,,,-0.791251,-0.929266,-0.761306,-0.527694,-0.748654,-0.761306,-0.631858,3


### First Transaction
row_no==1(reverse)

In [26]:
prefix = 'first1_'

# First row of each card (ascending numbering), joined onto the base table.
tmp = df_first.query("row_no==1")
first_rows = tmp.set_index(key).drop(target, axis=1)
base = utils.read_df_pkl('../input/base0*').set_index(key)
base = base.join(first_rows, how='left')

train_id = train[key].values
test_id = test[key].values
save_cols = [name for name in base.columns if name.count('@')]
for col in save_cols:
    train_values = base.loc[train_id, col].values
    utils.to_pkl_gzip(obj=train_values, path=f'../features/1_first_valid/{feat_no}train_{prefix}{col}')
    test_values = base.loc[test_id, col].values
    utils.to_pkl_gzip(obj=test_values, path=f'../features/1_first_valid/{feat_no}test_{prefix}{col}')

100%|██████████| 3/3 [00:00<00:00, 83.63it/s]


#### 初回と2回目、3回目の比率、差分

In [27]:
num_list = [key, 'purchase_amount', 'installments']

# Amount / installments of the 1st, 2nd and 3rd row of each card, side by side.
ordinal_frames = []
for n in (1, 2, 3):
    nth_rows = df_first.query(f"row_no=={n}")[num_list].set_index(key)
    renamed = nth_rows.rename(columns={'purchase_amount': f'amt_{n}', 'installments': f'ins_{n}'})
    ordinal_frames.append(renamed)
row1, row2, row3 = ordinal_frames
row123 = row1.join(row2).join(row3)

# Ratio and difference of the 1st row against the 2nd and the 3rd.
for long_name, short_name in (('purchase_amount', 'amt'), ('installments', 'ins')):
    for n in (2, 3):
        row123[f'{long_name}_1_div_{n}@'] = row123[f'{short_name}_1'] / row123[f'{short_name}_{n}']
    for n in (2, 3):
        row123[f'{long_name}_1_diff_{n}@'] = row123[f'{short_name}_1'] - row123[f'{short_name}_{n}']

base = utils.read_df_pkl('../input/base0*').set_index(key)
base = base.join(row123, how='left')
prefix = 'first3_'

train_id = train[key].values
test_id = test[key].values
save_cols = [name for name in base.columns if name.count('@')]
for col in save_cols:
    train_values = base.loc[train_id, col].values
    utils.to_pkl_gzip(obj=train_values, path=f'../features/1_first_valid/{feat_no}train_{prefix}{col}')
    test_values = base.loc[test_id, col].values
    utils.to_pkl_gzip(obj=test_values, path=f'../features/1_first_valid/{feat_no}test_{prefix}{col}')

100%|██████████| 3/3 [00:00<00:00, 93.06it/s]


### Nest Model

In [31]:
# Preview of the transaction-level frame used as input for the steps below.
new_target.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,diff_days_lag1@,diff_days_lag2@,diff_days_lag3@,ratio_days_lag1_2@,ratio_days_lag1_3@,ratio_days_lag2_3@,row_no,target,TE1208_city_id@,TE1208_category_1@,TE1208_category_2@,TE1208_category_3@,TE1208_merchant_category_id@,TE1208_state_id@,TE1208_subsector_id@
0,Y,C_ID_fffffd5772,20,N,1.0,B,367,M_ID_1208576963,1,-0.665765,2018-03-27 13:45:10,3.0,19,16,24.0,24.0,,1.0,,,1,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.500642,-0.553285,-0.504297
1,Y,C_ID_fffffd5772,20,N,1.0,B,360,M_ID_c613085a2f,1,-0.72572,2018-03-02 18:14:05,3.0,19,34,0.0,,,,,,2,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.570714,-0.553285,-0.529351
2,Y,C_ID_fffffd5772,20,N,1.0,B,705,M_ID_bbc64a7bd8,1,-0.733985,2018-03-02 16:23:47,3.0,19,33,,,,,,,3,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.574634,-0.553285,-0.579019
3,Y,C_ID_ffff828181,126,N,3.0,C,884,M_ID_86c11f8b9c,2,-0.468166,2018-04-29 18:59:29,4.0,4,27,0.0,1.0,10.0,0.0,0.0,0.1,1,-0.600063,-0.603525,-0.581304,-0.619456,-0.765396,-0.639078,-0.686108,-0.682186
4,Y,C_ID_ffff828181,-1,N,,,705,M_ID_f0fab17fd6,2,-0.591383,2018-04-29 16:19:25,,-1,33,1.0,10.0,24.0,0.1,0.041667,0.416667,2,-0.600063,-0.786583,-0.581304,-0.760234,-0.605882,-0.557002,-0.760234,-0.560775


#### TEを含んだデータセット

In [137]:
# Transaction rows with a target (train cards) and without one (test cards).
# NOTE(review): this rebinds `train` / `test`, replacing the tables loaded by
# elo_load_data. Any earlier cell that reads `train[key].values` would, if
# re-run after this point, get one id per transaction row instead.
train = new_target[~new_target[target].isnull()]
test = new_target[new_target[target].isnull()]
display(train.head())

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,diff_days_lag1@,diff_days_lag2@,diff_days_lag3@,ratio_days_lag1_2@,ratio_days_lag1_3@,ratio_days_lag2_3@,row_no,target,TE1208_city_id@,TE1208_category_1@,TE1208_category_2@,TE1208_category_3@,TE1208_merchant_category_id@,TE1208_state_id@,TE1208_subsector_id@
0,Y,C_ID_fffffd5772,20,N,1.0,B,367,M_ID_1208576963,1,-0.665765,2018-03-27 13:45:10,3.0,19,16,24.0,24.0,,1.0,,,1,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.500642,-0.553285,-0.504297
1,Y,C_ID_fffffd5772,20,N,1.0,B,360,M_ID_c613085a2f,1,-0.72572,2018-03-02 18:14:05,3.0,19,34,0.0,,,,,,2,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.570714,-0.553285,-0.529351
2,Y,C_ID_fffffd5772,20,N,1.0,B,705,M_ID_bbc64a7bd8,1,-0.733985,2018-03-02 16:23:47,3.0,19,33,,,,,,,3,-1.073077,-0.562308,-0.58221,-0.532668,-0.528903,-0.574634,-0.553285,-0.579019
3,Y,C_ID_ffff828181,126,N,3.0,C,884,M_ID_86c11f8b9c,2,-0.468166,2018-04-29 18:59:29,4.0,4,27,0.0,1.0,10.0,0.0,0.0,0.1,1,-0.600063,-0.603525,-0.581304,-0.619456,-0.765396,-0.639078,-0.686108,-0.682186
4,Y,C_ID_ffff828181,-1,N,,,705,M_ID_f0fab17fd6,2,-0.591383,2018-04-29 16:19:25,,-1,33,1.0,10.0,24.0,0.1,0.041667,0.416667,2,-0.600063,-0.786583,-0.581304,-0.760234,-0.605882,-0.557002,-0.760234,-0.560775


#### TEを含まないデータセット

In [139]:
# Keep only the columns whose name does not contain 'TE'.
not_te_cols = [name for name in train.columns if 'TE' not in name]
train = train[not_te_cols]
test = test[not_te_cols]

In [148]:
feat_no = '106_new_'
prefix = ''

# Base table indexed by card_id with both stacked-aggregate tables attached.
base = utils.read_df_pkl('../input/base0*').set_index(key)
base = base.join(stack_agg, how='left')
base = base.join(stack_noTE_agg, how='left')

# Reload the ids from the pickles rather than from the `train` / `test` names.
train_id = elo_load_data('train')[key].values
test_id = elo_load_data('test')[key].values
save_cols = [name for name in base.columns if name.count('@')]
for col in save_cols:
    train_values = base.loc[train_id, col].values
    utils.to_pkl_gzip(obj=train_values, path=f'../features/1_first_valid/{feat_no}train_{prefix}{col}')
    test_values = base.loc[test_id, col].values
    utils.to_pkl_gzip(obj=test_values, path=f'../features/1_first_valid/{feat_no}test_{prefix}{col}')

100%|██████████| 3/3 [00:00<00:00, 79.32it/s]
100%|██████████| 3/3 [00:00<00:00, 87.25it/s]
100%|██████████| 3/3 [00:00<00:00, 130.68it/s]
