In [1]:
# Notebook-level switches. `[True, False][0]` is a toggle idiom: the trailing
# index picks the value (index 0 -> True).
is_stack = [True, False][0]
debug = False
# Load IPython's autoreload extension and use mode 2, so imported modules are
# re-imported automatically when their source files change.
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
import numpy as np
import os
import sys
import time
import datetime
import glob
sys.path.append('../py/')
from s027_kfold_ods import ods_kfold
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, reduce_mem_usage, elo_save_feature, impute_feature
# Reuse a logger that already lives in the notebook namespace (e.g. when this
# cell is re-run); build a new one only when it is missing or falsy.
logger = globals().get('logger') or logger_func()

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

#========================================================================
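# Linear model (sklearn Ridge)
# NOTE: this header originally read "Keras / Corporación Favorita Grocery Sales Forecasting"; only sklearn's Ridge is imported in this section.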
from sklearn.linear_model import Ridge
#========================================================================


#========================================================================
# Args
# Toggle idiom: the trailing index selects one of the listed options ('' here).
out_part = ['', 'part', 'all'][0]
# Column that identifies a card; used as join/index key throughout the notebook.
key = 'card_id'
# Name of the label column.
target = 'target'
# Column names collected for exclusion (identifiers, label, helper columns);
# not consumed in this cell.
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg']
stack_name='ridge'
# Sample submission file, loaded as-is.
submit = pd.read_csv('../input/sample_submission.csv')
model_type='ridge'
# Timestamp of this cell's execution, formatted as YYYYmmdd_HHMMSS.
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
#========================================================================

#========================================================================
# Data Load 
def get_stack_dataset(lgb_path='', is_clf_out=False, is_no_out_flg=False, is_rm_out=False, is_binary=False, is_nn=False, is_rmf=False, is_ext=False, is_rid=False, is_random=False, seed=328):
    """Assemble the stacking train/test frames from saved base-model predictions.

    The base frame is read from ``../input/base_no_out_clf.gz`` and indexed by
    ``key``. Every selected prediction file contributes one column named
    ``base_<family>_<cv tag>``, which is joined onto the base frame. Rows with a
    non-null ``target`` form ``train``; the remaining rows form ``test``.

    Exactly one group of prediction files is used, chosen in this priority
    order: ``is_clf_out``, ``is_no_out_flg``, ``is_rm_out``, ``is_binary``,
    otherwise the files matching ``lgb_path`` plus the optional extras.

    Parameters
    ----------
    lgb_path : str
        Glob pattern of the main prediction files. Only used when none of the
        four exclusive flags above is set.
    is_clf_out : bool
        Use ``../ensemble/clf_min_thres_ensemble/*.gz``. Train and test are
        also restricted to rows with ``clf_pred < 0.01`` (skipped when
        ``is_rm_out`` is set as well, because that flag is checked first when
        filtering rows).
    is_no_out_flg : bool
        Use ``../no_out_flg_ensemble/*.gz``.
    is_rm_out : bool
        Use ``../ensemble/rm_outlier_ensemble/*.gz``. Missing values in columns
        whose name contains ``'1-5'`` are filled with the row-wise mean of the
        columns whose name contains ``'8-'``.
    is_binary : bool
        Use ``../stack/*binary*.gz`` and replace ``target`` in train by
        1 where ``target < -30`` and 0 elsewhere.
    is_nn, is_rmf, is_ext, is_rid : bool
        Add the NN / rmf / ext / ridge prediction files to the ``lgb_path``
        files (default branch only).
    is_random : bool
        Default branch only: replace the ``lgb_path`` files by 10 files drawn
        with ``np.random.choice`` (i.e. with replacement).
    seed : int
        Seed passed to ``np.random.seed`` before the random draw.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``.

    Raises
    ------
    ValueError
        If no prediction file is selected, or if a CV tag cannot be parsed
        from a prediction file name.
    """
    from joblib import Parallel, delayed

    print("Preparing dataset...")
    base = utils.read_df_pkl('../input/base_no_out_clf.gz').set_index(key)

    #========================================================================
    # Base Model Path
    #========================================================================
    if is_clf_out:
        # Clf Out Model
        ens_list = glob.glob('../ensemble/clf_min_thres_ensemble/*.gz')
    elif is_no_out_flg:
        # No Out Flg Model
        # NOTE(review): unlike the sibling paths this one is not under
        # '../ensemble/' -- confirm the directory layout.
        ens_list = glob.glob('../no_out_flg_ensemble/*.gz')
    elif is_rm_out:
        ens_list = glob.glob('../ensemble/rm_outlier_ensemble/*.gz')
    elif is_binary:
        # NOTE(review): the original also assigned a function-local
        # model_type='lgr' here. It never left the function, so it is dropped;
        # set the notebook-level model_type at the call site if that was the intent.
        ens_list = glob.glob('../stack/*binary*.gz')
    else:
        # Base Model
        # The glob must run BEFORE the random draw: sampling from lgb_list
        # before it was assigned raised UnboundLocalError.
        lgb_list = glob.glob(lgb_path)
        if is_random and lgb_list:
            np.random.seed(seed)
            # np.random.choice draws with replacement, so a file can repeat.
            lgb_list = list(np.random.choice(lgb_list, 10))
        nn_list = glob.glob('../ensemble/NN_ensemble/*CV3*.gz') if is_nn else []
        rmf_list = glob.glob('../ensemble/various_model/*rmf*.gz') if is_rmf else []
        ext_list = glob.glob('../ensemble/various_model/*ext*.gz') if is_ext else []
        rid_list = glob.glob('../ensemble/various_model/*ridge*.gz') if is_rid else []
        ens_list = lgb_list + nn_list + rid_list + rmf_list + ext_list

    if not ens_list:
        # pd.concat([]) would fail later with an unhelpful message.
        raise ValueError(
            f"get_stack_dataset: no prediction files found (lgb_path={lgb_path!r})"
        )

    #========================================================================
    # Stack Models Load
    def parallel_stack_model(model_path):
        """Load one prediction file and return a single column indexed by ``key``."""
        # The CV tag comes from the file name: first try '...CV<tag>_LB?gz',
        # then fall back to '...CV<tag>?gz' on the path with '.' replaced by '-'.
        matched = re.search(r'CV([^/.]*)_LB.gz', model_path)
        if matched is None:
            matched = re.search(r'CV([^/.]*).gz', model_path.replace('.', '-'))
        if matched is None:
            raise ValueError(f"cannot parse a CV tag from model path: {model_path}")
        cv = matched.group(1)

        pred = utils.read_pkl_gzip(model_path)
        if key not in pred.columns:
            pred = pred.reset_index()
        pred_col = 'pred_mean' if 'pred_mean' in pred.columns else 'prediction'

        # The first tag found anywhere in the path (directories included)
        # decides the column prefix; the order below is the priority.
        family = 'model'
        for tag in ('lgb', 'NN', 'ridge', 'rmf', 'ext'):
            if tag in model_path:
                family = tag
                break

        out = pred[[key, pred_col]].copy()
        out.columns = [key, f"base_{family}_{cv}"]
        return out.set_index(key)
    #========================================================================

    p_list = Parallel(n_jobs=-1)([delayed(parallel_stack_model)(model_path) for model_path in ens_list])
    df_pred = pd.concat(p_list, axis=1)
    if is_rm_out:
        cv15 = [col for col in df_pred.columns if '1-5' in col]
        cv8 = [col for col in df_pred.columns if '8-' in col]
        # NOTE: 'tmp_mean' is kept in df_pred and therefore also joined onto
        # base, exactly as before.
        df_pred['tmp_mean'] = df_pred[cv8].mean(axis=1).values
        for col in cv15:
            df_pred[col] = df_pred[col].fillna(df_pred['tmp_mean'])
    base = base.join(df_pred)

    #========================================================================
    # Split on label availability.
    has_target = base[target].notnull()
    train = base[has_target]
    test = base[~has_target]
    if key not in base.columns:
        # key is the index at this point; expose it as a regular column.
        train = train.reset_index()
        test = test.reset_index()

    if is_rm_out:
        # train already holds only rows with a target; nothing more to filter.
        pass
    elif is_clf_out:
        train = train[train['clf_pred']<0.01]
        test = test[test['clf_pred']<0.01]
    elif is_binary:
        # Work on a copy so the assignment cannot hit a view of `base`.
        train = train.copy()
        train[target] = (train[target] < -30).astype(int)

    display(train.head())

    return train, test

2019-02-18 10:06:21,437 utils 400 [INFO]    [logger_func] start 


In [11]:
from sklearn.linear_model import LogisticRegression

valid_type = ['ods', 'pmo' ,'pm' ,'term'][3]
# The later assignment wins; the first pattern is kept as a quick alternative.
lgb_path = '../ensemble/pmo_all_stack_level1/*.gz'
lgb_path = '../ensemble/good_submit_ensemble/0215_083_stack_submit_OUT29-7593_CV3-61266_LB3-664.gz'
# lgb_path = '../ensemble/dir_stack_blend/tmp/*_lgb_*.gz'
#========================================================================
# Make Dataset
# Toggle idiom: index 0 -> True, index 1 -> False.
is_clf_out = [True, False][1]
is_no_out_flg = [True, False][1]
is_rm_out = [True, False][1]
is_binary = [True, False][1]
is_nn = 0
is_rid = 0
is_rmf = 0
is_ext = 0
is_random = 0
seed_size = 1
#========================================================================

#========================================================================
# CV setup
seed = 328
# The later assignment wins (fold_seed == 1208).
fold_seed = 328
fold_seed = 1208
seed_list = [328, 1208]
fold = 6

if is_rm_out:
    set_type = 'rm_out'
else:
    set_type = 'all'

#========================================================================
# Dataset
submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
result_list = []
score_list = []
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg', 'clf_pred']
#========================================================================

# Forward EVERY flag configured above. Previously only is_rmf / is_ext /
# is_random were passed, so changing the other switches had no effect on the
# dataset that was built.
train, test = get_stack_dataset(
    lgb_path=lgb_path,
    is_clf_out=is_clf_out,
    is_no_out_flg=is_no_out_flg,
    is_rm_out=is_rm_out,
    is_binary=is_binary,
    is_nn=is_nn,
    is_rmf=is_rmf,
    is_ext=is_ext,
    is_rid=is_rid,
    is_random=is_random,
    seed=seed,
)

  0%|          | 0/1 [00:00<?, ?it/s]

Preparing dataset...


100%|██████████| 1/1 [00:00<00:00,  3.97it/s]


Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,new_purchase_month_min,hist_personal_term,new_personal_term,hist_regist_term,new_regist_term,no_out_flg,clf_pred,base_model_3-61266_LB3-664
0,C_ID_92a2005557,-0.820283,2017-06-01,2018-02-25 09:31:15,2018-03-01,2017-06-27 14:18:08,2017-06-01,2018-04-29 11:23:05,2018-05-01,2018-03-05 14:04:36,2018-03-01,9,2.0,9,11.0,1.0,0.000444,-0.343964
1,C_ID_3d0044924f,0.392913,2017-01-01,2018-01-31 22:31:09,2018-02-01,2017-01-06 16:29:42,2017-01-01,2018-03-30 06:48:26,2018-04-01,2018-02-01 17:07:54,2018-02-01,13,2.0,13,15.0,0.0,0.007831,0.226026
2,C_ID_d639edf6cd,0.688056,2016-08-01,2018-02-27 19:08:25,2018-03-01,2017-01-11 08:21:22,2017-01-01,2018-04-28 17:43:11,2018-05-01,2018-04-28 17:43:11,2018-04-01,14,1.0,19,18.0,0.0,0.004074,0.750429
3,C_ID_186d6a6901,0.142495,2017-09-01,2018-02-28 11:44:40,2018-03-01,2017-09-26 16:22:21,2017-09-01,2018-04-18 11:00:11,2018-05-01,2018-03-07 11:55:06,2018-03-01,6,2.0,6,8.0,0.0,0.000797,0.193548
4,C_ID_cdbd2c0db2,-0.159749,2017-11-01,2018-02-28 20:40:41,2018-03-01,2017-11-12 00:00:00,2017-11-01,2018-04-28 18:50:25,2018-05-01,2018-03-02 11:55:43,2018-03-01,4,2.0,4,6.0,1.0,0.000251,-0.258822


In [39]:
# Load the historical transactions. The inspection cells further down read
# `df_hist` and its `purchase_amount_new` column, so with this cell commented
# out a fresh "Restart & Run All" fails with NameError.
df_hist = pd.read_csv('../input/historical_transactions.csv')
# Linear rescaling of purchase_amount, rounded to 2 decimals.
# NOTE(review): presumably maps the normalised amount back to a currency-like
# scale -- confirm where the two constants come from.
df_hist['purchase_amount_new'] = np.round(df_hist['purchase_amount'] / 0.00150265118 + 497.06, 2)

# New-merchant transactions are only referenced from commented-out code, so
# this load stays disabled.
# df_new = pd.read_csv('../input/new_merchant_transactions.csv')
# df_new['purchase_amount_new'] = np.round(df_new['purchase_amount'] / 0.00150265118 + 497.06, 2)

# Do not free df_hist here: later cells still use it. Run these two lines
# manually once the inspection is finished.
# del df_hist
# gc.collect()

977

In [7]:
# Card ids stored in the pickled id list.
id_list = utils.read_pkl_gzip('../input/0217_merchant_one_id_list.gz')

# Key/target pairs of the training cards that appear in that list.
tmp = train[train[key].isin(id_list)][[key, target]]
# Optional narrowing to the cards with target below -30:
# id_list = tmp[tmp[target]<-30][key].values

# Show all training columns for the listed cards.
train[train[key].isin(id_list)]

Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,new_purchase_month_min,hist_personal_term,new_personal_term,hist_regist_term,new_regist_term,no_out_flg,clf_pred,base_model_3-61266_LB3-664
175,C_ID_7decdf7eec,0.000000,2016-04-01,2018-02-24 05:32:54,2018-03-01,2017-01-24 07:04:52,2017-01-01,NaT,NaT,NaT,NaT,14,,23,,0.0,0.002881,0.147969
818,C_ID_fba4a71df1,0.135437,2015-11-01,2018-02-18 06:16:55,2018-03-01,2017-01-16 07:32:14,2017-01-01,NaT,NaT,NaT,NaT,14,,24,,0.0,0.002387,0.133502
1100,C_ID_dc8b95d37f,0.000000,2016-02-01,2017-07-06 07:20:23,2017-08-01,2017-01-06 07:03:40,2017-01-01,NaT,NaT,NaT,NaT,7,,18,,0.0,0.029440,-0.365080
2061,C_ID_716e065493,-0.487497,2017-10-01,2018-02-26 08:06:27,2018-03-01,2017-09-25 19:45:46,2017-09-01,NaT,NaT,NaT,NaT,6,,5,,0.0,0.002798,0.134141
3442,C_ID_460aa35ea2,0.000000,2015-12-01,2017-07-05 06:40:50,2017-08-01,2017-01-05 07:26:19,2017-01-01,NaT,NaT,NaT,NaT,7,,20,,0.0,0.025907,-0.728847
3485,C_ID_5da6294302,0.000000,2016-06-01,2018-02-12 09:10:03,2018-03-01,2017-01-29 08:57:40,2017-01-01,NaT,NaT,NaT,NaT,14,,21,,0.0,0.002430,0.098572
4061,C_ID_9000088bfa,0.149481,2016-03-01,2017-03-18 11:17:40,2017-04-01,2017-01-16 08:21:02,2017-01-01,NaT,NaT,NaT,NaT,3,,13,,1.0,0.000605,0.141312
4089,C_ID_c94316139b,0.000000,2016-09-01,2017-08-21 09:15:29,2017-09-01,2017-01-06 09:21:00,2017-01-01,NaT,NaT,NaT,NaT,8,,12,,0.0,0.048857,-0.721559
4148,C_ID_1a4fbf387c,0.000000,2017-11-01,2018-02-26 08:33:02,2018-03-01,2017-11-26 15:50:14,2017-11-01,NaT,NaT,NaT,NaT,4,,4,,1.0,0.000441,0.143168
4712,C_ID_909bc065d5,0.000000,2017-09-01,2018-02-09 07:18:27,2018-03-01,2017-09-09 23:22:00,2017-09-01,NaT,NaT,NaT,NaT,6,,6,,0.0,0.003234,0.146546


In [37]:
# Use the full option name: the abbreviated 'max_row' is resolved by pattern
# matching and becomes ambiguous on pandas versions that also define
# 'styler.render.max_rows'.
pd.set_option('display.max_rows', 500)

# 2 ** target; NOTE(review): presumably undoes a log2 transform of the label -- confirm.
train['raw_target'] = 2**train[target]
base_col = [col for col in train.columns if 'base_' in col]

# Transactions of the listed cards, enriched with per-card label/prediction
# columns and ordered chronologically within each card. Built as one chain of
# new frames: the previous in-place set_index/sort_values ran on a slice of
# df_hist (SettingWithCopyWarning), and the extra sort by target was
# immediately overridden by the sort below.
tmp = (
    df_hist[df_hist[key].isin(id_list)]
    .set_index([key, 'purchase_date'])
    .join(train.set_index(key)[[target, 'raw_target', 'clf_pred', base_col[0]]])
    .sort_values(by=[key, 'purchase_date'], ascending=True)
)

In [38]:
# Columns shown when inspecting individual transactions.
viz_cols = ['authorized_flag', 'installments', 'merchant_id', 'month_lag', 'purchase_amount_new', target, 'raw_target', 'clf_pred', base_col[0]]
# Keep only transactions whose card has a target. A single .loc selection plus
# .copy() yields an independent frame, so adding 'diff' below does not go
# through chained indexing (SettingWithCopyWarning).
tmp = tmp.loc[tmp[target].notnull(), viz_cols].copy()
# Absolute gap between the label and the first base-model prediction.
tmp['diff'] = (tmp[target] - tmp[base_col[0]]).abs()
# Transactions of cards where that gap exceeds 2.
tmp[tmp['diff'] > 2]

Unnamed: 0_level_0,Unnamed: 1_level_0,authorized_flag,installments,merchant_id,month_lag,purchase_amount_new,target,raw_target,clf_pred,base_model_3-61266_LB3-664,diff
card_id,purchase_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C_ID_039afe8efd,2017-01-21 08:53:30,Y,1,M_ID_fc7d7969c3,-11,22.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-02-21 07:58:54,Y,1,M_ID_fc7d7969c3,-10,22.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-03-21 10:13:38,Y,1,M_ID_fc7d7969c3,-9,22.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-04-21 07:10:57,Y,1,M_ID_fc7d7969c3,-8,22.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-05-21 07:07:24,Y,1,M_ID_fc7d7969c3,-7,22.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-06-21 07:09:02,Y,1,M_ID_fc7d7969c3,-6,22.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-07-21 08:50:59,Y,1,M_ID_fc7d7969c3,-5,27.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-08-21 14:23:35,Y,1,M_ID_fc7d7969c3,-4,27.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-09-21 11:47:02,Y,1,M_ID_fc7d7969c3,-3,27.9,0.0,1.0,0.052795,-3.655322,3.655322
C_ID_039afe8efd,2017-10-21 09:12:55,Y,1,M_ID_fc7d7969c3,-2,27.9,0.0,1.0,0.052795,-3.655322,3.655322


In [21]:
# All historical transactions recorded for one specific card.
df_hist.loc[df_hist[key].eq('C_ID_a2dc8471b4')]
# df_new[df_new[key]=='C_ID_0fb0e3b5cb']

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,purchase_amount_new
