In [1]:
# Notebook-wide run switches (flip the literals to toggle).
is_stack = True   # previously picked from [True, False] by index 0
debug = False
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
import numpy as np
import os
import sys
import time
import datetime
import glob
sys.path.append('../py/')
from s027_kfold_ods import ods_kfold
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, reduce_mem_usage, elo_save_feature, impute_feature
# Create the logger only when this kernel does not already hold a usable one,
# so re-running the cell does not build it again.
if not globals().get('logger'):
    logger = logger_func()

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

#========================================================================
# Keras 
# Corporación Favorita Grocery Sales Forecasting
from sklearn.linear_model import Ridge
#========================================================================


#========================================================================
# Args
# Identifier / label column names.
key, target = 'card_id', 'target'

# Columns that are never used as model features.
ignore_list = [key, target] + ['merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg']

# Output scope: one of '', 'part', 'all' (first option selected).
out_part = ''

# Both the stack label and the model type are ridge for this notebook.
stack_name = model_type = 'ridge'

submit = pd.read_csv('../input/sample_submission.csv')

# Timestamp tag for this run.
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
#========================================================================

#========================================================================
# Data Load 
def get_stack_dataset(lgb_path='', is_clf_out=False, is_no_out_flg=False, is_rm_out=False, is_binary=False, is_nn=False, is_rmf=False, is_ext=False, is_rid=False, is_random=False, seed=328):
    """Build the train/test frames used for stacking.

    Loads the base frame from '../input/base_no_out_clf.gz' (indexed by
    ``key``), attaches one prediction column per base-model file, then splits
    the rows into train (``target`` present) and test (``target`` missing).

    File selection (first truthy flag wins, in this order):
      * ``is_clf_out``    -> '../ensemble/clf_min_thres_ensemble/*.gz'
      * ``is_no_out_flg`` -> '../no_out_flg_ensemble/*.gz'
      * ``is_rm_out``     -> '../ensemble/rm_outlier_ensemble/*.gz'
      * ``is_binary``     -> '../stack/*binary*.gz'
      * otherwise         -> files matching ``lgb_path`` plus the optional
        NN / ridge / rmf / ext groups enabled by ``is_nn``, ``is_rid``,
        ``is_rmf`` and ``is_ext``.

    Args:
        lgb_path: glob pattern of the main base-model prediction files
            (only used when none of the four mode flags above is set).
        is_clf_out: also keeps only rows with ``clf_pred < 0.01`` in both
            train and test.
        is_no_out_flg: selects the no-outlier-flag prediction files.
        is_rm_out: after loading, fills missing values of columns whose name
            contains '1-5' with the row mean of columns whose name
            contains '8-'.
        is_binary: replaces the train target by 1 where ``target < -30``
            and 0 elsewhere.
        is_nn, is_rmf, is_ext, is_rid: add the matching optional file group.
        is_random: sample up to 10 distinct files from ``lgb_path`` matches.
        seed: seed passed to ``np.random.seed`` before sampling.

    Returns:
        (train, test): two DataFrames.

    Raises:
        ValueError: if no prediction file matches the selected pattern(s).
    """
    print("Preparing dataset...")
    base = utils.read_df_pkl('../input/base_no_out_clf.gz').set_index(key)

    #========================================================================
    # Base Model Path
    #========================================================================
    # Clf Out Model
    if is_clf_out:
        ens_list = glob.glob('../ensemble/clf_min_thres_ensemble/*.gz')
    # No Out Flg Model
    elif is_no_out_flg:
        ens_list = glob.glob('../no_out_flg_ensemble/*.gz')
    # Outlier-removed Model
    elif is_rm_out:
        ens_list = glob.glob('../ensemble/rm_outlier_ensemble/*.gz')
    # Binary (outlier classifier) Model
    elif is_binary:
        ens_list = glob.glob('../stack/*binary*.gz')
    #========================================================================
    # Base Model
    else:
        # The file list must exist BEFORE sampling from it; the original
        # sampled first and hit UnboundLocalError whenever is_random was set.
        lgb_list = glob.glob(lgb_path)
        if is_random:
            np.random.seed(seed)
            # Sort first (glob order is filesystem dependent) so a given seed
            # always selects the same files, and sample without replacement
            # so the same model is never loaded twice.
            n_pick = min(10, len(lgb_list))
            lgb_list = list(np.random.choice(sorted(lgb_list), n_pick, replace=False))

        nn_list = glob.glob('../ensemble/NN_ensemble/*CV3*.gz') if is_nn else []
        rmf_list = glob.glob('../ensemble/various_model/*rmf*.gz') if is_rmf else []
        ext_list = glob.glob('../ensemble/various_model/*ext*.gz') if is_ext else []
        rid_list = glob.glob('../ensemble/various_model/*ridge*.gz') if is_rid else []
        ens_list = lgb_list + nn_list + rid_list + rmf_list + ext_list

    if len(ens_list) == 0:
        # pd.concat would fail below with "No objects to concatenate";
        # fail early with a message that points at the real cause.
        raise ValueError(f"No base-model prediction files found (lgb_path={lgb_path!r}).")

    #========================================================================
    # Stack Models Load
    from joblib import Parallel, delayed

    def parallel_stack_model(model_path):
        """Load one prediction file as a single column indexed by ``key``."""
        # The CV tag is read from the file name. The fallback pattern is run
        # on the path with every '.' replaced by '-', so its unescaped '.'
        # is what matches the separator in front of 'gz' -- keep both
        # patterns exactly as written.
        try:
            cv = re.search(r'CV([^/.]*)_LB.gz', model_path).group(1)
        except AttributeError:
            cv = re.search(r'CV([^/.]*).gz', model_path.replace('.', '-')).group(1)

        tmp = utils.read_pkl_gzip(model_path)
        if key not in tmp.columns:
            tmp = tmp.reset_index()

        # Prefer the averaged prediction column when the file provides one.
        pred_col = 'pred_mean' if 'pred_mean' in tmp.columns else 'prediction'
        tmp = tmp[[key, pred_col]]

        # Name the column after the first model tag found in the path
        # (checked in this order); fall back to the generic 'model' tag.
        for tag in ('lgb', 'NN', 'ridge', 'rmf', 'ext'):
            if tag in model_path:
                break
        else:
            tag = 'model'
        tmp.columns = [key, f"base_{tag}_{cv}"]
        return tmp.set_index(key)
    #========================================================================

    p_list = Parallel(n_jobs=-1)([delayed(parallel_stack_model)(model_path) for model_path in ens_list])
    df_pred = pd.concat(p_list, axis=1)
    if is_rm_out:
        # Fill gaps in the '1-5' columns with the row-wise mean of the '8-'
        # columns. The helper column 'tmp_mean' stays in the frame, as before.
        cv15 = [col for col in df_pred.columns if col.count('1-5')]
        cv8 = [col for col in df_pred.columns if col.count('8-')]
        df_pred['tmp_mean'] = df_pred[cv8].mean(axis=1).values
        for col in cv15:
            missing = df_pred[col].isnull()
            df_pred.loc[missing, col] = df_pred.loc[missing, 'tmp_mean']
    base = base.join(df_pred)

    #========================================================================
    # Rows with a target are train, rows without one are test.
    has_target = ~base[target].isnull()
    if key in base.columns:
        train = base[has_target]
        test = base[~has_target]
    else:
        train = base[has_target].reset_index()
        test = base[~has_target].reset_index()

    # Mode-specific post-processing (same precedence as the original chain).
    if is_rm_out:
        train = train[~train[target].isnull()]
    elif is_clf_out:
        train = train[train['clf_pred']<0.01]
        test = test[test['clf_pred']<0.01]
    elif is_binary:
        train[target] = train[target].map(lambda x: 1 if x<-30 else 0)

    display(train.head())

    return train, test

2019-02-17 15:59:13,238 utils 400 [INFO]    [logger_func] start 


In [7]:
from sklearn.linear_model import LogisticRegression

valid_type = ['ods', 'pmo' ,'pm' ,'term'][3]

# Base-model prediction file(s) to stack. Only the uncommented assignment is
# active; the alternatives are kept for quick switching.
# lgb_path = '../ensemble/pmo_all_stack_level1/*.gz'
lgb_path = '../ensemble/good_submit_ensemble/0215_083_stack_submit_OUT29-7593_CV3-61266_LB3-664.gz'
# lgb_path = '../ensemble/dir_stack_blend/tmp/*_lgb_*.gz'
#========================================================================
# Make Dataset
# Every flag below is forwarded to get_stack_dataset() at the end of the cell.
is_clf_out = [True, False][1]
is_no_out_flg = [True, False][1]
is_rm_out = [True, False][1]
is_binary = [True, False][1]
is_nn = 0
is_rid = 0
is_rmf = 0
is_ext = 0
is_random = 0
seed_size = 1
#========================================================================

#========================================================================
# CV setup
seed = 328
# fold_seed = 328
fold_seed = 1208
seed_list = [328, 1208]
fold = 6

if is_rm_out:
    set_type = 'rm_out'
else:
    set_type = 'all'

#========================================================================
# Dataset
submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
result_list = []
score_list = []
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg', 'clf_pred']
#========================================================================

# Pass every dataset flag explicitly. The original call dropped is_clf_out,
# is_no_out_flg, is_rm_out, is_binary, is_nn and is_rid, so changing them
# above had no effect on the loaded data (while set_type still followed
# is_rm_out).
train, test = get_stack_dataset(
    lgb_path=lgb_path,
    is_clf_out=is_clf_out,
    is_no_out_flg=is_no_out_flg,
    is_rm_out=is_rm_out,
    is_binary=is_binary,
    is_nn=is_nn,
    is_rmf=is_rmf,
    is_ext=is_ext,
    is_rid=is_rid,
    is_random=is_random,
    seed=seed,
)

  0%|          | 0/1 [00:00<?, ?it/s]

Preparing dataset...


100%|██████████| 1/1 [00:00<00:00,  2.83it/s]


Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,new_purchase_month_min,hist_personal_term,new_personal_term,hist_regist_term,new_regist_term,no_out_flg,clf_pred,base_model_3-61266_LB3-664
0,C_ID_92a2005557,-0.820283,2017-06-01,2018-02-25 09:31:15,2018-03-01,2017-06-27 14:18:08,2017-06-01,2018-04-29 11:23:05,2018-05-01,2018-03-05 14:04:36,2018-03-01,9,2.0,9,11.0,1.0,0.000444,-0.343964
1,C_ID_3d0044924f,0.392913,2017-01-01,2018-01-31 22:31:09,2018-02-01,2017-01-06 16:29:42,2017-01-01,2018-03-30 06:48:26,2018-04-01,2018-02-01 17:07:54,2018-02-01,13,2.0,13,15.0,0.0,0.007831,0.226026
2,C_ID_d639edf6cd,0.688056,2016-08-01,2018-02-27 19:08:25,2018-03-01,2017-01-11 08:21:22,2017-01-01,2018-04-28 17:43:11,2018-05-01,2018-04-28 17:43:11,2018-04-01,14,1.0,19,18.0,0.0,0.004074,0.750429
3,C_ID_186d6a6901,0.142495,2017-09-01,2018-02-28 11:44:40,2018-03-01,2017-09-26 16:22:21,2017-09-01,2018-04-18 11:00:11,2018-05-01,2018-03-07 11:55:06,2018-03-01,6,2.0,6,8.0,0.0,0.000797,0.193548
4,C_ID_cdbd2c0db2,-0.159749,2017-11-01,2018-02-28 20:40:41,2018-03-01,2017-11-12 00:00:00,2017-11-01,2018-04-28 18:50:25,2018-05-01,2018-03-02 11:55:43,2018-03-01,4,2.0,4,6.0,1.0,0.000251,-0.258822


In [18]:
# Outlier rows are those with target < -30. Selecting the ones with
# target > -9 among them can never match, so this shows an empty frame.
# `tmp` is defined here so the cell does not depend on leftover kernel state.
tmp = train[train[target]<-30]
tmp[tmp[target]>-9]

Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,new_purchase_month_min,hist_personal_term,new_personal_term,hist_regist_term,new_regist_term,no_out_flg,clf_pred,base_model_3-61266_LB3-664


In [42]:
# Candidate card ids loaded from a gzip pickle.
id_list = utils.read_pkl_gzip('../input/0217_merchant_two_id_list.gz')

# Base-model prediction columns. Later cells read `base_col`, so it has to be
# defined by running code rather than living only in a commented-out line.
base_col = [col for col in train.columns if col.count('base_')]
# display(train.loc[train[key].isin(id_list), base_col][base_col[0]].map(lambda x: np.round(x, 1)).value_counts())
# display(test.loc[test[key].isin(id_list), base_col][base_col[0]].map(lambda x: np.round(x, 1)).value_counts())

# display(train.loc[train[key].isin(id_list), target].map(lambda x: np.round(x, 1)).value_counts())

# Narrow the candidates to train cards whose target is an outlier (< -30).
# From here on `id_list` holds only those ids.
tmp = train.loc[train[key].isin(id_list), [key, target]]
id_list = tmp[tmp[target]<-30][key].values

train.loc[train[key].isin(id_list), :]

Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,new_purchase_month_min,hist_personal_term,new_personal_term,hist_regist_term,new_regist_term,no_out_flg,clf_pred,base_model_3-61266_LB3-664
2422,C_ID_bcda954211,-33.219281,2015-10-01,2017-06-25 19:09:57,2017-07-01,2017-01-01 22:35:22,2017-01-01,NaT,NaT,NaT,NaT,6,,21,,0.0,0.075255,-2.979227
10610,C_ID_f59cca7717,-33.219281,2015-08-01,2017-06-10 08:36:50,2017-07-01,2017-02-03 13:51:16,2017-02-01,2017-08-19 10:09:01,2017-09-01,2017-07-17 13:03:52,2017-07-01,5,2.0,23,24.0,0.0,0.033305,-2.359165
11498,C_ID_2dd54c3384,-33.219281,2016-09-01,2017-11-27 23:31:21,2017-12-01,2017-01-04 15:42:55,2017-01-01,NaT,NaT,NaT,NaT,11,,15,,0.0,0.115252,-6.898462
21347,C_ID_ae23cf351b,-33.219281,2014-06-01,2017-05-26 11:56:41,2017-06-01,2017-01-11 13:05:56,2017-01-01,2017-12-29 13:15:24,2018-01-01,2017-12-29 13:15:24,2017-12-01,5,1.0,24,24.0,0.0,0.018994,-0.619401
28846,C_ID_678ba2338f,-33.219281,2016-12-01,2017-12-27 17:21:23,2018-01-01,2017-01-11 17:26:48,2017-01-01,NaT,NaT,NaT,NaT,12,,13,,0.0,0.022127,0.08247
42539,C_ID_55b3b206c7,-33.219281,2017-08-01,2017-12-13 10:47:22,2018-01-01,2017-08-18 17:08:28,2017-08-01,NaT,NaT,NaT,NaT,5,,5,,0.0,0.019082,-0.650166
47892,C_ID_554055d28a,-33.219281,2017-01-01,2017-07-14 12:51:38,2017-08-01,2017-01-07 13:56:57,2017-01-01,2017-09-15 17:05:29,2017-10-01,2017-09-15 17:05:29,2017-09-01,7,1.0,7,9.0,0.0,0.026089,-3.172241
82955,C_ID_e66591e63d,-33.219281,2014-10-01,2017-10-23 07:57:25,2017-11-01,2017-01-17 09:45:58,2017-01-01,NaT,NaT,NaT,NaT,10,,24,,0.0,0.134987,-6.513112
84502,C_ID_7b5822e4c5,-33.219281,2017-07-01,2017-12-01 10:45:12,2018-01-01,2017-07-30 10:57:12,2017-07-01,2018-02-08 17:28:04,2018-03-01,2018-01-14 12:28:39,2018-01-01,6,2.0,6,8.0,0.0,0.010856,-1.001155
84629,C_ID_98d0585bba,-33.219281,2016-08-01,2017-12-29 10:03:34,2018-01-01,2017-01-02 07:28:06,2017-01-01,NaT,NaT,NaT,NaT,12,,17,,0.0,0.091907,-5.437577


In [None]:
# `Series.isin` needs the collection of values to match against; calling it
# with no argument raises TypeError. Filter on the card ids held in `id_list`.
# NOTE: `df_hist` is loaded by the In [20] cell, which must have run first.
df_hist[df_hist[key].isin(id_list)]

In [20]:
# Raw transaction tables.
df_hist = pd.read_csv('../input/historical_transactions.csv')
df_new = pd.read_csv('../input/new_merchant_transactions.csv')

# Add the same derived column to both tables: a linear rescale of
# `purchase_amount`, rounded to 2 decimals.
for df_trans in (df_hist, df_new):
    df_trans['purchase_amount_new'] = np.round(df_trans['purchase_amount'] / 0.00150265118 + 497.06, 2)

In [44]:
# Show the base-model prediction column names; the next cell reads base_col[0].
# NOTE(review): the only visible definition of `base_col` is a commented-out
# line in the In [42] cell -- confirm it is defined before this cell runs.
base_col

['base_model_3-61266_LB3-664']

In [49]:
# Use the full option name: the abbreviation 'max_row' relies on pandas'
# pattern matching and raises OptionError once more than one option matches
# (e.g. when a styler max_rows option is also registered).
pd.set_option('display.max_rows', 500)

# Historical transactions of the selected cards, ordered per card by time and
# indexed by (card, purchase time). Built as one chain instead of mutating
# `tmp` in place, so re-running the cell is safe.
tmp = (
    df_hist.loc[df_hist[key].isin(id_list), :]
    .sort_values(by=[key, 'purchase_date'])
    .set_index([key, 'purchase_date'])
)

# Attach each card's target, classifier score and first base-model prediction.
tmp.join(train.set_index([key])[[target, 'clf_pred', base_col[0]]])

Unnamed: 0_level_0,Unnamed: 1_level_0,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,category_2,state_id,subsector_id,purchase_amount_new,target,clf_pred,base_model_3-61266_LB3-664
card_id,purchase_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
C_ID_2dd54c3384,2017-01-04 15:42:55,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.695878,,-1,7,33.96,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-04 17:05:43,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.745405,,-1,7,1.0,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-04 17:08:16,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.745405,,-1,7,1.0,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-04 17:34:32,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.726186,,-1,7,13.79,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-05 01:01:44,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.745405,,-1,7,1.0,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-05 01:16:24,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.735999,,-1,7,7.26,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-05 14:49:40,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.745405,,-1,7,1.0,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-05 15:08:26,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.734992,,-1,7,7.93,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-05 19:31:29,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.745405,,-1,7,1.0,-33.219281,0.115252,-6.898462
C_ID_2dd54c3384,2017-01-05 19:50:15,Y,-1,Y,1,B,511,M_ID_b9dcf28cb9,-11,-0.734346,,-1,7,8.36,-33.219281,0.115252,-6.898462


In [50]:
# New-merchant transactions of one specific card.
df_new.loc[df_new[key].eq('C_ID_554055d28a')]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,purchase_amount_new
1636181,Y,C_ID_554055d28a,271,N,5,C,222,M_ID_9965b98fe1,2,6.599554,2017-09-15 17:05:29,1.0,9,21,4889.0
