In [34]:
is_stack = [True, False][0]
debug = False
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
import numpy as np
import os
import sys
import time
import datetime
import glob
sys.path.append('../py/')
from s027_kfold_ods import ods_kfold
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, reduce_mem_usage, elo_save_feature, impute_feature
try:
    if not logger:
        logger=logger_func()
except NameError:
    logger=logger_func()

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

#========================================================================
# Keras 
# Corporación Favorita Grocery Sales Forecasting
from sklearn.linear_model import Ridge
#========================================================================


#========================================================================
# Args
out_part = ['', 'part', 'all'][0]
key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg']
stack_name='ridge'
submit = pd.read_csv('../input/sample_submission.csv')
model_type='ridge'
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
seed = 328
#========================================================================

#========================================================================
# Data Load 
def get_stack_dataset(lgb_path='', is_clf_out=False, is_no_out_flg=False, is_rm_out=False, is_binary=False, is_nn=False, is_rmf=False, is_ext=False, is_rid=False, is_random=False, seed=seed):
#     print("Preparing dataset...")
    base = utils.read_df_pkl('../input/base_no_out_clf.gz', use_tqdm=False).set_index(key)
    
    #========================================================================
    # Base Model Path
    #========================================================================
    # Clf Out Model
    if is_clf_out: ens_list = glob.glob('../ensemble/clf_min_thres_ensemble/*.gz')
    # No Out Flg Model
    elif is_no_out_flg: ens_list = glob.glob('../no_out_flg_ensemble/*.gz')
    elif is_rm_out: ens_list = glob.glob('../ensemble/rm_outlier_ensemble/*.gz')
    elif is_binary:
        model_type='lgr'
        lgb_list = glob.glob('../stack/*binary*.gz')
        nn_list = []
        ens_list = lgb_list + nn_list
    #========================================================================
    # Base Model
    else:
        if is_random:
            np.random.seed(seed)
            lgb_list = list(np.random.choice(lgb_list, 10))
#             nn_list = list(np.random.choice(nn_list, 1))
        nn_list = []
        rid_list = []
        ext_list = []
        rmf_list = []
        if is_nn : nn_list = glob.glob('../ensemble/NN_ensemble/*.gz')
        if is_rmf: rmf_list = glob.glob('../ensemble/various_model/*rmf*.gz')
        if is_ext: ext_list = glob.glob('../ensemble/various_model/*ext*.gz')
        if is_rid: rid_list = glob.glob('../ensemble/various_model/*ridge*.gz')
        lgb_list = glob.glob(lgb_path)
#         lgb_list = glob.glob(lgb_path) + glob.glob('../ensemble/rm_outlier_ensemble/tmp/*.gz')
        ens_list = lgb_list + nn_list + rid_list + rmf_list + ext_list
    
    #========================================================================
    # Stack Models Load
    from joblib import Parallel, delayed
    def parallel_stack_model(model_path):
        try:
            cv = re.search(r'CV([^/.]*)_LB.gz', model_path).group(1)
        except AttributeError:
            cv = re.search(r'CV([^/.]*).gz', model_path.replace('.', '-')).group(1)
        tmp = utils.read_pkl_gzip(model_path)
        if key not in tmp.columns:
            tmp.reset_index(inplace=True)
        if 'pred_mean' in tmp.columns:
            tmp = tmp[[key, 'pred_mean']]
        else:
            tmp = tmp[[key, 'prediction']]
            
        if model_path.count('lgb'):
            tmp.columns = [key, f"base_lgb_{cv}"]
        elif model_path.count('NN'):
            tmp.columns = [key, f"base_NN_{cv}"]
        elif model_path.count('ridge'):
            tmp.columns = [key, f"base_ridge_{cv}"]
        elif model_path.count('rmf'):
            tmp.columns = [key, f"base_rmf_{cv}"]
        elif model_path.count('ext'):
            tmp.columns = [key, f"base_ext_{cv}"]
        else:
            tmp.columns = [key, f"base_model_{cv}"]
        return tmp.set_index(key)
    #========================================================================
    
    p_list = Parallel(n_jobs=-1)([delayed(parallel_stack_model)(model_path) for model_path in ens_list])
    df_pred = pd.concat(p_list, axis=1)
    if is_rm_out:
        cv15 = [col for col in df_pred.columns if col.count('1-5')]
        cv8 = [col for col in df_pred.columns if col.count('8-')]
        df_pred['tmp_mean'] = df_pred[cv8].mean(axis=1).values
        for col in cv15:
            df_pred.loc[df_pred[col].isnull(), col] = df_pred.loc[df_pred[col].isnull(), 'tmp_mean']
    base = base.join(df_pred)
    
    #========================================================================
    
    if key in base.columns:
        train = base[~base[target].isnull()]
        test = base[base[target].isnull()]
    else:
        train = base[~base[target].isnull()].reset_index()
        test = base[base[target].isnull()].reset_index()
    
    if is_rm_out:
        train = train[~train[target].isnull()]
    elif is_clf_out:
        train = train[train['clf_pred']<0.01]
        test = test[test['clf_pred']<0.01]
    elif is_binary:
        train[target] = train[target].map(lambda x: 1 if x<-30 else 0)
        
#     display(train.head())
    
    return train, test

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
from sklearn.linear_model import LogisticRegression

base = utils.read_df_pkl('../input/base_no_out_clf.gz')[[key, target, 'clf_pred']].set_index(key)
valid_type = ['ods', 'pmo' ,'pm' ,'term'][0]
# lgb_path = '../ensemble/pmo_all_stack_level1/*.gz'
# lgb_path = '../model/LB3664_set/*_lgb_*.gz'
lgb_path = '../ensemble/dir_stack_blend/tmp/*_lgb_*.gz'
# lgb_path = '../ensemble/rm_outlier_ensemble/tmp/*_lgb_*.gz'
#========================================================================
# Make Dataset 
is_clf_out = [True, False][1]
is_no_out_flg = [True, False][1]
is_rm_out = [True, False][1]
is_binary = [True, False][1]
is_blend = [True, False][1]
is_nn = 0
is_rid = 0
is_rmf = 0
is_ext = 0
is_random = 0
seed_size = 1
pred_col = 'prediction'
#========================================================================
    
#========================================================================
# CVの準備
seed = 328
fold_seed = 328
fold_seed = 1208
seed_list = [328, 1208]
seed_list += list(range(30))
# seed_list = [328]
fold = 6

if is_rm_out:
    set_type = 'rm_out'
else:
    set_type = 'all'

#========================================================================
# Dataset
submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
result_list = []
score_list = []
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg', 'clf_pred']
#========================================================================
    
#========================================================================
# NN Model Setting 
params = {}
if is_binary:
    params['n_jobs']=-1
    params['C']=1.0
    params['solver'] ='liblinear'
    params['fit_intercept']=True
    params['max_iter']=1000
    params['tol']=0.01
    params['random_state']=seed
    model = LogisticRegression(**params)
else:
    params['solver'] ='auto'
    params['fit_intercept']=True
    params['alpha']=0.4
    params['max_iter']=1000
    params['normalize']=False
    params['tol']=0.01
    params['random_state']=seed
    model = Ridge(**params)
    
lazy_target = base[base['clf_pred']<0.01].reset_index()
eazy_target = base[base['clf_pred']>=0.01].reset_index()

valid_list = [lazy_target, eazy_target]

train, test = get_stack_dataset(lgb_path=lgb_path, is_rmf=is_rmf, is_ext=is_ext, is_nn=is_nn, is_random=is_random, seed=seed)
if is_rm_out:
    train = train[train[target]>-30]

oof_pred = np.zeros(len(base))
seed_pred = np.zeros(len(test))

for seed_no, fold_seed in enumerate(seed_list):
    
#     if is_rm_out:
#         kfold = utils.read_pkl_gzip('../input/kfold_ods_no_out_fold6_seed328.gz')
#     elif is_clf_out:
#         kfold = utils.read_pkl_gzip('../input/kfold_ods_clf_out_fold6_seed328.gz')
#     else:
# #         kfold = utils.read_pkl_gzip(f'../input/kfold_{valid_type}_all_fold6_seed{fold_seed}.gz')
#         kfold = utils.read_pkl_gzip(f'../input/kfold_ods_equal_seed328.gz')
    kfold = utils.get_kfold(valid_list, fold_seed)    
     
    #========================================================================
    # Preset
    use_cols = sorted([col for col in train.columns if col.count('base_')])
    lgb_list = [col for col in use_cols if col.count('lgb')]
    nn_list = [col for col in use_cols if col.count('NN')]
    ext_list = [col for col in use_cols if col.count('ext')]
    best_score = 100
    best_score_list = []
    test_pred = np.zeros(len(test))
    Y = train[target]
    result_list = []
    #========================================================================

    #========================================================================
    # Train & Prediction Start
    for fold_no, (trn_idx, val_idx) in enumerate(zip(*kfold)):
        if is_blend:
            break
            
        if key not in train.columns:
            train = train.reset_index()
            test = test.reset_index() 
             
        #========================================================================
        # Make Dataset
        scaler = StandardScaler()
        scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
        x_test = scaler.transform(test[use_cols])

        X_train, y_train = train.loc[train[key].isin(trn_idx), :][use_cols], Y.loc[train[key].isin(trn_idx)]
        X_val, y_val = train.loc[train[key].isin(val_idx), :][use_cols], Y.loc[train[key].isin(val_idx)]
        
        X_train[:] = scaler.transform(X_train)
        X_val[:] = scaler.transform(X_val)
        X_train = X_train.as_matrix()
        X_val = X_val.as_matrix()
    
#         print(f"Train: {X_train.shape} | Valid: {X_val.shape} | Test: {x_test.shape}")
        #========================================================================
        
        # Fitting
#         print(X_train[:5])
        model.fit(X_train, y_train)
        
        # Prediction
        if is_binary:
            y_pred = model.predict_proba(X_val)[:, 1]
#         elif is_rm_out:
#             X_val = train.loc[~train[key].isin(trn_idx), :]
#             y_pred = model.predict(X_val[use_cols])
#             y_val = X_val[target].values
        else:
            y_pred = model.predict(X_val)
        
        test_pred += model.predict(x_test)
        
        # Stack Prediction
#         if is_rm_out:
        if False:
            if fold_no==0:
                df_pred = train[[key, target]].set_index(key)
            self_valid = X_val[[key, target]].set_index(key)
            self_valid[f'pred_{fold_no}'] = y_pred
            df_pred = df_pred.join(self_valid.drop(target, axis=1))
        else:
            df_pred = train.loc[train[key].isin(val_idx), :][[key, target]].copy()
            df_pred['prediction'] = y_pred
            result_list.append(df_pred)
        
        # Scoring
        if is_binary:
            score = np.sqrt(roc_auc_score(y_val, y_pred))
        else:
            score = np.sqrt(mean_squared_error(y_val, y_pred))
        score_list.append(score)
        #========================================================================
    
    cv_score = np.mean(score_list)

    #========================================================================
    # Stacking
    test_pred /= fold_no+1
    test['prediction'] = test_pred
#     seed_pred += test_pred
#     test['prediction'] = seed_pred / (seed_no+1)
    df_pred = test[[key, 'prediction']]
    print(df_pred['prediction'].values[:10])
    result_list.append(df_pred)
    
    df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target, axis=1)
    if key not in base:
        base.reset_index(inplace=True)
    df_pred = base[[key, target]].merge(df_pred, how='inner', on=key)
    
    oof_pred += df_pred['prediction'].values
    
    #========================================================================
    # outlierに対するスコアを出す
    if is_rm_out or is_binary:
        out_score = 0
    else:
        if key not in train.columns:
            train.reset_index(inplace=True)
        out_ids = train.loc[train.target<-30, key].values
        out_val = train.loc[train.target<-30, target].values
        out_pred = df_pred[df_pred[key].isin(out_ids)]['prediction'].values
        out_score = np.sqrt(mean_squared_error(out_val, out_pred))
    #========================================================================
    
    check_oof = oof_pred/(seed_no+1)
    print(f'''
#========================================================================
# Seed: {fold_seed} | CV SCORE AVG: {cv_score} | OUT SCORE: {out_score} | {check_oof[:3]} | {check_oof[-3:]}
#========================================================================''')
    if cv_score<best_score:
    
        best_score = cv_score
        best_score_list = use_cols
        
oof_pred = oof_pred / (seed_no+1)
df_pred['prediction'] = oof_pred
display(df_pred.head())
display(df_pred.tail())
        
#========================================================================
# Save Stack
utils.to_pkl_gzip(path=f"../stack/{start_time[4:12]}_stack_{model_type}_set-{set_type}_valid-{valid_type}-seed{fold_seed}_lgb{len(lgb_list)}_NN{is_nn}_ridge{is_rid}_ext{is_ext}_rmf{is_rmf}_OUT{str(out_score)[:7]}_CV{cv_score}_LB" , obj=df_pred[[key, 'prediction']])
#========================================================================
sys.exit()
    
#========================================================================
# Submission
df_pred.set_index(key, inplace=True)
submit[target] = df_pred['prediction']
submit_path = f'../submit/{start_time[4:12]}_submit_{model_type}_set-{set_type}_lgb{len(lgb_list)}_NN{len(nn_list)}_other{len(other_list)}_OUT{str(out_score)[:7]}_CV{cv_score}_LB.csv'
submit.to_csv(submit_path, index=True)
display(submit.head())
#========================================================================

100%|██████████| 1/1 [00:00<00:00,  4.15it/s]


reading ../input/base_no_out_clf.gz
[-1.3624335  -0.35634117 -1.33485325 -0.14867652 -1.29896603  0.34796406
  0.19205029  0.39484679 -0.32554316 -1.40029491]

# Seed: 328 | CV SCORE AVG: 3.6111670791080415 | OUT SCORE: 29.64638213264338 | [-0.33477421 -0.63591849  0.68853019] | [ 0.76240706 -3.55185174  0.22186158]
[-1.36111577 -0.35598428 -1.33514907 -0.14841262 -1.29937814  0.34855701
  0.19233105  0.3945477  -0.32541478 -1.40260581]

# Seed: 1208 | CV SCORE AVG: 3.611534846672558 | OUT SCORE: 29.646627034446844 | [-0.34121005 -0.66913031  0.71799634] | [ 0.76291356 -3.55738142  0.22227832]
[-1.36052866 -0.35528314 -1.33571339 -0.14817937 -1.29893361  0.34862042
  0.19245375  0.39562489 -0.32542207 -1.40058449]

# Seed: 0 | CV SCORE AVG: 3.61174124412314 | OUT SCORE: 29.647661119375414 | [-0.33649032 -0.63462642  0.72152572] | [ 0.76304448 -3.55714722  0.22223222]
[-1.36027002 -0.3557322  -1.33629153 -0.14841314 -1.29814323  0.34817981
  0.19210584  0.39597519 -0.32500054 -1.4009758

Unnamed: 0,card_id,target,prediction
0,C_ID_92a2005557,-0.820283,-0.335461
1,C_ID_3d0044924f,0.392913,-0.648348
2,C_ID_d639edf6cd,0.688056,0.723076
3,C_ID_186d6a6901,0.142495,0.183964
4,C_ID_cdbd2c0db2,-0.159749,-0.100282


Unnamed: 0,card_id,target,prediction
325535,C_ID_7a239d2eda,,1.018862
325536,C_ID_75ace375ae,,-1.024097
325537,C_ID_21d56d950c,,0.763309
325538,C_ID_6c46fc5a9d,,-3.558101
325539,C_ID_87e7979a5f,,0.222246


SystemExit: 

In [35]:
utils.to_pkl_gzip(path=f"../stack/{start_time[4:12]}_stack_{model_type}_set-{set_type}_valid-{valid_type}-seed{fold_seed}_lgb{len(lgb_list)}_NN{is_nn}_ridge{is_rid}_ext{is_ext}_rmf{is_rmf}_OUT{str(out_score)[:7]}_CV{cv_score}_LB" , obj=df_pred[[key, 'prediction']])
display(df_pred.head())
display(df_pred.tail())

Unnamed: 0,card_id,target,prediction
0,C_ID_92a2005557,-0.820283,-0.335461
1,C_ID_3d0044924f,0.392913,-0.648348
2,C_ID_d639edf6cd,0.688056,0.723076
3,C_ID_186d6a6901,0.142495,0.183964
4,C_ID_cdbd2c0db2,-0.159749,-0.100282


Unnamed: 0,card_id,target,prediction
325535,C_ID_7a239d2eda,,1.018862
325536,C_ID_75ace375ae,,-1.024097
325537,C_ID_21d56d950c,,0.763309
325538,C_ID_6c46fc5a9d,,-3.558101
325539,C_ID_87e7979a5f,,0.222246


In [37]:
# x = pd.DataFrame(X_train)
for col in train.columns:
    tmp = train[col].isnull().sum()
    if not(col.count('base')):continue
    if tmp>0:
        print(col)
        display(train[train[col].isnull()])

base_NN_521-7779425059342


Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,...,base_lgb_3-627555_LB3-675,base_lgb_3-6179960053791236_LB3-667,base_lgb_3-6236254858483243,base_lgb_3-646290595477581,base_lgb_3-6333204401002663,base_lgb_3-6226545066935465,base_lgb_3-638811137215022,base_lgb_3-653674740088933,base_NN_549-4781753401306,base_NN_521-7779425059342
92,C_ID_b9379a30ea,-33.219281,2015-05-01,2017-08-10 11:26:28,2017-09-01,2017-01-02 11:53:33,2017-01-01,2018-01-13 10:41:01,2018-02-01,2018-01-09 11:40:28,...,-3.414548,-4.719915,-5.325719,-3.345237,-3.937165,-4.671265,-3.183540,-3.367472,0.415189,
1215,C_ID_3b60cf95bd,-2.241354,2016-09-01,2017-11-14 11:19:23,2017-12-01,2017-02-08 08:29:50,2017-02-01,2018-01-22 16:17:41,2018-02-01,2017-12-01 11:31:43,...,-3.977318,-4.017489,-4.270897,-4.243064,-4.050524,-3.665178,-3.778156,-3.292780,0.253382,
1254,C_ID_2edb57732a,-1.661818,2016-02-01,2017-12-22 13:56:17,2018-01-01,2017-01-18 13:34:21,2017-01-01,2018-01-19 11:13:53,2018-02-01,2018-01-11 20:08:11,...,-4.697244,-5.102153,-5.636354,-4.708135,-4.191139,-5.176814,-4.447089,-2.839778,0.346905,
2563,C_ID_1985211b7d,-6.081785,2014-11-01,2017-11-20 21:48:19,2017-12-01,2017-02-08 18:04:56,2017-02-01,2017-12-22 21:57:46,2018-01-01,2017-12-09 17:26:45,...,-3.197498,-4.069769,-3.273837,-2.942458,-3.026949,-3.026823,-3.275355,-4.286843,0.293987,
3842,C_ID_929cf67c44,-0.687414,2017-06-01,2017-12-31 15:59:00,2018-01-01,2017-06-16 20:59:12,2017-06-01,2018-01-27 21:27:45,2018-02-01,2018-01-02 17:05:22,...,-15.984479,-19.520808,-18.666542,-17.050479,-17.400247,-19.571258,-16.096455,-15.014532,0.526958,
4768,C_ID_26f05f2bb9,-33.219281,2017-03-01,2017-10-26 16:57:44,2017-11-01,2017-03-02 19:57:36,2017-03-01,2017-11-11 22:15:29,2017-12-01,2017-11-11 22:15:29,...,-13.525431,-14.052598,-14.049869,-12.290517,-15.208868,-15.829697,-11.381348,-15.666544,0.630741,
5868,C_ID_18c1e4935e,-3.057246,2017-02-01,2017-11-26 03:08:45,2017-12-01,2017-02-07 17:40:21,2017-02-01,2017-12-09 12:16:07,2018-01-01,2017-12-09 12:16:07,...,-12.666345,-11.222481,-12.170078,-15.370605,-14.443614,-15.434300,-15.574039,-11.675666,0.527972,
16649,C_ID_6987fbebce,-33.219281,2017-02-01,2017-09-26 11:16:08,2017-10-01,2017-02-24 08:22:20,2017-02-01,NaT,NaT,NaT,...,-8.685602,-9.493633,-9.930033,-7.991375,-9.872197,-11.291186,-5.716521,-9.670458,0.957080,
18449,C_ID_d9d202cb1d,-33.219281,2016-03-01,2018-01-12 14:02:21,2018-02-01,2017-01-13 10:23:12,2017-01-01,2018-03-19 00:33:05,2018-04-01,2018-02-21 08:59:39,...,-7.709654,-14.474654,-11.457523,-7.013136,-10.777880,-11.059271,-6.074511,-5.028375,0.278307,
22187,C_ID_b12963ffb9,-33.219281,2017-01-01,2017-10-31 01:54:18,2017-11-01,2017-01-21 12:39:41,2017-01-01,2017-11-09 00:00:00,2017-12-01,2017-11-07 00:00:00,...,-15.189049,-12.065856,-11.313930,-10.474757,-17.966222,-10.104348,-12.473513,-14.690118,0.447016,


In [58]:
for col in use_cols:
    tmp = train[col].isnull().sum()
    if tmp:
        print(col)

base_lgb_1-5486783250130114
base_lgb_1-5520199746035688
base_lgb_1-5520923703589151
base_lgb_1-553086826800966
base_lgb_1-5540396414459288
