In [134]:
is_stack = [True, False][0]
debug = False
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
import numpy as np
import os
import sys
import time
import datetime
import glob
sys.path.append('../py/')
from s027_kfold_ods import ods_kfold
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, reduce_mem_usage, elo_save_feature, impute_feature
try:
    if not logger:
        logger=logger_func()
except NameError:
    logger=logger_func()

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

#========================================================================
# Keras 
# Corporación Favorita Grocery Sales Forecasting
from sklearn.linear_model import Ridge
#========================================================================


#========================================================================
# Args
out_part = ['', 'part', 'all'][0]
key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg']
stack_name='ridge'
submit = pd.read_csv('../input/sample_submission.csv')
model_type='ridge'
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
seed = 328
#========================================================================

#========================================================================
# Data Load 
def get_stack_dataset(lgb_path='', is_clf_out=False, is_no_out_flg=False, is_rm_out=False, is_binary=False, is_nn=False, is_rmf=False, is_ext=False, is_rid=False, is_random=False, seed=seed):
    print("Preparing dataset...")
    base = utils.read_df_pkl('../input/base_no_out_clf.gz').set_index(key)
    
    #========================================================================
    # Base Model Path
    #========================================================================
    # Clf Out Model
    if is_clf_out: ens_list = glob.glob('../ensemble/clf_min_thres_ensemble/*.gz')
    # No Out Flg Model
    elif is_no_out_flg: ens_list = glob.glob('../no_out_flg_ensemble/*.gz')
    elif is_rm_out: ens_list = glob.glob('../ensemble/rm_outlier_ensemble/*.gz')
    elif is_binary:
        model_type='lgr'
        lgb_list = glob.glob('../stack/*binary*.gz')
        nn_list = []
        ens_list = lgb_list + nn_list
    #========================================================================
    # Base Model
    else:
        if is_random:
            np.random.seed(seed)
            lgb_list = list(np.random.choice(lgb_list, 10))
#             nn_list = list(np.random.choice(nn_list, 1))
        nn_list = []
        rid_list = []
        ext_list = []
        rmf_list = []
        if is_nn : nn_list = glob.glob('../ensemble/NN_ensemble/*CV3*.gz')
        if is_rmf: rmf_list = glob.glob('../ensemble/various_model/*rmf*.gz')
        if is_ext: ext_list = glob.glob('../ensemble/various_model/*ext*.gz')
        if is_rid: rid_list = glob.glob('../ensemble/various_model/*ridge*.gz')
        lgb_list = glob.glob(lgb_path)
        ens_list = lgb_list + nn_list + rid_list + rmf_list + ext_list
    
    #========================================================================
    # Stack Models Load
    from joblib import Parallel, delayed
    def parallel_stack_model(model_path):
        try:
            cv = re.search(r'CV([^/.]*)_LB.gz', model_path).group(1)
        except AttributeError:
            cv = re.search(r'CV([^/.]*).gz', model_path.replace('.', '-')).group(1)
        tmp = utils.read_pkl_gzip(model_path)
        if key not in tmp.columns:
            tmp.reset_index(inplace=True)
        if 'pred_mean' in tmp.columns:
            tmp = tmp[[key, 'pred_mean']]
        else:
            tmp = tmp[[key, 'prediction']]
            
        if model_path.count('lgb'):
            tmp.columns = [key, f"base_lgb_{cv}"]
        elif model_path.count('NN'):
            tmp.columns = [key, f"base_NN_{cv}"]
        elif model_path.count('ridge'):
            tmp.columns = [key, f"base_ridge_{cv}"]
        elif model_path.count('rmf'):
            tmp.columns = [key, f"base_rmf_{cv}"]
        elif model_path.count('ext'):
            tmp.columns = [key, f"base_ext_{cv}"]
        else:
            tmp.columns = [key, f"base_model_{cv}"]
        return tmp.set_index(key)
    #========================================================================
    
    p_list = Parallel(n_jobs=-1)([delayed(parallel_stack_model)(model_path) for model_path in ens_list])
    df_pred = pd.concat(p_list, axis=1)
    if is_rm_out:
        cv15 = [col for col in df_pred.columns if col.count('1-5')]
        cv8 = [col for col in df_pred.columns if col.count('8-')]
        df_pred['tmp_mean'] = df_pred[cv8].mean(axis=1).values
        for col in cv15:
            df_pred.loc[df_pred[col].isnull(), col] = df_pred.loc[df_pred[col].isnull(), 'tmp_mean']
    base = base.join(df_pred)
    
    #========================================================================
    
    if key in base.columns:
        train = base[~base[target].isnull()]
        test = base[base[target].isnull()]
    else:
        train = base[~base[target].isnull()].reset_index()
        test = base[base[target].isnull()].reset_index()
    
    if is_rm_out:
        train = train[~train[target].isnull()]
    elif is_clf_out:
        train = train[train['clf_pred']<0.01]
        test = test[test['clf_pred']<0.01]
    elif is_binary:
        train[target] = train[target].map(lambda x: 1 if x<-30 else 0)
        
    display(train.head())
    
    return train, test

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [149]:
from sklearn.linear_model import LogisticRegression

valid_type = ['ods', 'pmo' ,'pm' ,'term'][3]
lgb_path = '../ensemble/pmo_all_stack_level1/*.gz'
lgb_path = '../model/LB3664_set/*_lgb_*.gz'
lgb_path = '../ensemble/dir_stack_blend/tmp/*_lgb_*.gz'
#========================================================================
# Make Dataset 
is_clf_out = [True, False][1]
is_no_out_flg = [True, False][1]
is_rm_out = [True, False][1]
is_binary = [True, False][1]
is_nn = 0
is_rid = 0
is_rmf = 0
is_ext = 0
is_random = 0
seed_size = 1
#========================================================================
    
#========================================================================
# CVの準備
seed = 328
fold_seed = 328
fold_seed = 1208
seed_list = [328, 1208]
fold = 6

if is_rm_out:
    set_type = 'rm_out'
else:
    set_type = 'all'

#========================================================================
# Dataset
submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
result_list = []
score_list = []
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg', 'clf_pred']
#========================================================================
    
#========================================================================
# NN Model Setting 
params = {}
if is_binary:
    params['n_jobs']=-1
    params['C']=1.0
    params['solver'] ='liblinear'
    params['fit_intercept']=True
    params['max_iter']=1000
    params['tol']=0.01
    params['random_state']=seed
    model = LogisticRegression(**params)
else:
    params['solver'] ='auto'
    params['fit_intercept']=True
    params['alpha']=0.4
    params['max_iter']=1000
    params['normalize']=False
    params['tol']=0.01
    params['random_state']=seed
    model = Ridge(**params)

# np.random.seed(int(time.time()))
# seed_list = np.random.randint(10**7, size=seed_size)

# for seed in seed_list:
for fold_seed in seed_list:
    
    if is_rm_out:
        kfold = utils.read_pkl_gzip('../input/kfold_ods_no_out_fold6_seed328.gz')
    elif is_clf_out:
        kfold = utils.read_pkl_gzip('../input/kfold_ods_clf_out_fold6_seed328.gz')
    else:
        kfold = utils.read_pkl_gzip(f'../input/kfold_{valid_type}_all_fold6_seed{fold_seed}.gz')
    
    train, test = get_stack_dataset(lgb_path=lgb_path, is_rmf=is_rmf, is_ext=is_ext, is_random=is_random, seed=seed)
        
    #========================================================================
    # Preset
    use_cols = sorted([col for col in train.columns if col.count('base_')])
    best_score = 100
    best_score_list = []
    test_pred = np.zeros(len(test))
    Y = train[target]
    result_list = []
    #========================================================================

    #========================================================================
    # Train & Prediction Start
    for fold_no, (trn_idx, val_idx) in enumerate(zip(*kfold)):
        if key not in train.columns:
            train = train.reset_index()
            test = test.reset_index() 
             
        #========================================================================
        # Make Dataset
        scaler = StandardScaler()
        scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
        x_test = scaler.transform(test[use_cols])

        X_train, y_train = train.loc[train[key].isin(trn_idx), :][use_cols], Y.loc[train[key].isin(trn_idx)]
        X_val, y_val = train.loc[train[key].isin(val_idx), :][use_cols], Y.loc[train[key].isin(val_idx)]
        
        X_train[:] = scaler.transform(X_train)
        X_val[:] = scaler.transform(X_val)
        X_train = X_train.as_matrix()
        X_val = X_val.as_matrix()
    
        print(f"Train: {X_train.shape} | Test: {x_test.shape}")
        #========================================================================
        
        # Fitting
        model.fit(X_train, y_train)
        
        # Prediction
        if is_binary:
            y_pred = model.predict_proba(X_val)[:, 1]
        elif is_rm_out:
            X_val = train.loc[~train[key].isin(trn_idx), :]
            y_pred = model.predict(X_val[use_cols])
            y_val = X_val[target].values
        else:
            y_pred = model.predict(X_val)
        
        test_pred += model.predict(x_test)
        
        # Stack Prediction
        if is_rm_out:
            if fold_no==0:
                df_pred = train[[key, target]].set_index(key)
            self_valid = X_val[[key, target]].set_index(key)
            self_valid[f'pred_{fold_no}'] = y_pred
            df_pred = df_pred.join(self_valid.drop(target, axis=1))
        else:
            df_pred = train.loc[train[key].isin(val_idx), :][[key, target]].copy()
            df_pred['prediction'] = y_pred
            result_list.append(df_pred)
        
        # Scoring
        err = (y_val - y_pred)
        if is_binary:
            score = np.sqrt(roc_auc_score(y_val, y_pred))
            print(f'AUC: {score} | SUM ERROR: {err.sum()}')
        else:
            score = np.sqrt(mean_squared_error(y_val, y_pred))
            print(f'RMSE: {score} | SUM ERROR: {err.sum()}')
        score_list.append(score)
        #========================================================================
    
    cv_score = np.mean(score_list)
    
    #========================================================================
    # Stacking
    test_pred /= fold_no+1
    test['prediction'] = test_pred
    stack_test = test[[key, 'prediction']]
    
    if is_rm_out:
        pred_col_list = [col for col in df_pred.columns if col.count('pred_')]
        df_pred['prediction'] = df_pred[pred_col_list].mean(axis=1)
        result_list.append(df_pred.reset_index())
    
    result_list.append(stack_test)
    df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target, axis=1)
    if key not in base:
        base.reset_index(inplace=True)
    df_pred = base[[key, target]].merge(df_pred, how='inner', on=key)
    print(f"Stacking Shape: {df_pred.shape}")
    #========================================================================
    
    #========================================================================
    # outlierに対するスコアを出す
    if is_rm_out or is_binary:
        out_score = 0
    else:
        if key not in train.columns:
            train.reset_index(inplace=True)
        out_ids = train.loc[train.target<-30, key].values
        out_val = train.loc[train.target<-30, target].values
        out_pred = df_pred[df_pred[key].isin(out_ids)]['prediction'].values
        out_score = np.sqrt(mean_squared_error(out_val, out_pred))
    #========================================================================
    
    if cv_score<best_score:
        print(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
# OUT SCORE: {out_score}
#========================================================================''')
    
        best_score = cv_score
        best_score_list = use_cols
    
        #========================================================================
        # Save Stack
        utils.to_pkl_gzip(path=f"../stack/{start_time[4:12]}_stack_{model_type}_set-{set_type}_valid-{valid_type}-seed{fold_seed}_lgb{len(lgb_list)}_NN{is_nn}_ridge{is_rid}_ext{is_ext}_rmf{is_rmf}_level1{is_level1}_OUT{str(out_score)[:7]}_CV{cv_score}_LB" , obj=df_pred[[key, 'prediction']])
        #========================================================================
sys.exit()
    
#========================================================================
# Submission
df_pred.set_index(key, inplace=True)
submit[target] = df_pred['prediction']
submit_path = f'../submit/{start_time[4:12]}_submit_{model_type}_set-{set_type}_lgb{len(lgb_list)}_NN{len(nn_list)}_other{len(other_list)}_OUT{str(out_score)[:7]}_CV{cv_score}_LB.csv'
submit.to_csv(submit_path, index=True)
display(submit.head())
#========================================================================

  0%|          | 0/1 [00:00<?, ?it/s]

Preparing dataset...


100%|██████████| 1/1 [00:00<00:00,  4.01it/s]


Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,...,base_lgb_3-6256427264251463,base_lgb_3-6246353407463157,base_lgb_3-625177466404422,base_lgb_3-627555_LB3-675,base_lgb_3-6236254858483243,base_lgb_3-6206463759490277,base_lgb_3-6333204401002663,base_lgb_3-6226545066935465,base_lgb_3-6221003122973614,base_lgb_3-653674740088933
0,C_ID_92a2005557,-0.820283,2017-06-01,2018-02-25 09:31:15,2018-03-01,2017-06-27 14:18:08,2017-06-01,2018-04-29 11:23:05,2018-05-01,2018-03-05 14:04:36,...,-0.405107,-0.348467,-0.373584,-0.349346,-0.344292,-0.346904,-0.331135,-0.340694,-0.362157,-0.181378
1,C_ID_3d0044924f,0.392913,2017-01-01,2018-01-31 22:31:09,2018-02-01,2017-01-06 16:29:42,2017-01-01,2018-03-30 06:48:26,2018-04-01,2018-02-01 17:07:54,...,-0.108786,-0.746501,-0.309417,-0.62714,-0.349898,-0.568272,-0.48417,-0.675536,-0.074936,-0.652553
2,C_ID_d639edf6cd,0.688056,2016-08-01,2018-02-27 19:08:25,2018-03-01,2017-01-11 08:21:22,2017-01-01,2018-04-28 17:43:11,2018-05-01,2018-04-28 17:43:11,...,0.795536,0.677786,0.5719,0.648794,0.803631,0.683088,0.642721,0.759212,0.788112,0.584152
3,C_ID_186d6a6901,0.142495,2017-09-01,2018-02-28 11:44:40,2018-03-01,2017-09-26 16:22:21,2017-09-01,2018-04-18 11:00:11,2018-05-01,2018-03-07 11:55:06,...,0.176037,0.15374,0.150416,0.111918,0.137893,0.173934,0.108993,0.168531,0.153302,0.205327
4,C_ID_cdbd2c0db2,-0.159749,2017-11-01,2018-02-28 20:40:41,2018-03-01,2017-11-12 00:00:00,2017-11-01,2018-04-28 18:50:25,2018-05-01,2018-03-02 11:55:43,...,-0.07684,-0.078756,-0.116971,-0.182233,-0.120741,-0.109142,-0.106126,-0.087869,-0.078456,-0.205778


Train: (168250, 23) | Test: (123623, 23)
RMSE: 3.6062391304330563 | SUM ERROR: 211.90575635372068
Train: (168252, 23) | Test: (123623, 23)
RMSE: 3.604735022876224 | SUM ERROR: -384.773959455969
Train: (168261, 23) | Test: (123623, 23)
RMSE: 3.6170146025631005 | SUM ERROR: -661.8128949605887
Train: (168270, 23) | Test: (123623, 23)
RMSE: 3.6397685031709086 | SUM ERROR: 491.48112999400485
Train: (168274, 23) | Test: (123623, 23)
RMSE: 3.633461690998905 | SUM ERROR: 82.22980478608544
Train: (168278, 23) | Test: (123623, 23)
RMSE: 3.60288596412451 | SUM ERROR: 253.23574947328927
Stacking Shape: (325540, 3)

# CV SCORE AVG: 3.6173508190277843
# OUT SCORE: 29.70522110784238


  0%|          | 0/1 [00:00<?, ?it/s]

Preparing dataset...


100%|██████████| 1/1 [00:00<00:00,  3.95it/s]


Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,...,base_lgb_3-6256427264251463,base_lgb_3-6246353407463157,base_lgb_3-625177466404422,base_lgb_3-627555_LB3-675,base_lgb_3-6236254858483243,base_lgb_3-6206463759490277,base_lgb_3-6333204401002663,base_lgb_3-6226545066935465,base_lgb_3-6221003122973614,base_lgb_3-653674740088933
0,C_ID_92a2005557,-0.820283,2017-06-01,2018-02-25 09:31:15,2018-03-01,2017-06-27 14:18:08,2017-06-01,2018-04-29 11:23:05,2018-05-01,2018-03-05 14:04:36,...,-0.405107,-0.348467,-0.373584,-0.349346,-0.344292,-0.346904,-0.331135,-0.340694,-0.362157,-0.181378
1,C_ID_3d0044924f,0.392913,2017-01-01,2018-01-31 22:31:09,2018-02-01,2017-01-06 16:29:42,2017-01-01,2018-03-30 06:48:26,2018-04-01,2018-02-01 17:07:54,...,-0.108786,-0.746501,-0.309417,-0.62714,-0.349898,-0.568272,-0.48417,-0.675536,-0.074936,-0.652553
2,C_ID_d639edf6cd,0.688056,2016-08-01,2018-02-27 19:08:25,2018-03-01,2017-01-11 08:21:22,2017-01-01,2018-04-28 17:43:11,2018-05-01,2018-04-28 17:43:11,...,0.795536,0.677786,0.5719,0.648794,0.803631,0.683088,0.642721,0.759212,0.788112,0.584152
3,C_ID_186d6a6901,0.142495,2017-09-01,2018-02-28 11:44:40,2018-03-01,2017-09-26 16:22:21,2017-09-01,2018-04-18 11:00:11,2018-05-01,2018-03-07 11:55:06,...,0.176037,0.15374,0.150416,0.111918,0.137893,0.173934,0.108993,0.168531,0.153302,0.205327
4,C_ID_cdbd2c0db2,-0.159749,2017-11-01,2018-02-28 20:40:41,2018-03-01,2017-11-12 00:00:00,2017-11-01,2018-04-28 18:50:25,2018-05-01,2018-03-02 11:55:43,...,-0.07684,-0.078756,-0.116971,-0.182233,-0.120741,-0.109142,-0.106126,-0.087869,-0.078456,-0.205778


Train: (168250, 23) | Test: (123623, 23)
RMSE: 3.6344127390247163 | SUM ERROR: -1.1143795029709622
Train: (168252, 23) | Test: (123623, 23)
RMSE: 3.6090662098190114 | SUM ERROR: -362.1538157195994
Train: (168261, 23) | Test: (123623, 23)
RMSE: 3.626120207522003 | SUM ERROR: 131.83835472313496
Train: (168270, 23) | Test: (123623, 23)
RMSE: 3.6138963885894646 | SUM ERROR: 135.5430129000438
Train: (168274, 23) | Test: (123623, 23)
RMSE: 3.604017192093228 | SUM ERROR: -173.35577404585612
Train: (168278, 23) | Test: (123623, 23)
RMSE: 3.612002447697715 | SUM ERROR: 291.93206873206907
Stacking Shape: (325540, 3)

# CV SCORE AVG: 3.6169683415760705
# OUT SCORE: 29.69770926785604


SystemExit: 