In [22]:
is_stack = [True, False][0]
debug = False
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
import numpy as np
import os
import sys
import time
import datetime
import glob
sys.path.append('../py/')
from s027_kfold_ods import ods_kfold
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, reduce_mem_usage, elo_save_feature, impute_feature
try:
    if not logger:
        logger=logger_func()
except NameError:
    logger=logger_func()

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

#========================================================================
# Keras 
# Corporación Favorita Grocery Sales Forecasting
from sklearn.linear_model import Ridge
#========================================================================


#========================================================================
# Args
out_part = ['', 'part', 'all'][0]
key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg']
stack_name='ridge'
submit = pd.read_csv('../input/sample_submission.csv')
model_type='ridge'
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
seed = 328
#========================================================================

#========================================================================
# Data Load 
def get_stack_dataset(lgb_path='', is_clf_out=False, is_no_out_flg=False, is_rm_out=False, is_binary=False, is_nn=False, is_rmf=False, is_ext=False, is_rid=False, is_random=False, seed=seed):
    print("Preparing dataset...")
    base = utils.read_df_pkl('../input/base_no_out_clf.gz').set_index(key)
    
    #========================================================================
    # Base Model Path
    #========================================================================
    # Clf Out Model
    if is_clf_out: ens_list = glob.glob('../ensemble/clf_min_thres_ensemble/*.gz')
    # No Out Flg Model
    elif is_no_out_flg: ens_list = glob.glob('../no_out_flg_ensemble/*.gz')
    elif is_rm_out: ens_list = glob.glob('../ensemble/rm_outlier_ensemble/*.gz')
    elif is_binary:
        model_type='lgr'
        lgb_list = glob.glob('../stack/*binary*.gz')
        nn_list = []
        ens_list = lgb_list + nn_list
    #========================================================================
    # Base Model
    else:
        if is_random:
            np.random.seed(seed)
            lgb_list = list(np.random.choice(lgb_list, 10))
#             nn_list = list(np.random.choice(nn_list, 1))
        nn_list = []
        rid_list = []
        ext_list = []
        rmf_list = []
        if is_nn : nn_list = glob.glob('../ensemble/NN_ensemble/*CV3*.gz')
        if is_rmf: rmf_list = glob.glob('../ensemble/various_model/*rmf*.gz')
        if is_ext: ext_list = glob.glob('../ensemble/various_model/*ext*.gz')
        if is_rid: rid_list = glob.glob('../ensemble/various_model/*ridge*.gz')
        lgb_list = glob.glob(lgb_path)
        ens_list = lgb_list + nn_list + rid_list + rmf_list + ext_list
    
    #========================================================================
    # Stack Models Load
    from joblib import Parallel, delayed
    def parallel_stack_model(model_path):
        try:
            cv = re.search(r'CV([^/.]*)_LB.gz', model_path).group(1)
        except AttributeError:
            cv = re.search(r'CV([^/.]*).gz', model_path.replace('.', '-')).group(1)
        tmp = utils.read_pkl_gzip(model_path)
        if key not in tmp.columns:
            tmp.reset_index(inplace=True)
        if 'pred_mean' in tmp.columns:
            tmp = tmp[[key, 'pred_mean']]
        else:
            tmp = tmp[[key, 'prediction']]
            
        if model_path.count('lgb'):
            tmp.columns = [key, f"base_lgb_{cv}"]
        elif model_path.count('NN'):
            tmp.columns = [key, f"base_NN_{cv}"]
        elif model_path.count('ridge'):
            tmp.columns = [key, f"base_ridge_{cv}"]
        elif model_path.count('rmf'):
            tmp.columns = [key, f"base_rmf_{cv}"]
        elif model_path.count('ext'):
            tmp.columns = [key, f"base_ext_{cv}"]
        else:
            tmp.columns = [key, f"base_model_{cv}"]
        return tmp.set_index(key)
    #========================================================================
    
    p_list = Parallel(n_jobs=-1)([delayed(parallel_stack_model)(model_path) for model_path in ens_list])
    df_pred = pd.concat(p_list, axis=1)
    if is_rm_out:
        cv15 = [col for col in df_pred.columns if col.count('1-5')]
        cv8 = [col for col in df_pred.columns if col.count('8-')]
        df_pred['tmp_mean'] = df_pred[cv8].mean(axis=1).values
        for col in cv15:
            df_pred.loc[df_pred[col].isnull(), col] = df_pred.loc[df_pred[col].isnull(), 'tmp_mean']
    base = base.join(df_pred)
    
    #========================================================================
    
    if key in base.columns:
        train = base[~base[target].isnull()]
        test = base[base[target].isnull()]
    else:
        train = base[~base[target].isnull()].reset_index()
        test = base[base[target].isnull()].reset_index()
    
    if is_rm_out:
        train = train[~train[target].isnull()]
    elif is_clf_out:
        train = train[train['clf_pred']<0.01]
        test = test[test['clf_pred']<0.01]
    elif is_binary:
        train[target] = train[target].map(lambda x: 1 if x<-30 else 0)
        
    display(train.head())
    
    return train, test

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
from sklearn.linear_model import LogisticRegression

base = utils.read_df_pkl('../input/base_no_out_clf.gz')[[key, target]].set_index(key)
valid_type = ['ods', 'pmo' ,'pm' ,'term'][3]
# lgb_path = '../ensemble/pmo_all_stack_level1/*.gz'
# lgb_path = '../model/LB3664_set/*_lgb_*.gz'
lgb_path = '../ensemble/dir_stack_blend/*_lgb_*.gz'
# lgb_path = '../ensemble/rm_outlier_ensemble/tmp/*_lgb_*.gz'
#========================================================================
# Make Dataset 
is_clf_out = [True, False][1]
is_no_out_flg = [True, False][1]
is_rm_out = [True, False][0]
is_binary = [True, False][1]
is_blend = [True, False][1]
is_nn = 0
is_rid = 0
is_rmf = 0
is_ext = 0
is_random = 0
seed_size = 1
pred_col = 'prediction'
#========================================================================
    
#========================================================================
# CVの準備
seed = 328
fold_seed = 328
fold_seed = 1208
seed_list = [328, 1208]
seed_list = [328]
fold = 6

if is_rm_out:
    set_type = 'rm_out'
else:
    set_type = 'all'

#========================================================================
# Dataset
submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
result_list = []
score_list = []
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term', 'no_out_flg', 'clf_pred']
#========================================================================
    
#========================================================================
# NN Model Setting 
params = {}
if is_binary:
    params['n_jobs']=-1
    params['C']=1.0
    params['solver'] ='liblinear'
    params['fit_intercept']=True
    params['max_iter']=1000
    params['tol']=0.01
    params['random_state']=seed
    model = LogisticRegression(**params)
else:
    params['solver'] ='auto'
    params['fit_intercept']=True
    params['alpha']=0.4
    params['max_iter']=1000
    params['normalize']=False
    params['tol']=0.01
    params['random_state']=seed
    model = Ridge(**params)

# np.random.seed(int(time.time()))
# seed_list = np.random.randint(10**7, size=seed_size)

# for seed in seed_list:
for fold_seed in seed_list:
    
    if is_rm_out:
        kfold = utils.read_pkl_gzip('../input/kfold_ods_no_out_fold6_seed328.gz')
    elif is_clf_out:
        kfold = utils.read_pkl_gzip('../input/kfold_ods_clf_out_fold6_seed328.gz')
    else:
#         kfold = utils.read_pkl_gzip(f'../input/kfold_{valid_type}_all_fold6_seed{fold_seed}.gz')
        kfold = utils.read_pkl_gzip(f'../input/kfold_ods_equal_seed328.gz')
    
    train, test = get_stack_dataset(lgb_path=lgb_path, is_rmf=is_rmf, is_ext=is_ext, is_random=is_random, seed=seed)
    if is_rm_out:
        train = train[train[target]>-30]
        
    #========================================================================
    # Preset
    use_cols = sorted([col for col in train.columns if col.count('base_')])
    lgb_list = [col for col in use_cols if col.count('lgb')]
    nn_list = [col for col in use_cols if col.count('NN')]
    ext_list = [col for col in use_cols if col.count('ext')]
    best_score = 100
    best_score_list = []
    test_pred = np.zeros(len(test))
    Y = train[target]
    result_list = []
    #========================================================================

    #========================================================================
    # Train & Prediction Start
    for fold_no, (trn_idx, val_idx) in enumerate(zip(*kfold)):
        if is_blend:
            break
            
        if key not in train.columns:
            train = train.reset_index()
            test = test.reset_index() 
             
        #========================================================================
        # Make Dataset
        scaler = StandardScaler()
        scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
        x_test = scaler.transform(test[use_cols])

        X_train, y_train = train.loc[train[key].isin(trn_idx), :][use_cols], Y.loc[train[key].isin(trn_idx)]
        X_val, y_val = train.loc[train[key].isin(val_idx), :][use_cols], Y.loc[train[key].isin(val_idx)]
        
        X_train[:] = scaler.transform(X_train)
        X_val[:] = scaler.transform(X_val)
        X_train = X_train.as_matrix()
        X_val = X_val.as_matrix()
    
        print(f"Train: {X_train.shape} | Valid: {X_val.shape} | Test: {x_test.shape}")
        #========================================================================
        
        # Fitting
        model.fit(X_train, y_train)
        
        # Prediction
        if is_binary:
            y_pred = model.predict_proba(X_val)[:, 1]
#         elif is_rm_out:
#             X_val = train.loc[~train[key].isin(trn_idx), :]
#             y_pred = model.predict(X_val[use_cols])
#             y_val = X_val[target].values
        else:
            y_pred = model.predict(X_val)
        
        test_pred += model.predict(x_test)
        
        # Stack Prediction
#         if is_rm_out:
        if False:
            if fold_no==0:
                df_pred = train[[key, target]].set_index(key)
            self_valid = X_val[[key, target]].set_index(key)
            self_valid[f'pred_{fold_no}'] = y_pred
            df_pred = df_pred.join(self_valid.drop(target, axis=1))
        else:
            df_pred = train.loc[train[key].isin(val_idx), :][[key, target]].copy()
            df_pred['prediction'] = y_pred
            result_list.append(df_pred)
        
        # Scoring
        err = (y_val - y_pred)
        if is_binary:
            score = np.sqrt(roc_auc_score(y_val, y_pred))
            print(f'AUC: {score} | SUM ERROR: {err.sum()}')
        else:
            score = np.sqrt(mean_squared_error(y_val, y_pred))
            print(f'RMSE: {score} | SUM ERROR: {err.sum()}')
        score_list.append(score)
        #========================================================================
    
    if not(is_blend):
        cv_score = np.mean(score_list)
        
        #========================================================================
        # Stacking
        test_pred /= fold_no+1
        test['prediction'] = test_pred
        stack_test = test[[key, 'prediction']]
        
#         if is_rm_out:
#             pred_col_list = [col for col in df_pred.columns if col.count('pred_')]
#             df_pred['prediction'] = df_pred[pred_col_list].mean(axis=1)
#             result_list.append(df_pred.reset_index())
        
        result_list.append(stack_test)
        df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target, axis=1)
        if key not in base:
            base.reset_index(inplace=True)
        df_pred = base[[key, target]].merge(df_pred, how='inner', on=key)
    else:
        #========================================================================
        # Blender
        train[pred_col] = train[use_cols].mean(axis=1)
        test[pred_col] = test[use_cols].mean(axis=1)
        y_pred = train[pred_col].values
        y_val = train[target].values
        
        score = np.sqrt(mean_squared_error(y_val, y_pred))
        print(f'RMSE: {score} | SUM ERROR: {err.sum()}')
        #========================================================================
            
        stack_col = [key, target, pred_col]
        df_pred = pd.concat([train[stack_col], test[stack_col]], axis=0, ignore_index=True)
    print(f"Stacking Shape: {df_pred.shape}")
    #========================================================================
    
    #========================================================================
    # outlierに対するスコアを出す
    if is_rm_out or is_binary:
        out_score = 0
    else:
        if key not in train.columns:
            train.reset_index(inplace=True)
        out_ids = train.loc[train.target<-30, key].values
        out_val = train.loc[train.target<-30, target].values
        out_pred = df_pred[df_pred[key].isin(out_ids)]['prediction'].values
        out_score = np.sqrt(mean_squared_error(out_val, out_pred))
    #========================================================================
    
    if cv_score<best_score:
        print(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
# OUT SCORE: {out_score}
#========================================================================''')
    
        best_score = cv_score
        best_score_list = use_cols
    
        #========================================================================
        # Save Stack
        utils.to_pkl_gzip(path=f"../stack/{start_time[4:12]}_stack_{model_type}_set-{set_type}_valid-{valid_type}-seed{fold_seed}_lgb{len(lgb_list)}_NN{is_nn}_ridge{is_rid}_ext{is_ext}_rmf{is_rmf}_OUT{str(out_score)[:7]}_CV{cv_score}_LB" , obj=df_pred[[key, 'prediction']])
        #========================================================================
sys.exit()
    
#========================================================================
# Submission
df_pred.set_index(key, inplace=True)
submit[target] = df_pred['prediction']
submit_path = f'../submit/{start_time[4:12]}_submit_{model_type}_set-{set_type}_lgb{len(lgb_list)}_NN{len(nn_list)}_other{len(other_list)}_OUT{str(out_score)[:7]}_CV{cv_score}_LB.csv'
submit.to_csv(submit_path, index=True)
display(submit.head())
#========================================================================

100%|██████████| 1/1 [00:00<00:00,  4.15it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Preparing dataset...


100%|██████████| 1/1 [00:00<00:00,  4.32it/s]


Unnamed: 0,card_id,target,first_active_month,hist_purchase_date_max,hist_purchase_month_max,hist_purchase_date_min,hist_purchase_month_min,new_purchase_date_max,new_purchase_month_max,new_purchase_date_min,new_purchase_month_min,hist_personal_term,new_personal_term,hist_regist_term,new_regist_term,no_out_flg,clf_pred,base_lgb_3-643008963792204,base_lgb_3-62411432795852,base_lgb_3-6416975493245176,base_lgb_3-633586049275204,base_lgb_3-6335126722555198,base_lgb_3-621674473874384,base_lgb_3-6453887328630556,base_lgb_3-6413337844431317,base_lgb_3-624886069699684,base_lgb_3-6297296108420443,base_lgb_3-6257781753926364,base_lgb_3-640200217753496,base_lgb_3-6221890278032856,base_lgb_3-2381280632566187,base_lgb_3-240003171265297,base_lgb_3-6431931116870206,base_lgb_3-6174270419359935,base_lgb_3-6467847619929894,base_lgb_3-6412766731028197,base_lgb_3-6437365731748232,base_lgb_3-640113854747483,base_lgb_3-6468062275569184,base_lgb_3-6316576670120693,base_lgb_3-645096827588883,base_lgb_3-651455934282057,base_lgb_3-6256427264251463,base_lgb_3-6246353407463157,base_lgb_3-626353491040934,base_lgb_3-631151477336348,base_lgb_3-625177466404422,base_lgb_3-6406494834590073,base_lgb_3-61968992617503,base_lgb_3-626619131871377,base_lgb_3-627555_LB3-675,base_lgb_3-6179960053791236_LB3-667,base_lgb_3-6236254858483243,base_lgb_3-646290595477581,base_lgb_3-6333204401002663,base_lgb_3-6226545066935465,base_lgb_3-638811137215022,base_lgb_3-653674740088933
0,C_ID_92a2005557,-0.820283,2017-06-01,2018-02-25 09:31:15,2018-03-01,2017-06-27 14:18:08,2017-06-01,2018-04-29 11:23:05,2018-05-01,2018-03-05 14:04:36,2018-03-01,9,2.0,9,11.0,1.0,0.000444,-0.533682,-0.38967,-0.310418,-0.292227,-0.321566,-0.349492,-0.204583,-0.293515,-0.346379,-0.346031,-0.367656,-0.316781,-0.360045,-0.403515,-0.344339,-0.262261,-0.342389,-0.36005,-0.284684,-0.325572,-0.151241,-0.229687,-0.481915,-0.18812,-0.257492,-0.405107,-0.348467,-0.270111,-0.293527,-0.373584,-0.324345,-0.337658,-0.367531,-0.349346,-0.360362,-0.344292,-0.375625,-0.331135,-0.340694,-0.332117,-0.181378
1,C_ID_3d0044924f,0.392913,2017-01-01,2018-01-31 22:31:09,2018-02-01,2017-01-06 16:29:42,2017-01-01,2018-03-30 06:48:26,2018-04-01,2018-02-01 17:07:54,2018-02-01,13,2.0,13,15.0,0.0,0.007831,-0.669546,-0.353189,-0.166621,-0.480098,-0.581695,-0.598341,-0.043758,-1.043,-0.624106,-0.55722,-0.728224,-0.308836,-0.314517,-1.136101,-0.955046,-0.586575,-0.924811,-0.492303,-0.553925,-0.516587,-0.371366,-0.58051,-0.070213,-0.336599,-0.052061,-0.108786,-0.746501,-0.567469,-0.322127,-0.309417,-0.70656,-0.471055,-0.547171,-0.62714,-0.560401,-0.349898,-0.909553,-0.48417,-0.675536,-0.643977,-0.652553
2,C_ID_d639edf6cd,0.688056,2016-08-01,2018-02-27 19:08:25,2018-03-01,2017-01-11 08:21:22,2017-01-01,2018-04-28 17:43:11,2018-05-01,2018-04-28 17:43:11,2018-04-01,14,1.0,19,18.0,0.0,0.004074,0.581285,0.749703,0.645878,0.865657,0.849526,0.655104,0.593162,0.564796,0.817565,0.915316,0.666341,0.586407,0.732969,0.744735,0.651148,0.610568,0.697131,0.670619,0.553163,0.692051,0.528412,0.643725,0.7801,0.634536,0.759758,0.795536,0.677786,0.697874,0.790826,0.5719,0.521493,0.722236,0.697877,0.648794,0.822205,0.803631,0.676389,0.642721,0.759212,0.518067,0.584152
3,C_ID_186d6a6901,0.142495,2017-09-01,2018-02-28 11:44:40,2018-03-01,2017-09-26 16:22:21,2017-09-01,2018-04-18 11:00:11,2018-05-01,2018-03-07 11:55:06,2018-03-01,6,2.0,6,8.0,0.0,0.000797,0.118323,0.18212,0.133039,0.068197,0.21469,0.147356,0.108424,0.008181,0.220217,0.122011,0.163632,0.143492,0.172772,0.132069,0.01283,0.040901,0.159377,-0.043269,0.165502,-0.085453,0.162732,0.15753,0.208539,0.147047,0.000617,0.176037,0.15374,0.133529,0.13149,0.150416,-0.005467,0.161512,0.169382,0.111918,0.143494,0.137893,-0.029571,0.108993,0.168531,0.188852,0.205327
4,C_ID_cdbd2c0db2,-0.159749,2017-11-01,2018-02-28 20:40:41,2018-03-01,2017-11-12 00:00:00,2017-11-01,2018-04-28 18:50:25,2018-05-01,2018-03-02 11:55:43,2018-03-01,4,2.0,4,6.0,1.0,0.000251,-0.282362,-0.113891,-0.279065,-0.137075,-0.137133,-0.079374,0.005718,-0.123009,-0.151363,-0.223672,-0.081562,-0.296002,-0.067805,-0.12654,-0.218226,-0.071952,-0.143816,-0.095347,-0.32847,-0.146626,-0.019034,-0.021277,-0.057492,0.014068,-0.180205,-0.07684,-0.078756,-0.186935,-0.224225,-0.116971,-0.112846,-0.120586,-0.133997,-0.182233,-0.142937,-0.120741,-0.136741,-0.106126,-0.087869,-0.297711,-0.205778


Train: (166425, 41) | Valid: (33285, 41) | Test: (123623, 41)
RMSE: 1.6457088435586191 | SUM ERROR: 10.424276553376927
Train: (166425, 41) | Valid: (33285, 41) | Test: (123623, 41)
RMSE: 1.6660232480884147 | SUM ERROR: 110.94950182931024
Train: (166425, 41) | Valid: (33285, 41) | Test: (123623, 41)
RMSE: 1.653513025194159 | SUM ERROR: -7.432921034676031
Train: (166425, 41) | Valid: (33285, 41) | Test: (123623, 41)
RMSE: 1.6510676947574945 | SUM ERROR: 47.6719593658283
Train: (166425, 41) | Valid: (33285, 41) | Test: (123623, 41)
RMSE: 1.6568440232613375 | SUM ERROR: -130.75758703747428
Train: (166425, 41) | Valid: (33285, 41) | Test: (123623, 41)
RMSE: 1.6578167698062196 | SUM ERROR: 15.132700679782147
Stacking Shape: (323333, 3)

# CV SCORE AVG: 1.655162267444374
# OUT SCORE: 0


SystemExit: 

In [38]:
train[use_cols]

Unnamed: 0,base_lgb_1-5486783250130114,base_lgb_1-5520199746035688,base_lgb_1-5520923703589151,base_lgb_1-553086826800966,base_lgb_1-5540396414459288
0,-0.318620,-0.305220,-0.311772,-0.341538,-0.265328
1,0.165438,0.446242,0.303839,0.050486,0.376686
2,0.995924,0.751869,0.714139,0.632040,0.716802
3,0.220188,0.130344,0.224406,0.254490,0.245714
4,-0.237574,-0.175082,-0.271558,-0.256391,-0.124312
5,-0.696671,-0.551457,-0.531222,-0.477508,-0.603180
6,-0.256917,-0.175884,-0.147559,-0.206967,-0.157910
7,-0.032199,0.126963,0.065085,0.075743,0.035525
8,0.312173,0.419844,0.612841,0.307862,0.245677
9,0.293921,0.266256,0.291410,0.259091,0.274604
