In [1]:
import datetime
import gc
import glob
import os
import re
import sys
import time

import numpy as np
import pandas as pd

# Quick-run switch: when True, the data-load step keeps only the head of train/test.
debug = False

# The shared helper package is resolved relative to the user's home directory,
# so that directory must be on sys.path before `utils` can be imported.
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import (
    logger_func,
    get_categorical_features,
    get_numeric_features,
    reduce_mem_usage,
    elo_save_feature,
    impute_feature,
)

# Keep a logger that already lives in the kernel; create one only when the name
# is missing (NameError) or currently holds a falsy value.
try:
    logger = logger or logger_func()
except NameError:
    logger = logger_func()

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

#========================================================================
# Model: Ridge regression (scikit-learn)
from sklearn.linear_model import Ridge
#========================================================================


#========================================================================
# Args
# Identifier and label columns.
key = 'card_id'
target = 'target'

# Columns that must never be treated as model features.
ignore_list = [
    key,
    target,
    'merchant_id',
    'first_active_month',
    'index',
    'personal_term',
    'no_out_flg',
]

# Run configuration.
out_part = ('', 'part', 'all')[0]
stack_name = 'ridge'
model_type = 'ridge'
seed = 328

# Sample submission, loaded relative to the notebook directory.
submit = pd.read_csv('../input/sample_submission.csv')

# Timestamp of this run, formatted as YYYYmmdd_HHMMSS.
start_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
#========================================================================


#========================================================================
# Data Load
print("Preparing dataset...")
# Candidate feature sets; exactly one `win_path` is active at a time.
# win_path = '../features/4_winner/*.gz'
# Ensemble 1
win_path = '../model/LB3670_70leaves_colsam0322/*.gz'
# Ensemble 2
# win_path = '../model/E2_lift_set/*.gz'
# Ensemble 3
# win_path = '../model/E3_PCA_set/*.gz'

# glob returns matches in arbitrary order; sort so the list handed to the loader
# is the same on every run.
# NOTE(review): whether utils.parallel_load_data preserves this order is not
# visible here - confirm.
win_path_list = sorted(glob.glob(win_path))
if not win_path_list:
    # Fail here with a clear message instead of an opaque pd.concat error below.
    raise FileNotFoundError(f"No feature files match: {win_path}")

base = utils.read_df_pkl('../input/base_term*0*')[[key, target, 'first_active_month']]
# Rows with a target are train; rows without one are test.
base_train = base[base[target].notnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)

feature_list = utils.parallel_load_data(path_list=win_path_list)
df = pd.concat(feature_list, axis=1)

# The feature matrix is split by position, so its row count must match `base`;
# otherwise the column-wise concat below would silently misalign rows.
# NOTE(review): this also assumes the feature files are ordered train rows first,
# then test rows, matching base_train/base_test - confirm against the feature writer.
if len(df) != len(base):
    raise ValueError(f"Feature rows ({len(df)}) do not match base rows ({len(base)})")

n_train = len(base_train)
# Both slices are re-indexed from 0 so the column-wise concat aligns by position.
train = pd.concat([base_train, df.iloc[:n_train, :].reset_index(drop=True)], axis=1)
test = pd.concat([base_test, df.iloc[n_train:, :].reset_index(drop=True)], axis=1)

if debug:
    # Small subset for a quick smoke run.
    train = train.head(10000)
    test = test.head(2000)
#========================================================================

#========================================================================
# Pre-processing ahead of standardisation (null filling, inf / -inf handling).
# NOTE(review): the exact imputation rule lives in utils.impute_feature - confirm there.
for col in train.columns:
    if col not in ignore_list:
        # Each frame is imputed from its own values for this column.
        train[col] = impute_feature(train, col)
        test[col] = impute_feature(test, col)
#========================================================================

#========================================================================
# Cleansing Check
def clean_check(df, col):
    """Print a report of nulls and infinities remaining in ``df[col]``.

    Nothing is returned; findings are written to stdout:

    * ``Null is N`` when the column holds N null values.
    * ``1 <col> <max> <min>`` when the pandas max/min reductions hit +inf / -inf.
    * ``2 <col> <max> <min>`` when the same is found by sorting the non-null
      values and looking at both ends (an independent cross-check).

    Args:
        df: DataFrame to inspect.
        col: Name of the column to check.
    """
    series = df[col]
    non_null = series.dropna()

    null_count = len(df) - non_null.shape[0]
    if null_count > 0:
        print(f"Null is {null_count}")

    # Check 1: reductions (pandas skips NaN by default).
    inf_max = series.max()
    inf_min = series.min()
    if inf_max == np.inf or inf_min == -np.inf:
        print(1, col, inf_max, inf_min)

    # Check 2: sort once and inspect the extremes. Nulls are dropped first because
    # sort_values() puts NaN last, which would otherwise hide a trailing +inf.
    ordered = non_null.sort_values().values
    if ordered.size == 0:
        # Empty or all-null column: there are no extremes to inspect.
        return
    inf_max_2 = ordered[-1]
    inf_min_2 = ordered[0]
    if inf_max_2 == np.inf or inf_min_2 == -np.inf:
        print(2, col, inf_max_2, inf_min_2)
#========================================================================

2019-02-11 08:36:54,142 utils 400 [INFO]    [logger_func] start 
100%|██████████| 3/3 [00:00<00:00, 45.95it/s]

Preparing dataset...





In [5]:
#========================================================================
# ods.ai 3rd kernel
# https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/78903
# KFold: n_splits=6(or 7)!, shuffle=False!
# The commented-out recipe below is kept for provenance; it appears to be how
# the fold file loaded underneath was produced (NOTE(review): confirm).
# train['rounded_target'] = train['target'].round(0)
# train = train.sort_values('rounded_target').reset_index(drop=True)
# vc = train['rounded_target'].value_counts()
# vc = dict(sorted(vc.items()))
# df = pd.DataFrame()
# train['indexcol'],idx = 0,1
# for k,v in vc.items():
#     step = train.shape[0]/v
#     indent = train.shape[0]/(v+1)
#     df2 = train[train['rounded_target'] == k].sample(v, random_state=seed).reset_index(drop=True)
#     for j in range(0, v):
#         df2.at[j, 'indexcol'] = indent + j*step + 0.000001*idx
#     df = pd.concat([df2,df])
#     idx+=1
# train = df.sort_values('indexcol', ascending=True).reset_index(drop=True)
# del train['indexcol'], train['rounded_target']
# fold_type = 'self'
# fold = 6
# folds = KFold(n_splits=fold, shuffle=False, random_state=seed)
# kfold = list(folds.split(train, train[target].values))
# utils.to_pkl_gzip(obj=kfold, path='../input/ods_kfold')
kfold = utils.read_pkl_gzip('../input/ods_kfold.gz')
# Number of folds, taken from the loaded splits so the test-prediction average
# below does not depend on a variable defined in another cell.
fold = len(kfold)
if debug:
    # In debug mode `train` is truncated, so the saved positional indices would
    # run past its end; rebuild unshuffled folds for the shortened frame.
    kfold = list(KFold(n_splits=fold, shuffle=False).split(train))
# =======================================================================

#========================================================================
# CV preparation
model_list = []
result_list = []
score_list = []
val_pred_list = []
# Accumulator for the per-fold test predictions (averaged after the loop).
test_pred = np.zeros(len(test))

use_cols = [col for col in train.columns if col not in ignore_list]
# Standardise with statistics computed over train and test together.
scaler = StandardScaler()
scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
x_test = scaler.transform(test[use_cols])

Y = train[target]
y_mean = Y.mean()
#========================================================================

print(f"Train: {train.shape} | Test: {test.shape}")

#========================================================================
# Ridge model settings (also recorded in the saved stack file name)
fit_intercept = True
alpha = 0.4
max_iter = 1000
# Kept for reference only: Ridge's `normalize` argument was removed in
# scikit-learn 1.2 and its former default was False, so it is not passed on.
normalize = False
tol = 0.01
#========================================================================

#========================================================================
# Train & Prediction Start

for fold_no, (trn_idx, val_idx) in enumerate(kfold):

    #========================================================================
    # Make Dataset
    # scaler.transform returns new NumPy arrays, so `train` itself is never
    # written to and no DataFrame -> array conversion is needed afterwards.
    X_train = scaler.transform(train.iloc[trn_idx, :][use_cols])
    X_val = scaler.transform(train.iloc[val_idx, :][use_cols])
    y_train, y_val = Y.iloc[trn_idx], Y.iloc[val_idx]
    #========================================================================

    # Fitting: a fresh model per fold, built from the settings above so the
    # hyper-parameters written into the file name are the ones actually used.
    # (This loop previously hardcoded max_iter=200; per the scikit-learn docs
    # max_iter only applies to the iterative solvers.)
    ridge = Ridge(solver='auto', fit_intercept=fit_intercept, alpha=alpha, max_iter=max_iter, tol=tol)
    ridge.fit(X_train, y_train)

    # Prediction
    y_pred = ridge.predict(X_val)
    # Sum the test predictions over folds; they are divided by `fold` after the loop.
    test_pred += ridge.predict(x_test)

    # Stack Prediction: out-of-fold predictions keyed by card id
    df_pred = train.iloc[val_idx, :][[key, target]].copy()
    df_pred['prediction'] = y_pred
    result_list.append(df_pred)

    # Scoring
    err = (y_val - y_pred)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'RMSE: {score} | SUM ERROR: {err.sum()}')
    score_list.append(score)
    #========================================================================

cv_score = np.mean(score_list)
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

#========================================================================
# Stacking
# Average of the per-fold test predictions.
test_pred /= fold
test['prediction'] = test_pred
stack_test = test[[key, 'prediction']]
result_list.append(stack_test)
# Out-of-fold rows and test rows are stacked into one frame; `target` is dropped
# here and comes back from `base` in the merge.
df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target, axis=1)
df_pred = base.merge(df_pred, how='inner', on=key)
print(f"Stacking Shape: {df_pred.shape}")

# File-name tags that may be provided by cells not visible in this notebook
# export; fall back to neutral values so the save does not raise NameError.
# NOTE(review): confirm where seed_list / out_score are meant to be defined.
try:
    seed_count = len(seed_list)
except NameError:
    seed_count = 1  # this cell trains one model per fold with a single configuration
try:
    out_tag = str(out_score)[:7]
except NameError:
    out_tag = 'NA'

# Save Stack
utils.to_pkl_gzip(path=f"../stack/{start_time[4:12]}_stack_{model_type}_alpha{alpha}_{len(use_cols)}feats_{seed_count}seed_tol{tol}_iter{max_iter}_OUT{out_tag}_CV{str(cv_score).replace('.', '-')}_LB"
                          , obj=df_pred)
#========================================================================

Train: (201917, 238) | Test: (123623, 238)
[ 0.99697211  0.78249097  0.14174135 -1.34486152  0.8852682 ]
[-1.02641299 -0.04536111 -1.84705051  0.10180674 -1.86672799]
RMSE: 4.011442136729887 | SUM ERROR: 100.5750275942226
[-0.03257652 -1.17443632 -1.06043177 -0.53119587  0.03396996]
[-1.05694313 -0.18659525 -2.02246871  0.12677646 -1.9036637 ]
RMSE: 3.761707809324424 | SUM ERROR: 169.26633058000203
[ 0.09131669  0.75997011 -3.17174433 -0.37193898  0.0429754 ]
[-1.12916361 -0.09940926 -1.8020578   0.15182906 -1.8671233 ]
RMSE: 3.745846640199936 | SUM ERROR: 33.42384167673712
[-0.6414107  -1.11844584  0.60120408 -2.70641091 -0.84972024]
[-0.88476002 -0.10663662 -1.78115175  0.09870754 -1.94257072]
RMSE: 3.751276391400474 | SUM ERROR: 128.90535382813457
[0.19849595 0.19404899 0.53667933 0.02031036 0.73416636]
[-0.9767019  -0.22361714 -1.94315387  0.11037092 -2.00735908]
RMSE: 3.746859865379433 | SUM ERROR: -393.1466454453067


2019-02-11 08:39:41,935 utils 104 [INFO]    [<module>] 
# CV SCORE AVG: 3.796164433554916


[ 0.16192962 -0.28737432  0.04328056 -0.09242657 -0.21318359]
[-1.07598326 -0.13462438 -1.67160833  0.21337728 -1.85160505]
RMSE: 3.7598537582953435 | SUM ERROR: 175.00139290675273
Stacking Shape: (325540, 4)


SystemExit: 