In [1]:
import pandas as pd
import numpy as np
from IPython.core.display import display

In [2]:
# Load the out-of-fold prediction of each Level 1 model

nn_3_layer_v1_stacked = pd.read_csv('../3_layer_v1/result/stacked_preds.csv')
nn_4_layer_v1_stacked = pd.read_csv('../4_layer_v1/result/stacked_preds.csv')
xgb_v2_stacked = pd.read_csv('../xgb/xgb_v2/result/stacked_preds.csv')
xgb_v3_stacked = pd.read_csv('../xgb/xgb_v3/result/stacked_preds.csv')

In [3]:
# Create features for a Level 2 model

df_stacked = pd.DataFrame()
df_stacked['3_layer_v1'] = nn_3_layer_v1_stacked['3_layer_v1']
df_stacked['4_layer_v1'] = nn_4_layer_v1_stacked['4_layer_1']
df_stacked['xgb_v2'] = xgb_v2_stacked['xgb_param#1_best_num_round']
df_stacked['xgb_v3'] = xgb_v3_stacked['xgb_v3']
display(df_stacked.head())

Unnamed: 0,3_layer_v1,4_layer_v1,xgb_v2,xgb_v3
0,1880.244751,1777.215576,2020.95752,1935.697754
1,1478.206909,1415.820068,1762.734131,1759.89563
2,4049.240479,3789.947998,4593.607422,4663.526855
3,961.461609,973.07666,1008.679077,990.860962
4,3311.209717,3240.973145,3080.286133,3099.009277


In [5]:
# Check the cross validation score of some Level 1 models

print(pd.read_csv('../4_layer_v1/result/cross_validation.csv')['val_loss'].mean())
print(pd.read_csv('../xgb/xgb_v3/result/cross_validation.csv')['val_mae'].mean())

1161.829308437405
1133.5405813130944


In [6]:
# Prepare the target value loss for a Level 2 model

df_train = pd.read_csv('../data/train.csv')
y = df_train['loss']
print(y.shape)

(188318,)


## Level 2 model: Linear Regression

In [7]:
# L2 model: Linear Regression
# 5-Fold cross validation 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# transformation for linear regression
shift = 200
X = df_stacked.values
X_normed = np.log(X + shift)
y_normed = np.log(y + shift)

df_cross_validation = pd.DataFrame()

kf = KFold(n_splits=5, shuffle=True, random_state=0)

for i, (train_index, test_index) in enumerate(kf.split(X_normed), start=1):
    X_train, X_val = X_normed[train_index], X_normed[test_index]
    y_train, y_val = y_normed[train_index], y_normed[test_index]
    
    linear = LinearRegression()
    linear.fit(X_train, y_train)
    y_pred = linear.predict(X_val)
    df_cross_validation.loc[i, 'val_mae'] = mean_absolute_error(np.exp(y_val), np.exp(y_pred))

display(df_cross_validation.T)
print('mean MAE on CV = ', df_cross_validation['val_mae'].mean())

Unnamed: 0,1,2,3,4,5
val_mae,1135.742358,1126.563151,1141.394212,1122.388722,1125.811134


mean MAE on CV =  1130.3799154373862


## Level 2 model: Linear Regression Predicts for Test data

In [9]:
# Load the temporal preds of each level 1 model

nn_3_layer_v1_tmp = pd.read_csv('../3_layer_v1/result/temporal_preds.csv')
nn_4_layer_v1_tmp = pd.read_csv('../4_layer_v1/result/temporal_preds.csv')
xgb_v2_tmp = pd.read_csv('../xgb/xgb_v2/result/temporal_preds.csv')
xgb_v3_tmp = pd.read_csv('../xgb/xgb_v3/result/temporal_preds.csv')

In [17]:
# Average the temporal preds

folds = nn_3_layer_v1_tmp.columns[1:].values

X_l1_test = pd.DataFrame()
X_l1_test['3_layer_v1'] = nn_3_layer_v1_tmp[folds].mean(axis=1)
X_l1_test['4_layer_v1'] = nn_4_layer_v1_tmp[folds].mean(axis=1)
X_l1_test['xgb_v2'] = xgb_v2_tmp[folds].mean(axis=1)
X_l1_test['xgb_v3'] = xgb_v3_tmp[folds].mean(axis=1)

display(X_l1_test.head())

Unnamed: 0,3_layer_v1,4_layer_v1,xgb_v2,xgb_v3
0,1322.473901,1403.631567,1486.560693,1463.680933
1,1689.630396,1693.291626,1989.209131,2014.644678
2,9212.710938,9132.435352,9519.208398,9523.435352
3,7582.271191,7518.699609,5942.946973,5828.549316
4,869.603357,851.640259,818.550183,822.485779


In [18]:
# Train the level 2 model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# transformation for linear regression
shift = 200
X = df_stacked.values
X_normed = np.log(X + shift)
y_normed = np.log(y + shift)
X_train = X_normed
y_train = y_normed

l2_linear = LinearRegression()
l2_linear.fit(X_train, y_train)

print('Coef: ', l2_linear.coef_)
print('Intercept = ', l2_linear.intercept_)

Coef:  [ 0.06441971  0.05215514  0.75735988  0.1179668 ]
Intercept =  0.0650383773344


In [None]:
# Level 2 model prediction
y_pred = l2_linear.predict(X_l1_test.values)
print(y_pred.shape)

In [54]:
# CSV export
display(y_pred[:5])
df_sample_submission = pd.read_csv('../data/sample_submission.csv')
df_sample_submission['loss'] = y_pred
df_sample_submission.to_csv('./result/submission.csv', index=False)

array([ 1456.99221562,  1941.43278329,  9402.76442125,  6069.17653924,
         817.46513978])