Thanks @DSEverything for https://www.kaggle.com/dongxu027/explore-stacking-lb-0-1463

In [1]:
import os
import numpy as np 
import pandas as pd 
from subprocess import check_output
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

# Data Load

In [2]:
sub_path = "data/subs"
all_files = os.listdir(sub_path)
all_files

['submission-l2-R.csv',
 'submission_cnn.csv',
 'submission_per_type.csv',
 'submission_per_type_inflated.csv',
 'submission_type_important_features.csv',
 'submission_type_l1_different_lambda.csv',
 'submission_type_l1_without_fc.csv',
 'submission_type_l2.csv',
 'submission_type_testing.csv',
 'weighted-avg-blend-1.csv',
 'weighted-avg-blend-2.csv',
 'weighted-avg-blend-3.csv']

In [3]:
list_to_remove = ['submission_type_testing.csv', 'submission_type_l1_without_fc.csv']
selected_files = all_files[:9]
selected_files = list(set(selected_files).difference(set(list_to_remove)))
selected_files

['submission-l2-R.csv',
 'submission_per_type_inflated.csv',
 'submission_type_l1_different_lambda.csv',
 'submission_type_l2.csv',
 'submission_per_type.csv',
 'submission_type_important_features.csv',
 'submission_cnn.csv']

In [9]:
# Read and concatenate submissions
outs = [pd.read_csv(os.path.join(sub_path, f), index_col=0) for f in selected_files]
concat_sub = pd.concat(outs, axis=1)
cols = list(map(lambda x: "champ" + str(x), range(len(concat_sub.columns))))
concat_sub.columns = cols
concat_sub.reset_index(inplace=True)
ncol = concat_sub.shape[1]
concat_sub.head()

Unnamed: 0,id,champ0,champ1,champ2,champ3,champ4,champ5,champ6
0,4658147,9.946665,13.409721,16.566085,16.857134,9.373112,17.300541,9.397905
1,4658148,147.101689,195.441915,197.012499,193.993892,155.638529,194.840166,175.418076
2,4658149,2.331709,5.799049,5.221424,5.56165,5.528845,5.606836,5.084437
3,4658150,150.967498,192.220371,190.912648,187.995414,156.410454,189.396811,176.916229
4,4658151,12.678644,6.345233,16.827829,15.808855,8.042887,17.137294,9.667832


In [10]:
# check correlation
concat_sub.iloc[:,1:ncol].corr()

Unnamed: 0,champ0,champ1,champ2,champ3,champ4,champ5,champ6
champ0,1.0,0.999733,0.999839,0.999846,0.999624,0.999843,0.999525
champ1,0.999733,1.0,0.999748,0.999753,0.999706,0.99975,0.999523
champ2,0.999839,0.999748,1.0,0.999974,0.999563,0.999973,0.999561
champ3,0.999846,0.999753,0.999974,1.0,0.99957,0.999987,0.999566
champ4,0.999624,0.999706,0.999563,0.99957,1.0,0.999567,0.999419
champ5,0.999843,0.99975,0.999973,0.999987,0.999567,1.0,0.999565
champ6,0.999525,0.999523,0.999561,0.999566,0.999419,0.999565,1.0


In [11]:
# get the data fields ready for stacking
concat_sub['champ_max'] = concat_sub.iloc[:, 1:ncol].max(axis=1)
concat_sub['champ_min'] = concat_sub.iloc[:, 1:ncol].min(axis=1)
concat_sub['champ_mean'] = concat_sub.iloc[:, 1:ncol].mean(axis=1)
concat_sub['champ_median'] = concat_sub.iloc[:, 1:ncol].median(axis=1)

In [12]:
concat_sub.iloc[:, 1:ncol].describe()

Unnamed: 0,champ0,champ1,champ2,champ3,champ4,champ5,champ6
count,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0
mean,15.88675,15.88474,15.8854,15.88493,15.88759,15.8851,15.87349
std,34.85679,34.85558,34.86563,34.86503,34.84533,34.86513,34.82888
min,-32.24456,-31.93704,-31.5725,-31.55207,-30.79447,-31.55304,-32.30405
25%,-0.2086306,-0.1986067,-0.2285737,-0.2283203,-0.1752141,-0.2290635,-0.2472747
50%,2.305326,2.271676,2.277338,2.277121,2.295203,2.276754,2.277905
75%,7.255698,7.258927,7.343182,7.340137,7.161042,7.343286,7.360947
max,203.2142,203.1041,203.5224,203.5096,204.1285,203.2151,203.2168


In [13]:
cutoff_lo = -35
cutoff_hi = 207

# Mean Stacking

In [15]:
concat_sub['scalar_coupling_constant'] = concat_sub['champ_mean']
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_mean.csv', index=False)

**LB----**

# Median Stacking

In [17]:
concat_sub['scalar_coupling_constant'] = concat_sub['champ_median']
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_median.csv', index=False)

**LB -1.609**

# PushOut + Median Stacking 

Pushout strategy is a bit agressive given what it does...

In [11]:
concat_sub['scalar_coupling_constant'] = np.where(np.all(concat_sub.iloc[:,1:ncol] > cutoff_lo, axis=1), 1, 
                                    np.where(np.all(concat_sub.iloc[:,1:ncol] < cutoff_hi, axis=1),
                                             0, concat_sub['champ_median']))
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_pushout_median.csv', index=False)

> **LB -----**

# MinMax + Mean Stacking

MinMax seems more gentle and it outperforms the previous one given its peformance score.

In [12]:
concat_sub['scalar_coupling_constant'] = np.where(np.all(concat_sub.iloc[:,1:ncol] > cutoff_lo, axis=1), 
                                    concat_sub['champ_max'], 
                                    np.where(np.all(concat_sub.iloc[:,1:ncol] < cutoff_hi, axis=1),
                                             concat_sub['champ_min'], 
                                             concat_sub['champ_mean']))
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_mean.csv', index=False)

> **LB ----**



# MinMax + Median Stacking 

In [14]:
concat_sub['scalar_coupling_constant'] = np.where(np.all(concat_sub.iloc[:,1:ncol] > cutoff_lo, axis=1), 
                                    concat_sub['champ_max'], 
                                    np.where(np.all(concat_sub.iloc[:,1:ncol] < cutoff_hi, axis=1),
                                             concat_sub['champ_min'], 
                                             concat_sub['champ_median']))
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_median.csv', index=False)

**LB ----** -

# MinMax + BestBase Stacking

In [14]:
# load the model with best base performance
sub_base = pd.read_csv('data/subs/submission_type_important_features.csv')

In [17]:
concat_sub['champ_base'] = sub_base['scalar_coupling_constant']
concat_sub['id'] = sub_base['id']
concat_sub['scalar_coupling_constant'] = np.where(np.all(concat_sub.iloc[:,1:ncol] > cutoff_lo, axis=1), 
                                    concat_sub['champ_max'], 
                                    np.where(np.all(concat_sub.iloc[:,1:ncol] < cutoff_hi, axis=1),
                                             concat_sub['champ_min'], 
                                             concat_sub['champ_base']))
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_bestbase.csv', index=False)

> **LB----** -