In [4]:
import os
import numpy as np 
import pandas as pd 
from subprocess import check_output
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

# Data Load

In [5]:
# Folder that holds the candidate submission CSVs.
sub_path = "data/subs"
# List the folder contents so the file names to blend can be picked in the next cell.
all_files = os.listdir(sub_path)
all_files

['blends',
 'different-models-for-types3.csv',
 'mpnn-ensemble-sagpool.csv',
 'sagpooling_larger_mpnn.csv',
 'semgcn_1c_w12_24k.csv',
 'submission_per_type_inflated.csv',
 'submission_type_important_features.csv',
 'submit-00325000_model-larger.csv',
 'submit-ensemble-JHC-all_types3.csv',
 'sub_qm9_babel_ascf_11fold_extreme_AND_stack_median_AVG.csv']

In [6]:
# Hand-picked submissions to blend (file names relative to `sub_path`).
# The order of this list is the order in which the files are read and
# concatenated by the next cell.
selected_files = [
    'submit-ensemble-JHC-all_types3.csv',
    'sub_qm9_babel_ascf_11fold_extreme_AND_stack_median_AVG.csv',
    'semgcn_1c_w12_24k.csv',
    'sagpooling_larger_mpnn.csv',
]

In [7]:
# Read and concatenate submissions
# Every CSV is read with its first column as the index, so pd.concat(axis=1)
# lines the files up on that index rather than on row position.
outs = []
for fname in selected_files:
    outs.append(pd.read_csv(os.path.join(sub_path, fname), index_col=0))
concat_sub = pd.concat(outs, axis=1)

# Rename the concatenated columns champ0, champ1, ... in column order.
cols = ["champ" + str(i) for i in range(concat_sub.shape[1])]
concat_sub.columns = cols

# Move the shared index back into an ordinary leading column.
concat_sub = concat_sub.reset_index()

# Column count at this point; later cells slice the per-model predictions
# as iloc[:, 1:ncol].
ncol = concat_sub.shape[1]
concat_sub.head()

  mask |= (ar1 == a)


Unnamed: 0,id,champ0,champ1,champ2,champ3
0,4658147,8.040669,15.893167,12.576186,9.848858
1,4658148,127.524742,194.245589,155.951752,178.970093
2,4658149,2.34201,7.17319,3.237241,2.411096
3,4658150,127.524742,189.5537,155.951767,178.970093
4,4658151,8.040669,15.460785,12.576186,9.848858


In [8]:
# check correlation
# Pairwise correlation between the per-model prediction columns
# (the slice skips column 0 and stops at ncol).
concat_sub.iloc[:,1:ncol].corr()

Unnamed: 0,champ0,champ1,champ2,champ3
champ0,1.0,0.999901,0.999926,0.999937
champ1,0.999901,1.0,0.999913,0.999889
champ2,0.999926,0.999913,1.0,0.999916
champ3,0.999937,0.999889,0.999916,1.0


In [9]:
# get the data fields ready for stacking
# Row-wise summary statistics across the per-model prediction columns,
# stored as champ_max / champ_min / champ_mean / champ_median.
model_preds = concat_sub.iloc[:, 1:ncol]
for stat in ('max', 'min', 'mean', 'median'):
    concat_sub['champ_' + stat] = getattr(model_preds, stat)(axis=1)

In [10]:
# Summary statistics of the per-model prediction columns only; the slice
# stops at ncol, so the champ_* aggregate columns added afterwards are excluded.
concat_sub.iloc[:, 1:ncol].describe()

Unnamed: 0,champ0,champ1,champ2,champ3
count,2505542.0,2505542.0,2505542.0,2505542.0
mean,15.87813,15.88382,15.89216,15.86508
std,34.85898,34.86016,34.8696,34.80879
min,-41.77213,-32.08316,-32.78223,-36.60491
25%,-0.2393832,-0.2148092,-0.2406806,-0.2448642
50%,2.289008,2.277482,2.277586,2.284614
75%,7.327051,7.316506,7.345392,7.351202
max,204.3061,203.2502,207.1149,203.5825


In [11]:
# Lower / upper thresholds consumed by the PushOut and MinMax blending cells.
cutoff_lo, cutoff_hi = -35, 207

# Mean Stacking

In [15]:
# Blend = row-wise mean of the model predictions.
concat_sub['scalar_coupling_constant'] = concat_sub['champ_mean']
# Write only the id and the blended prediction, without the DataFrame index.
concat_sub.to_csv(
    'data/subs/blends/stack_mean.csv',
    columns=['id', 'scalar_coupling_constant'],
    index=False,
)

**LB----**

# Median Stacking

In [12]:
# Blend = row-wise median of the model predictions.
concat_sub['scalar_coupling_constant'] = concat_sub['champ_median']
# Write only the id and the blended prediction, without the DataFrame index.
concat_sub.to_csv(
    'data/subs/blends/stack_median_lgbm_mpnn_semgcn.csv',
    columns=['id', 'scalar_coupling_constant'],
    index=False,
)

**LB -1.901**

# PushOut + Median Stacking 

The PushOut strategy is rather aggressive: whenever every model clears a cutoff, it discards the blended value and writes a hard 1 or 0 instead.

In [11]:
# PushOut blend: rows where every model prediction exceeds cutoff_lo are set to
# the literal 1; otherwise, rows where every prediction is below cutoff_hi are
# set to the literal 0; any remaining rows keep the row-wise median.
# NOTE(review): the hard 1/0 outputs and the "> cutoff_lo" / "< cutoff_hi"
# pairing look carried over from a probability-blending recipe. With
# cutoff_lo = -35 and cutoff_hi = 207 the first test presumably passes for
# almost every row, which would turn nearly the whole file into 1 — confirm the
# intended behaviour for this regression target before submitting.
concat_sub['scalar_coupling_constant'] = np.where(np.all(concat_sub.iloc[:,1:ncol] > cutoff_lo, axis=1), 1, 
                                    np.where(np.all(concat_sub.iloc[:,1:ncol] < cutoff_hi, axis=1),
                                             0, concat_sub['champ_median']))
# Write only the id and the blended prediction, without the DataFrame index.
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_pushout_median.csv', index=False)

> **LB -----**

# MinMax + Mean Stacking

MinMax seems gentler, and it outperforms the previous strategy according to its performance score.

In [12]:
# MinMax + mean blend:
#   * every model above cutoff_hi -> keep the largest prediction (champ_max)
#   * every model below cutoff_lo -> keep the smallest prediction (champ_min)
#   * otherwise                   -> fall back to the row-wise mean
# Each extreme is tested against its own cutoff. Testing "> cutoff_lo" and
# "< cutoff_hi" instead would be true for almost every row with the cutoffs
# used here, collapsing the blend to champ_max and leaving the mean fallback
# effectively unreachable.
model_preds = concat_sub.iloc[:, 1:ncol]
all_above_hi = (model_preds > cutoff_hi).all(axis=1)
all_below_lo = (model_preds < cutoff_lo).all(axis=1)
concat_sub['scalar_coupling_constant'] = np.where(
    all_above_hi,
    concat_sub['champ_max'],
    np.where(all_below_lo, concat_sub['champ_min'], concat_sub['champ_mean']),
)
# Write only the id and the blended prediction, without the DataFrame index.
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_mean.csv', index=False)

> **LB ----**



# MinMax + Median Stacking 

In [14]:
# MinMax + median blend:
#   * every model above cutoff_hi -> keep the largest prediction (champ_max)
#   * every model below cutoff_lo -> keep the smallest prediction (champ_min)
#   * otherwise                   -> fall back to the row-wise median
# Each extreme is tested against its own cutoff. Testing "> cutoff_lo" and
# "< cutoff_hi" instead would be true for almost every row with the cutoffs
# used here, collapsing the blend to champ_max and leaving the median fallback
# effectively unreachable.
model_preds = concat_sub.iloc[:, 1:ncol]
all_above_hi = (model_preds > cutoff_hi).all(axis=1)
all_below_lo = (model_preds < cutoff_lo).all(axis=1)
concat_sub['scalar_coupling_constant'] = np.where(
    all_above_hi,
    concat_sub['champ_max'],
    np.where(all_below_lo, concat_sub['champ_min'], concat_sub['champ_median']),
)
# Write only the id and the blended prediction, without the DataFrame index.
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_median.csv', index=False)

**LB ----** -

# MinMax + BestBase Stacking

In [14]:
# load the model with best base performance
# Submission chosen as the best single base model; unlike the blended files it
# is read with a default integer index, so `id` stays a regular column.
best_base_csv = 'data/subs/submission_type_important_features.csv'
sub_base = pd.read_csv(best_base_csv)

In [17]:
# Attach the best base model's prediction by `id` rather than by row position,
# so the blend stays correct even if sub_base is ordered differently from
# concat_sub. The existing `id` column is left untouched for the same reason.
base_by_id = sub_base.set_index('id')['scalar_coupling_constant']
concat_sub['champ_base'] = concat_sub['id'].map(base_by_id)
if concat_sub['champ_base'].isna().any():
    # A gap here would silently write NaN into the submission file.
    raise ValueError("sub_base has no usable prediction for some ids in concat_sub")

# MinMax + best-base blend:
#   * every model above cutoff_hi -> keep the largest prediction (champ_max)
#   * every model below cutoff_lo -> keep the smallest prediction (champ_min)
#   * otherwise                   -> fall back to the best base model
# Each extreme is tested against its own cutoff. Testing "> cutoff_lo" and
# "< cutoff_hi" instead would be true for almost every row with the cutoffs
# used here, collapsing the blend to champ_max and leaving the base-model
# fallback effectively unreachable.
model_preds = concat_sub.iloc[:, 1:ncol]
all_above_hi = (model_preds > cutoff_hi).all(axis=1)
all_below_lo = (model_preds < cutoff_lo).all(axis=1)
concat_sub['scalar_coupling_constant'] = np.where(
    all_above_hi,
    concat_sub['champ_max'],
    np.where(all_below_lo, concat_sub['champ_min'], concat_sub['champ_base']),
)
# Write only the id and the blended prediction, without the DataFrame index.
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_bestbase.csv', index=False)

> **LB----** -