In [1]:
import os
import numpy as np 
import pandas as pd 
from subprocess import check_output
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

# Data Load

In [2]:
# Directory holding the candidate submission files (and the 'blends' output folder).
sub_path = "data/subs"
# os.listdir returns entries in arbitrary, platform-dependent order; sort them so
# the listing (and any positional slicing of it) is the same on every run.
all_files = sorted(os.listdir(sub_path))
all_files

['blends',
 'submission-l2-R.csv',
 'submission_cnn.csv',
 'submission_per_type.csv',
 'submission_per_type_inflated.csv',
 'submission_type_important_features.csv',
 'submission_type_l1_different_lambda.csv',
 'submission_type_l1_without_fc.csv',
 'submission_type_l2.csv',
 'submission_type_testing.csv',
 'submit-00325000_model-larger.csv',
 'sub_qm9_babel_ascf_11fold_extreme_AND_stack_median_AVG.csv']

In [3]:
# Submissions that go into the blend. An earlier variant took the first nine
# entries of all_files and dropped 'submission_type_testing.csv' and
# 'submission_type_l1_without_fc.csv'; the explicit two-file list below is used instead.
selected_files = [
    'submit-00325000_model-larger.csv',
    'sub_qm9_babel_ascf_11fold_extreme_AND_stack_median_AVG.csv',
]

In [4]:
# Load each selected submission with its first CSV column as the index, then
# join the frames column-wise so rows are aligned on that index.
outs = [
    pd.read_csv(os.path.join(sub_path, fname), index_col=0)
    for fname in selected_files
]
concat_sub = pd.concat(outs, axis=1)

# Rename the joined columns to champ0, champ1, ... in their current order.
cols = ["champ{}".format(i) for i in range(concat_sub.shape[1])]
concat_sub.columns = cols

# Turn the index back into an ordinary leading column.
concat_sub = concat_sub.reset_index()
# Total column count at this point (index column + champ columns); later cells
# use it as the upper bound when slicing out the champ columns.
ncol = concat_sub.shape[1]
concat_sub.head()

  mask |= (ar1 == a)


Unnamed: 0,id,champ0,champ1
0,4658147,10.744632,15.893167
1,4658148,138.842834,194.245589
2,4658149,2.34201,7.17319
3,4658150,138.842834,189.5537
4,4658151,10.744632,15.460785


In [5]:
# Pairwise correlation between the champ prediction columns.
concat_sub[cols].corr()

Unnamed: 0,champ0,champ1
champ0,1.0,0.999894
champ1,0.999894,1.0


In [6]:
# Row-wise summary statistics over the champ prediction columns; the blending
# cells further down pick their output from these.
champ_preds = concat_sub.iloc[:, 1:ncol]
concat_sub['champ_max'] = champ_preds.max(axis=1)
concat_sub['champ_min'] = champ_preds.min(axis=1)
concat_sub['champ_mean'] = champ_preds.mean(axis=1)
concat_sub['champ_median'] = champ_preds.median(axis=1)

In [7]:
# Distribution summary of the champ prediction columns.
concat_sub[cols].describe()

Unnamed: 0,champ0,champ1
count,2505542.0,2505542.0
mean,15.8689,15.88382
std,34.82781,34.86016
min,-39.48567,-32.08316
25%,-0.231066,-0.2148092
50%,2.290012,2.277482
75%,7.322864,7.316506
max,203.6644,203.2502


In [13]:
# Thresholds compared against every champ prediction in the PushOut / MinMax blends.
cutoff_lo, cutoff_hi = -35, 207

# Mean Stacking

In [15]:
# Mean blend: the row-wise mean of the champ predictions becomes the output value.
concat_sub['scalar_coupling_constant'] = concat_sub['champ_mean']
concat_sub.to_csv(
    'data/subs/blends/stack_mean.csv',
    columns=['id', 'scalar_coupling_constant'],
    index=False,
)

**LB----**

# Median Stacking

In [9]:
# Median blend: the row-wise median of the champ predictions becomes the output value.
concat_sub['scalar_coupling_constant'] = concat_sub['champ_median']
concat_sub.to_csv(
    'data/subs/blends/stack_median_with_mpnn.csv',
    columns=['id', 'scalar_coupling_constant'],
    index=False,
)

**LB -1.609**

# PushOut + Median Stacking 

Pushout strategy is a bit aggressive given what it does...

In [11]:
# PushOut blend, evaluated per row in this priority order:
#   1. every champ prediction is above cutoff_lo  -> constant 1
#   2. every champ prediction is below cutoff_hi  -> constant 0
#   3. otherwise                                  -> row-wise median
# NOTE(review): rows matching the first two rules are written as the literal
# constants 1 and 0 — confirm that is intended for this target.
champ_preds = concat_sub.iloc[:, 1:ncol]
all_above_lo = (champ_preds > cutoff_lo).all(axis=1)
all_below_hi = (champ_preds < cutoff_hi).all(axis=1)

pushed_low_or_median = np.where(all_below_hi, 0, concat_sub['champ_median'])
concat_sub['scalar_coupling_constant'] = np.where(all_above_lo, 1, pushed_low_or_median)

concat_sub.to_csv(
    'data/subs/blends/stack_pushout_median.csv',
    columns=['id', 'scalar_coupling_constant'],
    index=False,
)

> **LB -----**

# MinMax + Mean Stacking

MinMax seems more gentle and it outperforms the previous one given its performance score.

In [12]:
# MinMax + mean blend, evaluated per row in this priority order:
#   1. every champ prediction is above cutoff_lo  -> row-wise max
#   2. every champ prediction is below cutoff_hi  -> row-wise min
#   3. otherwise                                  -> row-wise mean
champ_preds = concat_sub.iloc[:, 1:ncol]
all_above_lo = (champ_preds > cutoff_lo).all(axis=1)
all_below_hi = (champ_preds < cutoff_hi).all(axis=1)

min_or_mean = np.where(all_below_hi, concat_sub['champ_min'], concat_sub['champ_mean'])
concat_sub['scalar_coupling_constant'] = np.where(all_above_lo, concat_sub['champ_max'], min_or_mean)

concat_sub.to_csv(
    'data/subs/blends/stack_minmax_mean.csv',
    columns=['id', 'scalar_coupling_constant'],
    index=False,
)

> **LB ----**



# MinMax + Median Stacking 

In [14]:
# MinMax + median blend, evaluated per row in this priority order:
#   1. every champ prediction is above cutoff_lo  -> row-wise max
#   2. every champ prediction is below cutoff_hi  -> row-wise min
#   3. otherwise                                  -> row-wise median
champ_preds = concat_sub.iloc[:, 1:ncol]
all_above_lo = (champ_preds > cutoff_lo).all(axis=1)
all_below_hi = (champ_preds < cutoff_hi).all(axis=1)

min_or_median = np.where(all_below_hi, concat_sub['champ_min'], concat_sub['champ_median'])
concat_sub['scalar_coupling_constant'] = np.where(all_above_lo, concat_sub['champ_max'], min_or_median)

concat_sub.to_csv(
    'data/subs/blends/stack_minmax_median.csv',
    columns=['id', 'scalar_coupling_constant'],
    index=False,
)

**LB ----** -

# MinMax + BestBase Stacking

In [14]:
# Submission of the base model with the best individual performance; read with
# pandas' default integer index.
sub_base = pd.read_csv(
    'data/subs/submission_type_important_features.csv',
)

In [17]:
# Attach the best base model's prediction to each row by matching on id.
# Assigning sub_base's columns directly would pair rows purely by position and
# overwrite concat_sub['id'], silently mislabelling predictions whenever the two
# frames are not in the same row order. Looking the value up by id keeps
# concat_sub['id'] untouched and gives the same result when the orders do match.
base_by_id = sub_base.set_index('id')['scalar_coupling_constant']
concat_sub['champ_base'] = concat_sub['id'].map(base_by_id)
if concat_sub['champ_base'].isna().any():
    raise ValueError("sub_base has no usable prediction for some ids in concat_sub")

# MinMax + best-base blend, evaluated per row in this priority order:
#   1. every champ prediction is above cutoff_lo  -> row-wise max
#   2. every champ prediction is below cutoff_hi  -> row-wise min
#   3. otherwise                                  -> best base model's prediction
all_above_lo = np.all(concat_sub.iloc[:, 1:ncol] > cutoff_lo, axis=1)
all_below_hi = np.all(concat_sub.iloc[:, 1:ncol] < cutoff_hi, axis=1)
concat_sub['scalar_coupling_constant'] = np.where(
    all_above_lo,
    concat_sub['champ_max'],
    np.where(all_below_hi, concat_sub['champ_min'], concat_sub['champ_base']),
)
concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_bestbase.csv', index=False)

> **LB----** -