In [61]:
import os
import numpy as np 
import pandas as pd 
from subprocess import check_output
import matplotlib.pyplot as plt

# Data Load

In [62]:
# Directory containing the submission CSVs that later cells read by file name.
sub_path = "data/subs/team_subs"
all_files = os.listdir(sub_path)
# Bare expression so the notebook displays the directory listing.
all_files

['4_best_median_stack_updated.csv',
 '5_best_median_stack_updated.csv',
 'bestpublic.csv',
 'best_model_per_type.csv',
 'best_model_per_type_lgbm_mpnn_updated.csv',
 'best_single_models_with_2080_stack.csv',
 'lgbm_1956.csv',
 'lgbm_best.csv',
 'lgbm_final.csv',
 'lgbm_pseudo_no_2JHC_3JHC.csv',
 'mpnn_5_checkpoints_1933.csv',
 'mpnn_multiple_checkpoints_pseudo.csv',
 'mpnn_multiple_checkpoints_updated.csv',
 'NN_final.csv',
 'NN_pseudo_no_2JHC_3JHC.csv',
 'semgcn_1c_w12_24k.csv',
 'semgcn_1c_w12_24k_separated.csv',
 'singlemodels_scgnmpnn_2302.csv',
 'stack_median_best_4_submissions.csv',
 'stack_median_lgbm_mpnn_semgcn.csv',
 'stack_median_lgbm_mpnn_semgcn_updated_2080.csv',
 'weighted_average_lgbm_semgcn_mpnn_2001.csv']

In [63]:
# File names (relative to sub_path) of the submissions to blend.
# Their order determines the champ0, champ1, ... column numbering when they are concatenated.
selected_files = [
    'best_single_models_with_2080_stack.csv',
    'lgbm_final.csv',
    'NN_final.csv',
    'mpnn_multiple_checkpoints_pseudo.csv',
]

In [64]:
# Read every selected submission, using its first CSV column as the index,
# and place the prediction columns side by side (pd.concat aligns rows on that index).
sub_file_paths = [os.path.join(sub_path, fname) for fname in selected_files]
outs = [pd.read_csv(csv_path, index_col=0) for csv_path in sub_file_paths]
concat_sub = pd.concat(outs, axis=1)

# Give the columns generic names in selected_files order: champ0, champ1, ...
cols = ["champ" + str(pos) for pos in range(len(concat_sub.columns))]
concat_sub.columns = cols

# Turn the index back into an ordinary leading column.
concat_sub = concat_sub.reset_index()

# Column count at this point (leading index column + one per submission).
# Later cells slice columns 1:ncol to select only the champN predictions,
# so ncol must be captured before any derived columns are appended.
ncol = concat_sub.shape[1]
concat_sub.head()

Unnamed: 0,id,champ0,champ1,champ2,champ3
0,4658147,9.850896,13.068462,-3.612952,9.660213
1,4658148,174.456394,193.281711,171.926682,178.438538
2,4658149,3.025961,4.732296,-9.594178,2.044971
3,4658150,172.141953,193.335145,170.549744,175.999878
4,4658151,10.297093,13.102951,-3.612982,9.387045


In [65]:
# Pairwise correlation between the prediction columns only
# (columns 1 .. ncol-1, i.e. everything after the leading id column).
concat_sub.iloc[:,1:ncol].corr()

Unnamed: 0,champ0,champ1,champ2,champ3
champ0,1.0,0.999997,0.99991,0.999985
champ1,0.999997,1.0,0.999908,0.999981
champ2,0.99991,0.999908,1.0,0.999893
champ3,0.999985,0.999981,0.999893,1.0


In [66]:
# Row-wise summary statistics across the member predictions, used by the stacking rules below.
# The slice 1:ncol is taken once; ncol was fixed before these columns exist, so the
# appended champ_* columns never feed back into the statistics.
member_preds = concat_sub.iloc[:, 1:ncol]
for stat_name in ('max', 'min', 'mean', 'median'):
    reducer = getattr(member_preds, stat_name)
    concat_sub['champ_' + stat_name] = reducer(axis=1)

In [67]:
# Per-submission summary statistics (count, mean, std, quantiles, min/max) of the
# prediction columns only; the derived champ_* columns sit beyond ncol and are excluded.
concat_sub.iloc[:, 1:ncol].describe()

Unnamed: 0,champ0,champ1,champ2,champ3
count,2505542.0,2505542.0,2505542.0,2505542.0
mean,15.88334,15.88621,15.88313,15.87908
std,34.86591,34.87205,34.8736,34.85419
min,-33.88117,-33.9032,-34.22949,-37.8319
25%,-0.2410323,-0.2413528,-0.2477051,-0.2420567
50%,2.275754,2.278306,2.270318,2.280677
75%,7.352585,7.354508,7.345815,7.351396
max,204.61,203.5005,700.2148,204.4298


In [68]:
# Thresholds tested by the PushOut / MinMax stacking rules in the following cells:
# a row takes the first branch when every prediction is > cutoff_lo, and the second
# when every prediction is < cutoff_hi.
# NOTE(review): per the describe() output above, one submission's minimum (-37.83) and
# one submission's maximum (700.21) fall outside these bounds — confirm the values are intended.
cutoff_lo, cutoff_hi = -35, 207

# Median Stacking

In [70]:
# Median stacking: the submitted value is the row-wise median of the member predictions.
concat_sub['scalar_coupling_constant'] = concat_sub['champ_median']

# Persist only the id and target columns, without the DataFrame index.
median_blend = concat_sub.loc[:, ['id', 'scalar_coupling_constant']]
median_blend.to_csv('data/subs/blends/best_subs3.csv', index=False)

**LB -2.170**

# PushOut + Median Stacking 

The PushOut strategy is rather aggressive given what it does...

In [11]:
# PushOut rule, evaluated per row over the member predictions (columns 1:ncol):
#   every prediction > cutoff_lo  -> constant 1
#   else every prediction < cutoff_hi -> constant 0
#   otherwise                     -> row-wise median
# NOTE(review): the first test wins, so any row whose predictions all exceed cutoff_lo
# is written as the literal value 1. With cutoff_lo = -35 as set in this notebook that
# looks like nearly every row — confirm this is intended before submitting this file.
member_preds = concat_sub.iloc[:, 1:ncol]
all_above_lo = np.all(member_preds > cutoff_lo, axis=1)
all_below_hi = np.all(member_preds < cutoff_hi, axis=1)

fallback = np.where(all_below_hi, 0, concat_sub['champ_median'])
concat_sub['scalar_coupling_constant'] = np.where(all_above_lo, 1, fallback)

pushout_blend = concat_sub.loc[:, ['id', 'scalar_coupling_constant']]
pushout_blend.to_csv('data/subs/blends/stack_pushout_median.csv', index=False)

> **LB -----**

# MinMax + Mean Stacking

MinMax seems gentler, and it outperforms the previous strategy according to its performance score.

In [12]:
# MinMax + mean rule, evaluated per row over the member predictions (columns 1:ncol):
#   every prediction > cutoff_lo  -> row-wise maximum
#   else every prediction < cutoff_hi -> row-wise minimum
#   otherwise                     -> row-wise mean
# NOTE(review): the first test wins, so champ_min / champ_mean are only reached by rows
# where at least one prediction is <= cutoff_lo — confirm that ordering is intended.
member_preds = concat_sub.iloc[:, 1:ncol]
all_above_lo = np.all(member_preds > cutoff_lo, axis=1)
all_below_hi = np.all(member_preds < cutoff_hi, axis=1)

fallback = np.where(all_below_hi, concat_sub['champ_min'], concat_sub['champ_mean'])
concat_sub['scalar_coupling_constant'] = np.where(all_above_lo, concat_sub['champ_max'], fallback)

minmax_mean_blend = concat_sub.loc[:, ['id', 'scalar_coupling_constant']]
minmax_mean_blend.to_csv('data/subs/blends/stack_minmax_mean.csv', index=False)

> **LB ----**



# MinMax + Median Stacking 

In [14]:
# MinMax + median rule, evaluated per row over the member predictions (columns 1:ncol):
#   every prediction > cutoff_lo  -> row-wise maximum
#   else every prediction < cutoff_hi -> row-wise minimum
#   otherwise                     -> row-wise median
# NOTE(review): the first test wins, so champ_min / champ_median are only reached by rows
# where at least one prediction is <= cutoff_lo — confirm that ordering is intended.
member_preds = concat_sub.iloc[:, 1:ncol]
all_above_lo = np.all(member_preds > cutoff_lo, axis=1)
all_below_hi = np.all(member_preds < cutoff_hi, axis=1)

fallback = np.where(all_below_hi, concat_sub['champ_min'], concat_sub['champ_median'])
concat_sub['scalar_coupling_constant'] = np.where(all_above_lo, concat_sub['champ_max'], fallback)

minmax_median_blend = concat_sub.loc[:, ['id', 'scalar_coupling_constant']]
minmax_median_blend.to_csv('data/subs/blends/stack_minmax_median.csv', index=False)

**LB ----** -

# MinMax + BestBase Stacking

In [14]:
# load the model with best base performance
sub_base = pd.read_csv('data/subs/submission_type_important_features.csv')

In [17]:
# MinMax + BestBase rule, evaluated per row over the member predictions (columns 1:ncol):
#   every prediction > cutoff_lo  -> row-wise maximum
#   else every prediction < cutoff_hi -> row-wise minimum
#   otherwise                     -> prediction of the best base model
#
# The base predictions are attached by matching on 'id', not by row position. Assigning
# sub_base's columns directly would pair rows by their integer index, which silently
# misaligns champ_base (and an overwritten id) with champ_max / champ_min whenever
# sub_base is stored in a different row order than concat_sub. concat_sub['id'] is
# therefore left untouched.
base_by_id = sub_base.set_index('id')['scalar_coupling_constant']
concat_sub['champ_base'] = concat_sub['id'].map(base_by_id)

# Fail loudly instead of writing NaN targets if the base submission lacks some ids.
if concat_sub['champ_base'].isna().any():
    raise ValueError("sub_base has no prediction for some ids present in concat_sub")

# NOTE(review): the first test wins, so champ_min / champ_base are only reached by rows
# where at least one prediction is <= cutoff_lo — confirm that ordering is intended.
member_preds = concat_sub.iloc[:, 1:ncol]
all_above_lo = np.all(member_preds > cutoff_lo, axis=1)
all_below_hi = np.all(member_preds < cutoff_hi, axis=1)

fallback = np.where(all_below_hi, concat_sub['champ_min'], concat_sub['champ_base'])
concat_sub['scalar_coupling_constant'] = np.where(all_above_lo, concat_sub['champ_max'], fallback)

concat_sub[['id', 'scalar_coupling_constant']].to_csv('data/subs/blends/stack_minmax_bestbase.csv', index=False)

> **LB----** -

### Weighted Average

In [62]:
# Inputs for the weighted average. This rebinds selected_files used by the stacking cells above.
# Position matters: the next cell reads index 0 as sub_lgbm, 1 as sub_semgcn and 2 as sub_mpnn.
selected_files = [
    'sub_qm9_babel_ascf_11fold_extreme_AND_stack_median_AVG.csv',
    'semgcn_1c_w12_24k.csv',
    'sagpooling_larger_mpnn.csv',
]

In [63]:
# Read the three weighted-average inputs in selected_files order, each indexed by its
# first CSV column so the later arithmetic aligns rows on that index.
sub_lgbm, sub_semgcn, sub_mpnn = [
    pd.read_csv(os.path.join(sub_path, selected_files[pos]), index_col=0)
    for pos in range(3)
]

In [69]:
# Start the blend from a copy of the semgcn submission so its index and column layout are
# reused; the copy keeps sub_semgcn itself unchanged when the target column is overwritten.
cc = sub_semgcn.copy()

In [76]:
# Weighted average of the three submissions; the weights sum to 1.
# Series arithmetic aligns on the index, so rows are matched by the index of each frame.
w_lgbm, w_semgcn, w_mpnn = 0.2, 0.5, 0.3
target_col = 'scalar_coupling_constant'

weighted_blend = (w_lgbm * sub_lgbm[target_col]
                  + w_semgcn * sub_semgcn[target_col]
                  + w_mpnn * sub_mpnn[target_col])
cc[target_col] = weighted_blend

In [77]:
# Sanity check: difference between the blend's mean prediction and each input's mean,
# printed in the order lgbm, semgcn, mpnn.
blend_mean = cc['scalar_coupling_constant'].mean()
for member_sub in (sub_lgbm, sub_semgcn, sub_mpnn):
    print(blend_mean - member_sub['scalar_coupling_constant'].mean())

-0.0014508007425977354
-0.00979137145056086
0.0172861529099535


In [79]:
# Save the blended submission. The index is written too (no index=False), and the file
# goes to the current working directory rather than data/subs/blends like the other blends.
cc.to_csv('weighted_average_lgbm_semgcn_mpnn.csv')

In [80]:
# Preview the first rows of the blended submission.
cc.head()

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
4658147,12.421384
4658148,170.516021
4658149,3.776587
4658150,169.577651
4658151,12.334907
