# Weighted Average example

Blending is the best way to explore diversity from models.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pathlib import Path
print(os.listdir('data/subs'))

['blends', 'submission-l2-R.csv', 'submission_cnn.csv', 'submission_per_type.csv', 'submission_per_type_inflated.csv', 'submission_type_important_features.csv', 'submission_type_l1_different_lambda.csv', 'submission_type_l1_without_fc.csv', 'submission_type_l2.csv', 'submission_type_testing.csv']


In [2]:
sub_path = Path('data/subs/')

## first try

In [3]:
sub1 = pd.read_csv(sub_path/'submission_type_important_features.csv')
sub2 = pd.read_csv(sub_path/'submission-l2-R.csv')
sub3 = pd.read_csv(sub_path/'submission_cnn.csv')
print(sub1['scalar_coupling_constant'].describe())
print(sub2['scalar_coupling_constant'].describe())
print(sub3['scalar_coupling_constant'].describe())

count    2.505542e+06
mean     1.588510e+01
std      3.486513e+01
min     -3.155304e+01
25%     -2.290635e-01
50%      2.276754e+00
75%      7.343286e+00
max      2.032151e+02
Name: scalar_coupling_constant, dtype: float64
count    2.505542e+06
mean     1.588675e+01
std      3.485679e+01
min     -3.224456e+01
25%     -2.086306e-01
50%      2.305326e+00
75%      7.255698e+00
max      2.032142e+02
Name: scalar_coupling_constant, dtype: float64
count    2.505542e+06
mean     1.587349e+01
std      3.482888e+01
min     -3.230405e+01
25%     -2.472747e-01
50%      2.277905e+00
75%      7.360947e+00
max      2.032168e+02
Name: scalar_coupling_constant, dtype: float64


In [24]:
#Mean absolute difference
print((sub1['scalar_coupling_constant'] - sub2['scalar_coupling_constant']).abs().mean())
print((sub2['scalar_coupling_constant'] - sub3['scalar_coupling_constant']).abs().mean())
print((sub1['scalar_coupling_constant'] - sub3['scalar_coupling_constant']).abs().mean())

0.3597435773909075
0.5313792224301376
0.47055922235984465


In [28]:
#only one better so far: 0.7*sub1 + 0.2*sub2 + 0.1*sub3

In [27]:
sub1['scalar_coupling_constant'] = 0.7*sub1['scalar_coupling_constant'] + 0.2*sub2['scalar_coupling_constant'] + \
                                   0.1*sub3['scalar_coupling_constant']
sub1.to_csv(sub_path/'weighted-avg-blend-2.csv', index=False )
sub1['scalar_coupling_constant'].describe()

count    2.505542e+06
mean     1.588553e+01
std      3.485639e+01
min     -3.201029e+01
25%     -2.069837e-01
50%      2.292249e+00
75%      7.282941e+00
max      2.031252e+02
Name: scalar_coupling_constant, dtype: float64

## first try, with lightgbm and external submissions

In [3]:
sub1 = pd.read_csv(sub_path/'submission_type_important_features.csv')
sub2 = pd.read_csv(sub_path/'submission-l2-R.csv')
sub3 = pd.read_csv(sub_path/'submission_cnn.csv')
print(sub1['scalar_coupling_constant'].describe())
print(sub2['scalar_coupling_constant'].describe())
print(sub3['scalar_coupling_constant'].describe())

count    2.505542e+06
mean     1.588510e+01
std      3.486513e+01
min     -3.155304e+01
25%     -2.290635e-01
50%      2.276754e+00
75%      7.343286e+00
max      2.032151e+02
Name: scalar_coupling_constant, dtype: float64
count    2.505542e+06
mean     1.588675e+01
std      3.485679e+01
min     -3.224456e+01
25%     -2.086306e-01
50%      2.305326e+00
75%      7.255698e+00
max      2.032142e+02
Name: scalar_coupling_constant, dtype: float64
count    2.505542e+06
mean     1.587349e+01
std      3.482888e+01
min     -3.230405e+01
25%     -2.472747e-01
50%      2.277905e+00
75%      7.360947e+00
max      2.032168e+02
Name: scalar_coupling_constant, dtype: float64


In [24]:
#Mean absolute difference
print((sub1['scalar_coupling_constant'] - sub2['scalar_coupling_constant']).abs().mean())
print((sub2['scalar_coupling_constant'] - sub3['scalar_coupling_constant']).abs().mean())
print((sub1['scalar_coupling_constant'] - sub3['scalar_coupling_constant']).abs().mean())

0.3597435773909075
0.5313792224301376
0.47055922235984465


In [28]:
#only one better so far: 0.7*sub1 + 0.2*sub2 + 0.1*sub3

In [27]:
sub1['scalar_coupling_constant'] = 0.7*sub1['scalar_coupling_constant'] + 0.2*sub2['scalar_coupling_constant'] + \
                                   0.1*sub3['scalar_coupling_constant']
sub1.to_csv(sub_path/'weighted-avg-blend-2.csv', index=False )
sub1['scalar_coupling_constant'].describe()

count    2.505542e+06
mean     1.588553e+01
std      3.485639e+01
min     -3.201029e+01
25%     -2.069837e-01
50%      2.292249e+00
75%      7.282941e+00
max      2.031252e+02
Name: scalar_coupling_constant, dtype: float64

## second try, with mpnn and lightgbm

In [133]:
mpnn_path = Path('graph_cnn/data/submission/zzz/submit')

In [134]:
sub1 = pd.read_csv(mpnn_path/'submit-00197500_model-larger.csv')
sub2 = pd.read_csv(sub_path/'submission_type_important_features.csv')
#sub3 = pd.read_csv(sub_path/'weighted-avg-blend-2.csv')
print(sub1['scalar_coupling_constant'].describe())
print(sub2['scalar_coupling_constant'].describe())
#print(sub3['scalar_coupling_constant'].describe())

count    2.505542e+06
mean     1.588668e+01
std      3.485453e+01
min     -2.778079e+01
25%     -2.372454e-01
50%      2.301130e+00
75%      7.332769e+00
max      2.044768e+02
Name: scalar_coupling_constant, dtype: float64
count    2.505542e+06
mean     1.588510e+01
std      3.486513e+01
min     -3.155304e+01
25%     -2.290635e-01
50%      2.276754e+00
75%      7.343286e+00
max      2.032151e+02
Name: scalar_coupling_constant, dtype: float64


In [135]:
test = pd.read_csv('data/test.csv')

In [136]:
#                   '1JHC',  '2JHC',  '3JHC',  '1JHN',  '2JHH',  '3JHH',  '2JHN',  '3JHN'

# mpnn          |  -0.702,  -1.431,  -1.122,  -0.648,  -1.683,  -1.761,  -1.625,  -1.467   |
# lightgbm      |  -0.3981, -1.2033, -0.9857, -0.9291, -1.7055, -1.5529, -1.6048, -1.8029  |

In [137]:
# better type in mpnn: '1JHC', '2JHC', '3JHC', '3JHH', '2JHN'

In [138]:
#['1JHC', '2JHC', '3JHC', '1JHN', '2JHH', '3JHH', '2JHN', '3JHN']

#[0, 3, 1, 4, 2, 6, 5, 7] original types encoded
#[0, 2, 5, 1, 3, 6, 4, 7] correctly sorted encoded

In [139]:
type_dict = {'1JHC':0, '2JHC':2, '3JHC':5, '1JHN':1, '2JHH':3, '3JHH':6, '2JHN':4, '3JHN':7}

In [140]:
mpnn_types = ['1JHC', '2JHC', '3JHH']

In [141]:
sub_final = sub2.copy()

In [142]:
for i in test.index:
    if i % 100000 == 0: print(i) 
    test_type = test.at[i,'type']
    if test_type in mpnn_types:
        sub_final.at[i,'scalar_coupling_constant'] = sub1.at[i, 'scalar_coupling_constant']

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000


In [143]:
sub_final['scalar_coupling_constant'].describe()

count    2.505542e+06
mean     1.587802e+01
std      3.485644e+01
min     -2.875114e+01
25%     -2.346990e-01
50%      2.279296e+00
75%      7.345073e+00
max      2.044768e+02
Name: scalar_coupling_constant, dtype: float64

In [144]:
sub_final.to_csv(sub_path/'blends/different-models-for-types3.csv', index=False)

In [42]:
#sub1['scalar_coupling_constant'] = 0.7*sub1['scalar_coupling_constant'] + 0.2*sub2['scalar_coupling_constant']
#sub1.to_csv(sub_path/'weighted-avg-blend-2.csv', index=False)
#sub1['scalar_coupling_constant'].describe()