In [1]:
# Enable IPython's autoreload extension in mode 2 for this kernel session.
%load_ext autoreload
%autoreload 2

In [2]:
import os.path as osp
from tqdm.autonotebook import tqdm
from glob import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GroupKFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import HuberRegressor



# Load and preprocess data

## Load data

In [3]:
# Directory that contains this notebook/script.
# Only `os.path` is imported (as `osp`), so the bare `os` name must not be used.
# `__file__` does not exist inside a Jupyter kernel; fall back to the current
# working directory in that case.
try:
    script_dir = osp.abspath(osp.dirname(__file__))
except NameError:
    script_dir = osp.abspath(osp.curdir)

# Sibling `data` directory. Built with osp.join because plain concatenation
# produced '<script_dir>../data/' (missing separator). The trailing '' keeps
# the trailing separator the original string had.
DATA_DIR = osp.join(script_dir, '..', 'data', '')

In [4]:
# Read the train and test tables from DATA_DIR.
train_csv_path = f'{DATA_DIR}/train.csv'
test_csv_path = f'{DATA_DIR}/test.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

## Split train / validation

In [5]:
# Unique molecule names in sorted order; the split below is done per molecule.
unique_molecule_names = train['molecule_name'].drop_duplicates()
molecules = unique_molecule_names.sort_values()

In [6]:
# Split positions into `molecules` (not rows of `train`): 5000 molecules are
# held out for validation, with a fixed seed so the split is repeatable.
molecule_positions = np.arange(len(molecules))
train_ind, valid_ind = train_test_split(
    molecule_positions,
    test_size=5000,
    random_state=1234,
)

## Create train valid subset

In [7]:
# Select the rows of `train` whose molecule falls on each side of the split.
train_molecule_names = molecules.iloc[train_ind]
valid_molecule_names = molecules.iloc[valid_ind]
train_data = train.loc[train['molecule_name'].isin(train_molecule_names)]
val_data = train.loc[train['molecule_name'].isin(valid_molecule_names)]

In [8]:
# Preview the first rows of the validation subset.
val_data.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
1582,1582,dsgdb9nsd_000098,5,1,2JHC,-1.05744
1583,1583,dsgdb9nsd_000098,5,2,3JHC,0.474094
1584,1584,dsgdb9nsd_000098,5,4,3JHC,0.473745
1585,1585,dsgdb9nsd_000098,5,6,3JHH,14.3696
1586,1586,dsgdb9nsd_000098,6,1,1JHC,92.7858


# Load predictions

In [9]:
# Display the files that match the guillaume/train*.csv pattern.
glob(script_dir + '/guillaume/train*.csv')

['guillaume/train_9ZB-015-link-edges-lowerlr.256000.csv',
 'guillaume/train_9ZG4A-000-vanilla-deep-radam-droplr-2.518044.csv',
 'guillaume/train_9ZF3-004-ablation-study-remove-global-state-5-droplr.275100.csv',
 'guillaume/train_9ZF2-005-ablation-study-high-batch-size-augment-really-this-time-4-droplr.278759.csv',
 'guillaume/train_9ZB2-003-link-edges-4xlowerlr-2-droplr.634290.csv',
 'guillaume/train_9ZG4A-001-vanilla-deep-radam-droplr.515360.csv',
 'guillaume/train_9ZF5-004-ablation-study-no-pairs-embeddings-and-one-preprocessing-edge-pairs-5-droplr.303008.csv',
 'guillaume/train_9ZF-002-ablation-study-high-batch-size-3-droplr.593979.csv']

In [10]:
def create_list_prediction(folder_path, text_find, text_replace):
    """Pair each file matching a glob pattern with a renamed sibling path.

    Parameters
    ----------
    folder_path : str
        Glob pattern selecting the validation-prediction files.
    text_find : str
        Substring of the *file name* to replace.
    text_replace : str
        Replacement text that turns a validation file name into the matching
        test/submission file name.

    Returns
    -------
    (list[str], list[str])
        ``sub_val`` holds the matched paths in sorted order. ``sub_test``
        holds, position by position, the same paths with ``text_find``
        replaced by ``text_replace`` in the file name. Both lists are empty
        when nothing matches.
    """
    # glob() returns files in arbitrary order; sorting makes the column order
    # of the stacked prediction matrices reproducible across calls and machines.
    sub_val = sorted(glob(folder_path))

    sub_test = []
    for sub_val_path in sub_val:
        # Replace only inside the file name, so a parent directory that
        # happens to contain `text_find` is left untouched.
        file_name = osp.split(sub_val_path)[1]
        directory_prefix = sub_val_path[:len(sub_val_path) - len(file_name)]
        sub_test.append(directory_prefix + file_name.replace(text_find, text_replace))

    return sub_val, sub_test

In [11]:
def load_pred_guillaume(path):
    """Read a prediction CSV and keep only its validation rows.

    The file must provide the columns ``id``, ``prediction`` and ``dataset``.
    Rows whose ``dataset`` equals ``'valid'`` are returned as a one-column
    frame (``prediction``) indexed by ``id``.
    """
    frame = pd.read_csv(path)
    is_valid_row = frame['dataset'] == 'valid'
    valid_predictions = frame.loc[is_valid_row, ['id', 'prediction']]
    return valid_predictions.set_index('id')

In [12]:
def load_pred_lam(path):
    """Read a prediction CSV, using its first column as the index."""
    predictions = pd.read_csv(path, index_col=0)
    return predictions

In [13]:
def load_pred_thanhtu(path):
    """Read a prediction CSV, using its first column as the index."""
    predictions = pd.read_csv(path, index_col=0)
    return predictions

In [14]:
# Gather every base model's validation predictions and place them side by
# side (axis=1), so each file contributes its own column(s).
val_frames = []
for val_path in create_list_prediction(script_dir + '/guillaume/train*.csv', 'train', 'submission')[0]:
    val_frames.append(load_pred_guillaume(val_path))
for val_path in create_list_prediction(script_dir + '/lam_01_v1/pred*.csv', 'pred', 'sub')[0]:
    val_frames.append(load_pred_lam(val_path))
for val_path in create_list_prediction(script_dir + '/thanhtu/valid*.csv', 'valid', 'sub')[0]:
    val_frames.append(load_pred_thanhtu(val_path))
preds_val = pd.concat(val_frames, axis=1)

In [15]:
# Gather the test-set predictions that correspond to the validation files
# (second element returned by create_list_prediction), concatenated column-wise.
# NOTE(review): the guillaume submissions are read with load_pred_lam, not
# load_pred_guillaume - presumably because submission files have no `dataset`
# column; confirm against the files.
test_frames = []
for test_path in create_list_prediction(script_dir + '/guillaume/train*.csv', 'train', 'submission')[1]:
    test_frames.append(load_pred_lam(test_path))
for test_path in create_list_prediction(script_dir + '/lam_01_v1/pred*.csv', 'pred_val', 'sub')[1]:
    test_frames.append(load_pred_lam(test_path))
for test_path in create_list_prediction(script_dir + '/thanhtu/valid*.csv', 'valid', 'sub')[1]:
    test_frames.append(load_pred_thanhtu(test_path))
preds_test = pd.concat(test_frames, axis=1)

  mask |= (ar1 == a)


In [16]:
# Sanity check: both matrices should have the same number of columns.
preds_val.shape, preds_test.shape

((275505, 14), (2505542, 14))

# Stacking LightGBM

In [19]:
# Out-of-fold stacking of the base-model predictions with LightGBM.
# The `*_xgb` names are kept because later cells read `val_xgb` and
# `test_pred_xgb`, even though the regressor used here is LGBMRegressor.
#
# Features for each row: the base predictions centred on their row mean,
# followed by the raw base predictions. The target is the residual
# `y - row_mean`; the row mean is added back after predicting.
preds = preds_val.loc[val_data['id']]
val_xgb = val_data.copy()
# Start the accumulator as float: fold predictions are added with `+=`, and
# an integer column would have to be upcast on the first assignment.
val_xgb['scalar_coupling_constant'] = 0.0
val_xgb = val_xgb.reset_index()
X = preds.values
y = val_data.scalar_coupling_constant.values
test_pred_xgb = test.copy()
test_pred_xgb['scalar_coupling_constant'] = 0.0
test_pred_xgb = test_pred_xgb.reset_index()
Xtest = preds_test.loc[test['id']].values
# Integer code per molecule, so GroupKFold keeps a molecule inside one fold.
groups = val_xgb.molecule_name.astype('category').cat.codes.values

# The test features do not depend on the fold, so build them once instead of
# once per fold.
Xtest_mean = Xtest.mean(axis=1)
Xtest_features = np.concatenate([Xtest - Xtest_mean.reshape(-1, 1), Xtest], axis=1)

cv = GroupKFold(20)
for train_index, val_index in cv.split(X, y, groups):
    clf = LGBMRegressor(
        num_leaves=60,
        max_depth=-1,
        learning_rate=0.01,
        n_estimators=20000,
        subsample_for_bin=200000,
        objective='regression_l1',
        reg_alpha=0.0,
        random_state=None,
        n_jobs=12,
        silent=False,
        )
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    train_mean = X_train.mean(axis=1)
    val_mean = X_val.mean(axis=1)
    train_features = np.concatenate([X_train - train_mean.reshape(-1, 1), X_train], axis=1)
    val_features = np.concatenate([X_val - val_mean.reshape(-1, 1), X_val], axis=1)

    # NOTE(review): the held-out fold is used both for early stopping and for
    # the score printed below.
    clf.fit(
            train_features,
            y_train - train_mean,
            eval_set=[(val_features, y_val - val_mean)],
            eval_metric=['mae'],
            early_stopping_rounds=10,
            verbose=100
    )
    res_lgbm = clf.predict(val_features) + val_mean
    res_test = clf.predict(Xtest_features) + Xtest_mean

    # Log of the fold's mean absolute error.
    print(np.log(np.abs(res_lgbm - y_val).mean()))
    val_xgb.loc[val_index, 'scalar_coupling_constant'] += res_lgbm
    test_pred_xgb.loc[:, 'scalar_coupling_constant'] += res_test

# Release the large fold-invariant test matrix; it is rebuilt if the cell reruns.
del Xtest_features

# Average the accumulated test predictions over the folds; the divisor is
# taken from `cv` so it cannot drift from the number of splits.
test_pred_xgb['scalar_coupling_constant'] /= cv.get_n_splits()

Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l1: 0.0535833	valid_0's l1: 0.0535833
[200]	valid_0's l1: 0.0532768	valid_0's l1: 0.0532768
[300]	valid_0's l1: 0.053114	valid_0's l1: 0.053114
[400]	valid_0's l1: 0.0530188	valid_0's l1: 0.0530188
[500]	valid_0's l1: 0.0529418	valid_0's l1: 0.0529418
Early stopping, best iteration is:
[555]	valid_0's l1: 0.0529062	valid_0's l1: 0.0529062
-2.9392349486366625
Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l1: 0.0534275	valid_0's l1: 0.0534275
[200]	valid_0's l1: 0.0531135	valid_0's l1: 0.0531135
[300]	valid_0's l1: 0.0529056	valid_0's l1: 0.0529056
[400]	valid_0's l1: 0.0527716	valid_0's l1: 0.0527716
[500]	valid_0's l1: 0.0526998	valid_0's l1: 0.0526998
Early stopping, best iteration is:
[540]	valid_0's l1: 0.0526765	valid_0's l1: 0.0526765
-2.943585012375487
Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l1: 0.0482548	valid_0's l1: 0.0482548
[200]

In [20]:
# Absolute error per row, then the mean over `type` groups of log(mean error).
xgb_abs_error = (val_xgb['scalar_coupling_constant'] - y).abs()
val_xgb['mae'] = xgb_abs_error
xgb_mae_per_type = val_xgb.groupby('type')['mae'].mean()
print(np.log(xgb_mae_per_type).mean())

-3.0863043130482577


In [21]:
# Log of the mean absolute error, broken down by `type`.
np.log(val_xgb.groupby('type')['mae'].mean())

type
1JHC   -2.145392
1JHN   -2.236141
2JHC   -3.107093
2JHH   -3.676761
2JHN   -3.373819
3JHC   -3.037339
3JHH   -3.560656
3JHN   -3.553233
Name: mae, dtype: float64

# Stacking with Huber loss

In [None]:
# Out-of-fold stacking with one HuberRegressor per `type` value and per fold.
#
# Features for each row: the base predictions centred on their row mean,
# followed by the raw base predictions. The target is the residual
# `y - row_mean`; the row mean is added back after predicting.
preds = preds_val.loc[val_data['id']]
val_huber = val_data.copy()
# Start the accumulator as float: fold predictions are added with `+=`, and
# an integer column would have to be upcast on the first assignment.
val_huber['scalar_coupling_constant'] = 0.0
val_huber = val_huber.reset_index()
X = preds.values
y = val_data.scalar_coupling_constant.values
test_pred_huber = test.copy()
test_pred_huber['scalar_coupling_constant'] = 0.0
test_pred_huber = test_pred_huber.reset_index()
Xtest = preds_test.loc[test['id']].values
# Integer code per molecule, so GroupKFold keeps a molecule inside one fold.
groups = val_huber.molecule_name.astype('category').cat.codes.values
# Integer code per `type`; not used by the loop below.
types = val_huber.type.astype('category').cat.codes.values

# One splitter is enough: GroupKFold keeps no state between split() calls.
cv = GroupKFold(20)
for bond_type in val_huber.type.unique():
    # After reset_index() the index labels equal row positions, so `sub.index`
    # can index the NumPy arrays directly.
    sub = val_huber.loc[val_huber.type == bond_type]
    sub_groups = groups[sub.index]
    sub_X_type = X[sub.index]
    sub_y_type = y[sub.index]

    sub_test = test_pred_huber.loc[test_pred_huber.type == bond_type]
    sub_Xtest = Xtest[sub_test.index]
    # The test features for this type are identical in every fold: build once.
    sub_Xtest_mean = sub_Xtest.mean(axis=1)
    sub_Xtest_features = np.concatenate(
        [sub_Xtest - sub_Xtest_mean.reshape(-1, 1), sub_Xtest], axis=1)

    for train_index, val_index in cv.split(sub_X_type, sub_y_type, sub_groups):
        clf = HuberRegressor(epsilon=1.01, max_iter=50000, alpha=1e-6, tol=1e-5)
        X_train, X_val = sub_X_type[train_index], sub_X_type[val_index]
        y_train, y_val = sub_y_type[train_index], sub_y_type[val_index]

        train_mean = X_train.mean(axis=1)
        val_mean = X_val.mean(axis=1)
        train_features = np.concatenate([X_train - train_mean.reshape(-1, 1), X_train], axis=1)
        val_features = np.concatenate([X_val - val_mean.reshape(-1, 1), X_val], axis=1)

        clf.fit(train_features, y_train - train_mean)
        res_huber = clf.predict(val_features) + val_mean
        res_test = clf.predict(sub_Xtest_features) + sub_Xtest_mean

        # Log of the fold's mean absolute error for this type.
        print(bond_type, np.log(np.abs(res_huber - y_val).mean()))
        val_huber.loc[sub.index[val_index], 'scalar_coupling_constant'] += res_huber
        test_pred_huber.loc[sub_test.index, 'scalar_coupling_constant'] += res_test

# Average the accumulated test predictions over the folds; the divisor is
# taken from `cv` so it cannot drift from the number of splits.
test_pred_huber['scalar_coupling_constant'] /= cv.get_n_splits()

2JHC -3.027295782992844
2JHC -3.1037589839104265
2JHC -3.3142853453794454
2JHC -3.1454979800188934
2JHC -2.8071585767481047
2JHC -3.0865643204827222
2JHC -3.1934427873805165
2JHC -3.0881499446694827
2JHC -3.0977316505428893


In [None]:
# Absolute error per row, then the mean over `type` groups of log(mean error).
huber_abs_error = (val_huber['scalar_coupling_constant'] - y).abs()
val_huber['mae'] = huber_abs_error
huber_mae_per_type = val_huber.groupby('type')['mae'].mean()
print(np.log(huber_mae_per_type).mean())

# Stacking final

In [None]:
# Preview the Huber out-of-fold predictions.
val_huber.head()

In [None]:
# Preview the first-stacker out-of-fold predictions for comparison.
val_xgb.head()

In [None]:
# Equal-weight blend of the two stackers' out-of-fold predictions, scored
# with the same mean-of-log-MAE-per-type figure as the individual stackers.
blend_weight = 0.5
val_final = val_huber.copy()
val_final['scalar_coupling_constant'] = (
    val_xgb['scalar_coupling_constant'] * blend_weight
    + val_huber['scalar_coupling_constant'] * blend_weight
)
final_abs_error = (val_final['scalar_coupling_constant'] - y).abs()
val_final['mae'] = final_abs_error
final_mae_per_type = val_final.groupby('type')['mae'].mean()
print(np.log(final_mae_per_type).mean())

# Submission

In [None]:
# Equal-weight blend of the two stackers' averaged test predictions.
sub_final = test_pred_xgb.copy()
xgb_half = test_pred_xgb['scalar_coupling_constant'].mul(0.5)
huber_half = test_pred_huber['scalar_coupling_constant'].mul(0.5)
sub_final['scalar_coupling_constant'] = xgb_half.add(huber_half)

In [None]:
# Remove the helper and descriptor columns before writing the submission.
columns_to_drop = ['index', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type']
sub_final = sub_final.drop(columns=columns_to_drop)

In [None]:
# Write the blended submission to the working directory, without the index.
sub_final.to_csv('sub_stacking.csv', index=False)

In [None]:
# Read the written file back and show its shape as a final check.
pd.read_csv('sub_stacking.csv').shape