In [1]:

import warnings
warnings.filterwarnings('ignore')

#the basics
import pandas as pd, numpy as np
import math, json, gc, random, os, sys
from matplotlib import pyplot as plt
from tqdm import tqdm

#tensorflow deep learning basics
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L

#for model evaluation
from sklearn.model_selection import train_test_split, KFold

In [2]:
#get comp data
train = pd.read_json('/kaggle/input/stanford-covid-vaccine/train.json', lines=True)
test = pd.read_json('/kaggle/input/stanford-covid-vaccine/test.json', lines=True)
sample_sub = pd.read_csv('/kaggle/input/stanford-covid-vaccine/sample_submission.csv')

In [3]:
#target columns
target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

In [4]:
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

In [5]:
def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    return np.transpose(
        np.array(
            df[cols]
            .applymap(lambda seq: [token2int[x] for x in seq])
            .values
            .tolist()
        ),
        (0, 2, 1)
    )

In [6]:
train_inputs = preprocess_inputs(train[train.signal_to_noise > 1])
train_labels = np.array(train[train.signal_to_noise > 1][target_cols].values.tolist()).transpose((0, 2, 1))

In [7]:
def gru_layer(hidden_dim, dropout):
    return tf.keras.layers.Bidirectional(
                                tf.keras.layers.GRU(hidden_dim,
                                dropout=dropout,
                                return_sequences=True,
                                kernel_initializer = 'orthogonal'))

def lstm_layer(hidden_dim, dropout):
    return tf.keras.layers.Bidirectional(
                                tf.keras.layers.LSTM(hidden_dim,
                                dropout=dropout,
                                return_sequences=True,
                                kernel_initializer = 'orthogonal'))

def build_model(gru=1,seq_len=107, pred_len=68, dropout=0.5,
                embed_dim=75, hidden_dim=128):
    
    inputs = tf.keras.layers.Input(shape=(seq_len, 3))

    embed = tf.keras.layers.Embedding(input_dim=len(token2int), output_dim=embed_dim)(inputs)
    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3]))
    
    reshaped = tf.keras.layers.SpatialDropout1D(.2)(reshaped)
    
    if gru==1:
        hidden = gru_layer(hidden_dim, dropout)(reshaped)
        hidden = gru_layer(hidden_dim, dropout)(hidden)
        hidden = gru_layer(hidden_dim, dropout)(hidden)
        
    elif gru==0:
        hidden = lstm_layer(hidden_dim, dropout)(reshaped)
        hidden = lstm_layer(hidden_dim, dropout)(hidden)
        hidden = lstm_layer(hidden_dim, dropout)(hidden)
        
    elif gru==3:
        hidden = gru_layer(hidden_dim, dropout)(reshaped)
        hidden = lstm_layer(hidden_dim, dropout)(hidden)
        hidden = lstm_layer(hidden_dim, dropout)(hidden)
        
    elif gru==4:
        hidden = lstm_layer(hidden_dim, dropout)(reshaped)
        hidden = gru_layer(hidden_dim, dropout)(hidden)
        hidden = gru_layer(hidden_dim, dropout)(hidden)
    elif gru==5:
        hidden = lstm_layer(hidden_dim, dropout)(reshaped)
        hidden = gru_layer(hidden_dim, dropout)(hidden)
        hidden = lstm_layer(hidden_dim, dropout)(reshaped)
    
    #only making predictions on the first part of each sequence
    truncated = hidden[:, :pred_len]
    
    out = tf.keras.layers.Dense(5, activation='linear')(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)

    #some optimizers
    adam = tf.optimizers.Adam()
    radam = tfa.optimizers.RectifiedAdam()
    lookahead = tfa.optimizers.Lookahead(adam, sync_period=6)
    ranger = tfa.optimizers.Lookahead(radam, sync_period=6)
    
    model.compile(optimizer = adam, loss='mse')
    
    return model

# Training

**Create train/val split now so both models are trained and evaluated on the same samples:**

In [8]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels,
                                                                     test_size=.1, random_state=34)

In [9]:
if tf.config.list_physical_devices('GPU') is not None:
    print('Training on GPU')

Training on GPU


**We will use a simple learning rate callback for now:**

In [10]:
lr_callback = tf.keras.callbacks.ReduceLROnPlateau()

### 1. GRU

In [11]:
gru = build_model(gru=1)
sv_gru = tf.keras.callbacks.ModelCheckpoint('model_gru.h5')

history_gru = gru.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_gru],
    verbose = 2
)

print(f"Min training loss={min(history_gru.history['loss'])}, min validation loss={min(history_gru.history['val_loss'])}")

Epoch 1/100
30/30 - 3s - loss: 0.2401 - val_loss: 0.1841
Epoch 2/100
30/30 - 1s - loss: 0.1772 - val_loss: 0.1598
Epoch 3/100
30/30 - 1s - loss: 0.1642 - val_loss: 0.1501
Epoch 4/100
30/30 - 1s - loss: 0.1552 - val_loss: 0.1401
Epoch 5/100
30/30 - 2s - loss: 0.1428 - val_loss: 0.1314
Epoch 6/100
30/30 - 1s - loss: 0.1357 - val_loss: 0.1250
Epoch 7/100
30/30 - 1s - loss: 0.1305 - val_loss: 0.1211
Epoch 8/100
30/30 - 1s - loss: 0.1263 - val_loss: 0.1190
Epoch 9/100
30/30 - 1s - loss: 0.1245 - val_loss: 0.1164
Epoch 10/100
30/30 - 1s - loss: 0.1201 - val_loss: 0.1150
Epoch 11/100
30/30 - 1s - loss: 0.1175 - val_loss: 0.1107
Epoch 12/100
30/30 - 1s - loss: 0.1141 - val_loss: 0.1072
Epoch 13/100
30/30 - 1s - loss: 0.1110 - val_loss: 0.1046
Epoch 14/100
30/30 - 1s - loss: 0.1083 - val_loss: 0.1020
Epoch 15/100
30/30 - 1s - loss: 0.1058 - val_loss: 0.0990
Epoch 16/100
30/30 - 1s - loss: 0.1043 - val_loss: 0.0969
Epoch 17/100
30/30 - 1s - loss: 0.1021 - val_loss: 0.0966
Epoch 18/100
30/30 - 1s

### 2. LSTM

In [12]:
lstm = build_model(gru=0)
sv_lstm = tf.keras.callbacks.ModelCheckpoint('model_lstm.h5')

history_lstm = lstm.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_lstm],
    verbose = 2
)

print(f"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}")

Epoch 1/100
30/30 - 4s - loss: 0.2650 - val_loss: 0.2035
Epoch 2/100
30/30 - 2s - loss: 0.1869 - val_loss: 0.1666
Epoch 3/100
30/30 - 2s - loss: 0.1664 - val_loss: 0.1535
Epoch 4/100
30/30 - 2s - loss: 0.1558 - val_loss: 0.1427
Epoch 5/100
30/30 - 2s - loss: 0.1466 - val_loss: 0.1336
Epoch 6/100
30/30 - 2s - loss: 0.1370 - val_loss: 0.1293
Epoch 7/100
30/30 - 2s - loss: 0.1333 - val_loss: 0.1269
Epoch 8/100
30/30 - 2s - loss: 0.1294 - val_loss: 0.1211
Epoch 9/100
30/30 - 2s - loss: 0.1247 - val_loss: 0.1190
Epoch 10/100
30/30 - 2s - loss: 0.1213 - val_loss: 0.1132
Epoch 11/100
30/30 - 2s - loss: 0.1180 - val_loss: 0.1124
Epoch 12/100
30/30 - 2s - loss: 0.1144 - val_loss: 0.1085
Epoch 13/100
30/30 - 2s - loss: 0.1111 - val_loss: 0.1047
Epoch 14/100
30/30 - 2s - loss: 0.1076 - val_loss: 0.1032
Epoch 15/100
30/30 - 2s - loss: 0.1054 - val_loss: 0.1006
Epoch 16/100
30/30 - 2s - loss: 0.1021 - val_loss: 0.0975
Epoch 17/100
30/30 - 2s - loss: 0.1002 - val_loss: 0.0960
Epoch 18/100
30/30 - 2s

# 3. Hyb1

In [13]:
lstm = build_model(gru=3)
sv_lstm = tf.keras.callbacks.ModelCheckpoint('model_hyb1.h5')

history_lstm = lstm.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_lstm],
    verbose = 2
)

print(f"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}")

Epoch 1/100
30/30 - 3s - loss: 0.2484 - val_loss: 0.1838
Epoch 2/100
30/30 - 1s - loss: 0.1738 - val_loss: 0.1596
Epoch 3/100
30/30 - 1s - loss: 0.1619 - val_loss: 0.1515
Epoch 4/100
30/30 - 1s - loss: 0.1533 - val_loss: 0.1419
Epoch 5/100
30/30 - 2s - loss: 0.1431 - val_loss: 0.1318
Epoch 6/100
30/30 - 1s - loss: 0.1331 - val_loss: 0.1245
Epoch 7/100
30/30 - 1s - loss: 0.1261 - val_loss: 0.1178
Epoch 8/100
30/30 - 2s - loss: 0.1206 - val_loss: 0.1151
Epoch 9/100
30/30 - 2s - loss: 0.1171 - val_loss: 0.1109
Epoch 10/100
30/30 - 2s - loss: 0.1121 - val_loss: 0.1068
Epoch 11/100
30/30 - 1s - loss: 0.1089 - val_loss: 0.1018
Epoch 12/100
30/30 - 2s - loss: 0.1058 - val_loss: 0.0997
Epoch 13/100
30/30 - 2s - loss: 0.1020 - val_loss: 0.0972
Epoch 14/100
30/30 - 2s - loss: 0.1000 - val_loss: 0.0957
Epoch 15/100
30/30 - 2s - loss: 0.0964 - val_loss: 0.0923
Epoch 16/100
30/30 - 2s - loss: 0.0944 - val_loss: 0.0894
Epoch 17/100
30/30 - 2s - loss: 0.0919 - val_loss: 0.0875
Epoch 18/100
30/30 - 2s

# 4. Hyb2

In [14]:
lstm = build_model(gru=4)
sv_lstm = tf.keras.callbacks.ModelCheckpoint('model_hyb2.h5')

history_lstm = lstm.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_lstm],
    verbose = 2
)

print(f"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}")

Epoch 1/100
30/30 - 3s - loss: 0.2463 - val_loss: 0.1804
Epoch 2/100
30/30 - 1s - loss: 0.1747 - val_loss: 0.1609
Epoch 3/100
30/30 - 1s - loss: 0.1635 - val_loss: 0.1495
Epoch 4/100
30/30 - 1s - loss: 0.1525 - val_loss: 0.1388
Epoch 5/100
30/30 - 1s - loss: 0.1428 - val_loss: 0.1326
Epoch 6/100
30/30 - 1s - loss: 0.1377 - val_loss: 0.1302
Epoch 7/100
30/30 - 1s - loss: 0.1335 - val_loss: 0.1249
Epoch 8/100
30/30 - 1s - loss: 0.1298 - val_loss: 0.1219
Epoch 9/100
30/30 - 1s - loss: 0.1260 - val_loss: 0.1180
Epoch 10/100
30/30 - 1s - loss: 0.1239 - val_loss: 0.1159
Epoch 11/100
30/30 - 1s - loss: 0.1200 - val_loss: 0.1124
Epoch 12/100
30/30 - 1s - loss: 0.1168 - val_loss: 0.1110
Epoch 13/100
30/30 - 1s - loss: 0.1142 - val_loss: 0.1097
Epoch 14/100
30/30 - 1s - loss: 0.1118 - val_loss: 0.1045
Epoch 15/100
30/30 - 2s - loss: 0.1082 - val_loss: 0.1027
Epoch 16/100
30/30 - 1s - loss: 0.1055 - val_loss: 0.0985
Epoch 17/100
30/30 - 1s - loss: 0.1026 - val_loss: 0.0969
Epoch 18/100
30/30 - 1s

# 5. hyb3

In [15]:
lstm = build_model(gru=5)
sv_lstm = tf.keras.callbacks.ModelCheckpoint('model_hyb3.h5')

history_lstm = lstm.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_lstm],
    verbose = 2
)

print(f"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}")

Epoch 1/100
30/30 - 1s - loss: 0.2621 - val_loss: 0.1958
Epoch 2/100
30/30 - 1s - loss: 0.1828 - val_loss: 0.1666
Epoch 3/100
30/30 - 1s - loss: 0.1642 - val_loss: 0.1515
Epoch 4/100
30/30 - 1s - loss: 0.1522 - val_loss: 0.1438
Epoch 5/100
30/30 - 1s - loss: 0.1428 - val_loss: 0.1336
Epoch 6/100
30/30 - 1s - loss: 0.1353 - val_loss: 0.1288
Epoch 7/100
30/30 - 1s - loss: 0.1308 - val_loss: 0.1251
Epoch 8/100
30/30 - 1s - loss: 0.1278 - val_loss: 0.1227
Epoch 9/100
30/30 - 1s - loss: 0.1244 - val_loss: 0.1197
Epoch 10/100
30/30 - 1s - loss: 0.1219 - val_loss: 0.1171
Epoch 11/100
30/30 - 1s - loss: 0.1197 - val_loss: 0.1163
Epoch 12/100
30/30 - 1s - loss: 0.1178 - val_loss: 0.1130
Epoch 13/100
30/30 - 1s - loss: 0.1156 - val_loss: 0.1112
Epoch 14/100
30/30 - 1s - loss: 0.1135 - val_loss: 0.1105
Epoch 15/100
30/30 - 1s - loss: 0.1120 - val_loss: 0.1068
Epoch 16/100
30/30 - 1s - loss: 0.1089 - val_loss: 0.1062
Epoch 17/100
30/30 - 1s - loss: 0.1081 - val_loss: 0.1053
Epoch 18/100
30/30 - 1s

# Model Evaluation

# Inference and Submission

In [16]:
public_df = test.query("seq_length == 107").copy()
private_df = test.query("seq_length == 130").copy()

public_inputs = preprocess_inputs(public_df)
private_inputs = preprocess_inputs(private_df)

**Predict twice, one for the public leaderboard, the other for the private leaderboard:**

In [17]:
# build all models
gru_short = build_model(gru=1, seq_len=107, pred_len=107)
gru_long = build_model(gru=1, seq_len=130, pred_len=130)
lstm_short = build_model(gru=0, seq_len=107, pred_len=107)
lstm_long = build_model(gru=0, seq_len=130, pred_len=130)
hyb1_short = build_model(gru=3, seq_len=107, pred_len=107)
hyb1_long = build_model(gru=3, seq_len=130, pred_len=130)
hyb2_short = build_model(gru=4, seq_len=107, pred_len=107)
hyb2_long = build_model(gru=4, seq_len=130, pred_len=130)
hyb3_short = build_model(gru=5, seq_len=107, pred_len=107)
hyb3_long = build_model(gru=5, seq_len=130, pred_len=130)


# load pre-trained model weights
gru_short.load_weights('model_gru.h5')
gru_long.load_weights('model_gru.h5')
lstm_short.load_weights('model_lstm.h5')
lstm_long.load_weights('model_lstm.h5')
hyb1_short.load_weights('model_hyb1.h5')
hyb1_long.load_weights('model_hyb1.h5')
hyb2_short.load_weights('model_hyb2.h5')
hyb2_long.load_weights('model_hyb2.h5')
hyb3_short.load_weights('model_hyb3.h5')
hyb3_long.load_weights('model_hyb3.h5')

# and predict
gru_public_preds = gru_short.predict(public_inputs)
gru_private_preds = gru_long.predict(private_inputs)
lstm_public_preds = lstm_short.predict(public_inputs)
lstm_private_preds = lstm_long.predict(private_inputs)
hyb1_public_preds = hyb1_short.predict(public_inputs)
hyb1_private_preds = hyb1_long.predict(private_inputs)
hyb2_public_preds = hyb2_short.predict(public_inputs)
hyb2_private_preds = hyb2_long.predict(private_inputs)
hyb3_public_preds = hyb3_short.predict(public_inputs)
hyb3_private_preds = hyb3_long.predict(private_inputs)

**Now we just need to change the shape of each sample to the long format:**

In [18]:
preds_gru = []

for df, preds in [(public_df, gru_public_preds), (private_df, gru_private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=target_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_gru.append(single_df)

preds_gru_df = pd.concat(preds_gru)
preds_gru_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,id_seqpos
0,0.802686,0.631891,1.887196,0.470607,0.66498,id_00073f8be_0
1,2.659551,3.646708,4.617129,3.636235,2.827826,id_00073f8be_1
2,1.323967,0.511773,0.578476,0.659918,0.60789,id_00073f8be_2
3,1.259606,1.062628,1.240213,1.662021,1.5598,id_00073f8be_3
4,0.808406,0.488987,0.586882,0.807934,0.800499,id_00073f8be_4


**Now we do the same for the LSTM model so we can blend their predictions:**

In [19]:
preds_lstm = []

for df, preds in [(public_df, lstm_public_preds), (private_df, lstm_private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=target_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_lstm.append(single_df)

preds_lstm_df = pd.concat(preds_lstm)
preds_lstm_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,id_seqpos
0,0.740205,0.704198,1.973906,0.538578,0.732464,id_00073f8be_0
1,2.322335,3.062538,4.344035,3.259643,2.861405,id_00073f8be_1
2,1.578519,0.523952,0.661763,0.629576,0.733595,id_00073f8be_2
3,1.345294,1.09811,1.204903,1.573127,1.545305,id_00073f8be_3
4,0.858014,0.600732,0.575061,0.847883,0.799747,id_00073f8be_4


For hyb1:

In [20]:
preds_hyb1 = []

for df, preds in [(public_df, hyb1_public_preds), (private_df, hyb1_private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=target_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_hyb1.append(single_df)

preds_hyb1_df = pd.concat(preds_hyb1)
preds_hyb1_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,id_seqpos
0,0.748213,0.659549,1.936393,0.571567,0.788597,id_00073f8be_0
1,2.467142,3.334278,4.816895,3.502726,2.962131,id_00073f8be_1
2,1.32988,0.480445,0.63829,0.511286,0.607352,id_00073f8be_2
3,1.228438,1.203523,1.293523,1.837928,1.727615,id_00073f8be_3
4,0.790498,0.567234,0.622166,0.812318,0.909496,id_00073f8be_4


For Hyb2

In [21]:
preds_hyb2 = []

for df, preds in [(public_df, hyb2_public_preds), (private_df, hyb2_private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=target_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_hyb2.append(single_df)

preds_hyb2_df = pd.concat(preds_hyb2)
preds_hyb2_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,id_seqpos
0,0.754849,0.719557,1.983732,0.553823,0.747772,id_00073f8be_0
1,2.296725,2.988256,4.389574,3.097913,2.851385,id_00073f8be_1
2,1.412596,0.505168,0.629091,0.582657,0.621628,id_00073f8be_2
3,1.271609,1.131229,1.248865,1.623346,1.585649,id_00073f8be_3
4,0.846073,0.608528,0.62828,0.83842,0.925739,id_00073f8be_4


For hyb3

In [22]:
preds_hyb3 = []

for df, preds in [(public_df, hyb3_public_preds), (private_df, hyb3_private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=target_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_hyb3.append(single_df)

preds_hyb3_df = pd.concat(preds_hyb3)
preds_hyb3_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,id_seqpos
0,0.589876,0.580705,1.907721,0.490118,0.644843,id_00073f8be_0
1,1.79867,2.7245,3.509731,2.724572,2.380638,id_00073f8be_1
2,0.985079,0.490844,0.610022,0.681317,0.702119,id_00073f8be_2
3,1.102712,0.989122,1.087607,1.471446,1.431479,id_00073f8be_3
4,0.768012,0.575922,0.602951,0.850199,0.855314,id_00073f8be_4


**And now we blend:**

In [23]:
blend_preds_df = pd.DataFrame()
blend_preds_df['id_seqpos'] = preds_gru_df['id_seqpos']
blend_preds_df['reactivity'] = 0.2*preds_gru_df['reactivity'] + 0.2*preds_lstm_df['reactivity'] + 0.2*preds_hyb1_df['reactivity'] + 0.2*preds_hyb2_df['reactivity'] + 0.2*preds_hyb3_df['reactivity']
blend_preds_df['deg_Mg_pH10'] = 0.2*preds_gru_df['deg_Mg_pH10'] + 0.2*preds_lstm_df['deg_Mg_pH10'] + 0.2*preds_hyb1_df['deg_Mg_pH10'] + 0.2*preds_hyb2_df['deg_Mg_pH10'] + 0.2*preds_hyb3_df['deg_Mg_pH10']
blend_preds_df['deg_pH10'] = 0.2*preds_gru_df['deg_pH10'] + 0.2*preds_lstm_df['deg_pH10'] + 0.2*preds_hyb1_df['deg_pH10'] + 0.2*preds_hyb2_df['deg_pH10'] + 0.2*preds_hyb3_df['deg_pH10']
blend_preds_df['deg_Mg_50C'] = 0.2*preds_gru_df['deg_Mg_50C'] + 0.2*preds_lstm_df['deg_Mg_50C'] + 0.2*preds_hyb1_df['deg_Mg_50C'] + 0.2*preds_hyb2_df['deg_Mg_50C'] + 0.2*preds_hyb3_df['deg_Mg_50C']
blend_preds_df['deg_50C'] = 0.2*preds_gru_df['deg_50C'] + 0.2*preds_lstm_df['deg_50C'] + 0.2*preds_hyb1_df['deg_50C'] + 0.2*preds_hyb2_df['deg_Mg_50C'] + 0.2*preds_hyb3_df['deg_Mg_50C']

In [24]:
submission = sample_sub[['id_seqpos']].merge(blend_preds_df, on=['id_seqpos'])

#sanity check
submission.head()

Unnamed: 0,id_seqpos,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,id_00073f8be_0,0.727166,0.65918,1.93779,0.524939,0.645996
1,id_00073f8be_1,2.308884,3.151256,4.335473,3.244218,2.894769
2,id_00073f8be_2,1.326008,0.502436,0.623528,0.612951,0.642562
3,id_00073f8be_3,1.241532,1.096922,1.215022,1.633573,1.585502
4,id_00073f8be_4,0.814201,0.568281,0.603068,0.831351,0.839672


In [25]:
submission.to_csv('submission.csv', index=False)
print('Submission saved')

Submission saved
