In [1]:
import numpy as np
import pandas as pd
import json
import torch
from tqdm import tqdm
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Location of the trained model's state dict; it is loaded right before inference.
WEIGHTS_PATH = './weights.pth'
# Length of the per-position context window built by char_encode(); it is also
# the input width of the model's first Linear layer.
FEATURE_SIZE = 21
# Number of windows per DataLoader batch.
BATCH_SIZE = 256

In [3]:
def one_hot(categories, string):
    """One-hot encode every symbol of ``string`` against ``categories``.

    Args:
        categories: Ordered collection of allowed symbols; a symbol's position
            in it selects the column that is set to 1.
        string: Sequence of symbols to encode.

    Returns:
        A float array of shape ``(len(string), len(categories))`` with exactly
        one 1 per row.

    Raises:
        ValueError: If a symbol of ``string`` is not present in ``categories``.
    """
    n_rows = len(string)
    # Resolve each symbol to its column first; .index() raises ValueError on
    # an unknown symbol.
    column_of = np.array([categories.index(symbol) for symbol in string], dtype=np.intp)

    encoding = np.zeros((n_rows, len(categories)))
    encoding[np.arange(n_rows), column_of] = 1
    return encoding

def featurize(entity):
    """Build the per-position feature matrix for one record.

    Args:
        entity: Mapping providing the strings ``'sequence'``, ``'structure'``
            and ``'predicted_loop_type'``. They must all have the same length,
            because their encodings are stacked side by side.

    Returns:
        An array with one row per position and 14 columns: 4 for the sequence
        alphabet, 3 for the structure alphabet and 7 for the loop-type alphabet.
    """
    # (record key, alphabet) pairs, in the column order of the output.
    alphabets = (
        ('sequence', 'ACGU'),
        ('structure', '.()'),
        ('predicted_loop_type', 'BEHIMSX'),
    )
    encoded_parts = [one_hot(list(symbols), entity[key]) for key, symbols in alphabets]
    return np.hstack(encoded_parts)

def char_encode(index, features, feature_size):
    """Extract a zero-padded window of feature rows centred on ``index``.

    The window spans ``half_size = (feature_size - 1) // 2`` rows on each side
    of ``index``, so it always has ``2 * half_size + 1`` rows (equal to
    ``feature_size`` when ``feature_size`` is odd). Rows that would fall
    outside ``features`` are filled with zeros. Padding is computed
    independently for each side, so sequences shorter than the window are
    padded on both ends.

    Args:
        index: Row of ``features`` to centre the window on.
        features: 2-D array with one row per sequence position.
        feature_size: Requested window length.

    Returns:
        A 2-D array with ``2 * half_size + 1`` rows and ``features.shape[1]``
        columns. When no padding is needed this is a slice of ``features``.
    """
    half_size = (feature_size - 1) // 2

    start = index - half_size
    stop = index + half_size + 1

    # How many rows of the window fall off each end of the sequence.
    pad_before = int(max(0, -start))
    pad_after = int(max(0, stop - len(features)))

    char_features = features[max(0, start):stop]

    if pad_before:
        padding = np.zeros((pad_before, char_features.shape[1]))
        char_features = np.vstack([padding, char_features])
    if pad_after:
        padding = np.zeros((pad_after, char_features.shape[1]))
        char_features = np.vstack([char_features, padding])

    return char_features

In [4]:
class VaxDataset(Dataset):
    """Dataset of per-position context windows read from a JSON-lines file.

    Every line of the file is parsed as one JSON record. For each of the first
    ``seq_scored`` positions of a record, one sample is produced: a window of
    one-hot features centred on that position, plus an id of the form
    ``'<record id>_<position>'``. Unless ``test`` is true, each sample also
    carries the targets ``reactivity``, ``deg_Mg_pH10`` and ``deg_Mg_50C``
    for that position.
    """

    def __init__(self, path, test=False):
        """Remember the settings and load the whole file into memory.

        Args:
            path: Path of the JSON-lines file to read.
            test: When true, no targets are loaded or returned.
        """
        self.path = path
        self.test = test
        self.features, self.targets, self.ids = [], [], []
        self.load_data()

    def load_data(self):
        """Parse the file and fill ``features``, ``ids`` and, unless in test mode, ``targets``."""
        with open(self.path, 'r') as handle:
            for raw_line in handle:
                record = json.loads(raw_line)
                encoded = featurize(record)
                n_scored = record['seq_scored']

                self.features.extend(
                    char_encode(position, encoded, FEATURE_SIZE)
                    for position in range(n_scored)
                )
                self.ids.extend(
                    '%s_%d' % (record['id'], position)
                    for position in range(n_scored)
                )

                if self.test:
                    continue

                # One row per position, one column per target.
                label_matrix = np.stack(
                    [record['reactivity'], record['deg_Mg_pH10'], record['deg_Mg_50C']],
                    axis=1,
                )
                self.targets.extend(label_matrix[position] for position in range(n_scored))

    def __len__(self):
        """Return the number of per-position samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return ``(features, id)`` in test mode, else ``(features, targets, id)``."""
        window = self.features[index]
        if not self.test:
            return window, self.targets[index], self.ids[index]
        return window, self.ids[index]

In [5]:
# Load every scored position of the test file into memory, then serve it in
# file order (no shuffling) and keep the final partial batch.
test_dataset = VaxDataset('../input/stanford-covid-vaccine/test.json', test=True)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)

In [6]:
class Flatten(nn.Module):
    """Reshape a batch so that all dimensions after the first are merged into one."""

    def forward(self, x):
        """Return ``x`` viewed as ``(x.shape[0], -1)``."""
        return x.view(x.size(0), -1)

class VaxModel(nn.Module):
    """Small convolution + MLP network that predicts three values per window.

    Input is expected as ``(batch, 14, FEATURE_SIZE)``: 14 feature channels by
    window length. Two kernel-size-1 convolutions reduce the channels to one,
    the result is flattened to ``(batch, FEATURE_SIZE)``, and two linear layers
    map it to ``(batch, 3)``.
    """

    def __init__(self):
        super().__init__()
        # Module order must not change: the positions inside nn.Sequential
        # determine the state_dict keys of the saved weights.
        channel_mixing = [
            nn.Dropout(0.2),
            nn.Conv1d(14, 32, 1, 1),
            nn.PReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),
            nn.Conv1d(32, 1, 1, 1),
            nn.PReLU(),
        ]
        regression_head = [
            Flatten(),
            nn.Dropout(0.2),
            nn.Linear(FEATURE_SIZE, 32),
            nn.PReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),
            nn.Linear(32, 3),
        ]
        self.layers = nn.Sequential(*channel_mixing, *regression_head)

    def forward(self, features):
        """Run the full layer stack on ``features`` and return its output."""
        return self.layers(features)

In [7]:
# Build the network with freshly initialised parameters; the trained weights
# are loaded from WEIGHTS_PATH in the inference cell below.
model = VaxModel()

In [8]:
# Use the sample submission as the output template. It is indexed by
# 'id_seqpos' so predictions can be written by row label.
sub = pd.read_csv('../input/stanford-covid-vaccine/sample_submission.csv', index_col='id_seqpos')

In [9]:
# Load the trained weights onto the CPU. The model is never moved to another
# device, and map_location keeps this working even if the checkpoint was saved
# from a GPU.
model.load_state_dict(torch.load(WEIGHTS_PATH, map_location='cpu'))
# Switch Dropout / BatchNorm to inference behaviour.
model.eval()

target_columns = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']

# Gradients are not needed for inference; no_grad avoids building autograd graphs.
with torch.no_grad():
    for features, ids in tqdm(test_dataloader):
        # Dataset windows are (batch, window, channels); Conv1d wants
        # (batch, channels, window) in float32.
        features = features.permute(0, 2, 1).float()
        predictions = model(features)
        # Pass the ids as a plain list so pandas treats them as row labels
        # rather than as a single tuple key.
        sub.loc[list(ids), target_columns] = predictions.numpy()

100%|██████████| 1236/1236 [00:04<00:00, 287.60it/s]


In [10]:
# Preview the first rows to check that the three predicted columns were filled.
sub.head()

Unnamed: 0_level_0,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
id_seqpos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_00073f8be_0,0.880476,0.791534,0.0,0.737347,0.0
id_00073f8be_1,1.513938,1.828215,0.0,1.77948,0.0
id_00073f8be_2,1.25102,1.127661,0.0,1.170841,0.0
id_00073f8be_3,1.11505,0.934803,0.0,1.0662,0.0
id_00073f8be_4,0.920037,0.854155,0.0,0.937674,0.0


In [11]:
# Write the submission; the 'id_seqpos' index is saved as the first column.
sub.to_csv('./submission.csv')