# Introduction

<span style="font-size:18px;"> This notebook was written for the advanced challenge of UmojaHack 2022, organized by InstaDeep and hosted on Zindi.<br>
The provided data contains amino acid sequences extracted from different species of snakes, together with the different kinds of antivenoms tested against them. The target is a signal that expresses the binding of the antivenom to K-mers (of length 16) taken from the toxin's sequence.</span>

# Notebook Setup

In [1]:
%load_ext tensorboard

In [2]:
import random, gc, time, os
import warnings
warnings.simplefilter('ignore')
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.model_selection import GroupKFold 
from torch import nn 
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
from torch.utils.tensorboard import SummaryWriter

# Utils

In [3]:
def make_reproduceable(SEED=8):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        SEED: Integer seed shared by all generators.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    # Seed all visible GPUs, not only the current one (harmless without CUDA).
    torch.cuda.manual_seed_all(SEED)
    # NOTE: this only reaches interpreters started afterwards; the hash seed
    # of the running process was fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    torch.backends.cudnn.deterministic = True
    # cuDNN auto-tuning may select different kernels from one run to the next.
    torch.backends.cudnn.benchmark = False

In [4]:
# Thanks to Assazzin (https://github.com/ASSAZZIN-01) for this wonderful function
def free_memory(sleep_time=0.1):
    """Release Python garbage and cached CUDA memory, then pause briefly.

    The CUDA calls are skipped when no GPU is available so the helper can
    also be used on CPU-only machines.

    Args:
        sleep_time: Seconds to sleep after the cleanup.
    """
    has_cuda = torch.cuda.is_available()
    gc.collect()
    if has_cuda:
        # Wait for pending kernels so their tensors can actually be released.
        torch.cuda.synchronize()
    gc.collect()
    if has_cuda:
        torch.cuda.empty_cache()
    time.sleep(sleep_time)

# Data Preparation

In [5]:
class CSV_Data:
    """Train/test tables of the challenge plus the features derived from them.

    Attributes:
        train: Training frame read from ``train_path``.
        test: Test frame read from ``test_path``.
        bert_map: Result of ``seq_mapper(using="Rostlab/prot_bert")``: one
            vector per character found in the ``Toxin_Kmer`` column.
    """

    def __init__(self, train_path, test_path):
        """Read both CSV files and pre-compute the per-character BERT vectors."""
        self.train = pd.read_csv(train_path)
        self.test = pd.read_csv(test_path)
        self.bert_map = self.seq_mapper(using="Rostlab/prot_bert")
        free_memory()

    def seq_mapper(self, column="Toxin_Kmer", using=None):
        """Map every character found in ``column`` (train and test) to a code.

        Args:
            column: Name of a string column present in both frames.
            using: ``None`` to get integer ids starting at 1, or the name of a
                pretrained BERT checkpoint to get one vector per character.

        Returns:
            Dict keyed by character. Characters are processed in sorted order
            so the integer ids are identical from one run to the next.
        """
        chrs = sorted(
            set("".join(self.train[column])) | set("".join(self.test[column]))
        )
        if using is None:
            return {c: ix for ix, c in enumerate(chrs, start=1)}

        tokenizer = BertTokenizer.from_pretrained(using, do_lower_case=False)
        model = BertModel.from_pretrained(using)
        # NOTE(review): position 2 of the encoded single character is kept as
        # in the original code; if the tokenizer emits [CLS] <char> [SEP],
        # position 1 would be the character itself - TODO confirm the intent.
        with torch.no_grad():  # only the values are needed, never gradients
            return {
                c: model(**tokenizer(c, return_tensors="pt"))[0][0, 2, :].detach().numpy()
                for c in chrs
            }

    def species_mapper(self):
        """Map every '_'-separated token of ``Species`` to an id starting at 1."""
        tokens = set()
        for name in set(self.train.Species) | set(self.test.Species):
            tokens.update(name.split('_'))
        return {tok: ix for ix, tok in enumerate(sorted(tokens), start=1)}

    def others_mapper(self, column):
        """Map every distinct value of ``column`` (train and test) to an id from 1."""
        values = set(self.train[column]) | set(self.test[column])
        # key=str keeps the ordering well defined even for mixed value types.
        return {v: ix for ix, v in enumerate(sorted(values, key=str), start=1)}

    def add_bert_emb(self):
        """Add a ``kmer_bert`` column: the ``bert_map`` vector of each k-mer character.

        Does nothing when the training frame already has that column.
        """
        if "kmer_bert" in self.train.columns:
            return
        print("- Adding Bert embedding feature...")
        tic = time.time()

        def embed(kmer):
            return [self.bert_map[e] for e in kmer]

        self.train["kmer_bert"] = self.train.Toxin_Kmer.apply(embed)
        self.test["kmer_bert"] = self.test.Toxin_Kmer.apply(embed)
        toc = time.time()
        print(f"- Adding Bert embedding feature finished in {toc-tic} seconds.")

    @staticmethod
    def _concat_sequences(df):
        """Rebuild, for every row of ``df``, the sequence its k-mer belongs to.

        Consecutive rows belong to the same sequence while the toxin id stays
        the same and ``Kmer_Position_start`` keeps increasing; each such row
        contributes the last character of its k-mer. Returns one rebuilt
        sequence per row, in row order.
        """
        positions = df["Kmer_Position_start"].tolist()
        toxins = df["Toxin_UniprotID"].tolist()
        kmers = df["Toxin_Kmer"].tolist()
        if not kmers:
            return []

        per_row = []
        seq, run = kmers[0], 1
        cur_pos, cur_tox = positions[0], toxins[0]
        for pos, tox, kmer in zip(positions[1:], toxins[1:], kmers[1:]):
            if pos > cur_pos and tox == cur_tox:
                seq += kmer[-1]
                run += 1
            else:
                # A new run starts: emit the finished sequence for its rows.
                per_row.extend([seq] * run)
                seq, run = kmer, 1
                cur_tox = tox
            cur_pos = pos
        per_row.extend([seq] * run)  # flush the last run (also the 1-row case)
        return per_row

    def concatenate(self):
        """Add a ``sequence`` column holding the full sequence of each row's k-mer.

        Does nothing when the training frame already has that column.
        """
        if "sequence" in self.train.columns:
            return
        print("- Concatenation operation starting...")
        tic = time.time()
        self.train["sequence"] = self._concat_sequences(self.train)
        self.test["sequence"] = self._concat_sequences(self.test)
        toc = time.time()
        print(f"- Concatenation operation finished in {toc-tic} seconds")

    def seq_mask(self, max_len=364):
        """Add a ``sequence_mask`` column: 0 per sequence character, then 1 up to ``max_len``.

        Sequences longer than ``max_len`` get no padding entries (the mask is
        not truncated). Does nothing when the training frame already has the
        column.
        """
        if "sequence_mask" in self.train.columns:
            return

        def build_mask(seq):
            return [0] * len(seq) + max(0, max_len - len(seq)) * [1]

        self.train["sequence_mask"] = self.train.sequence.apply(build_mask)
        self.test["sequence_mask"] = self.test.sequence.apply(build_mask)
        

In [6]:
# Locations of the challenge tables; loading them also builds the
# per-character BERT lookup inside CSV_Data.
train_path = "https://storage.googleapis.com/umojahack2022/train.csv"
test_path = "https://storage.googleapis.com/umojahack2022/test.csv"
data = CSV_Data(train_path=train_path, test_path=test_path)

In [7]:
# Display the training frame.
data.train

In [8]:
# Display the test frame.
data.test

In [9]:
# Display a single training row (position 5) to see every column of one sample.
data.train.iloc[5]

In [10]:
class Dataset:
    """Map-style dataset that turns rows of a ``CSV_Data`` object into tensors.

    NOTE: the class name shadows ``torch.utils.data.Dataset`` imported at the
    top of the notebook; it is kept because the rest of the notebook uses it.

    Args:
        data: Object exposing ``train``/``test`` frames and the mapper and
            preprocessing methods of ``CSV_Data``.
        features: Names of the optional entries to put in each sample.
        state: ``"Train"`` (all training rows), ``"Train select"`` /
            ``"Val select"`` (positional subsets of the training rows); any
            other value selects the test frame.
        selected_train: Row positions used when ``state == "Train select"``.
        selected_val: Row positions used when ``state == "Val select"``.
    """

    # Length every "seq" tensor is padded to.
    MAX_SEQ_LEN = 364

    def __init__(self, data, features, state, selected_train=None, selected_val=None):
        self.d = data
        self.features = features
        self.state = state

        # The derived columns have to exist BEFORE rows are selected: ``iloc``
        # with a list of positions returns a copy, which would otherwise lack
        # the columns added afterwards.
        self.d.concatenate()
        self.d.seq_mask()
        self.d.add_bert_emb()

        if state == "Train select":
            self.data = self.d.train.iloc[selected_train]
        elif state == "Val select":
            self.data = self.d.train.iloc[selected_val]
        elif state == "Train":
            self.data = self.d.train
        else:
            self.data = self.d.test

        self.kmer_map = self.d.seq_mapper()
        self.spec_map = self.d.species_mapper()
        self.antivenom_map = self.d.others_mapper("Antivenom")
        self.protfam_map = self.d.others_mapper("ProteinFam")

    def __len__(self):
        """Return the number of selected rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            ``(inputs, signal)`` for the training states, ``inputs`` alone
            otherwise. ``inputs`` always holds ``"kmer"`` plus one entry per
            requested feature.
        """
        d = dict()
        row = self.data.iloc[idx]
        d["kmer"] = torch.as_tensor([self.kmer_map[e] for e in row["Toxin_Kmer"]])
        if "kmer_bert" in self.features:
            d["kmer_bert"] = torch.tensor(row["kmer_bert"], dtype=torch.float)
        if "seq" in self.features:
            # Right-pad with 0, an id no character uses (ids start at 1).
            seq_ids = [self.kmer_map[e] for e in row["sequence"]]
            padding = [0] * (self.MAX_SEQ_LEN - len(row["sequence"]))
            d["seq"] = torch.tensor(seq_ids + padding, dtype=torch.long)
        if "seq_mask" in self.features:
            d["seq_mask"] = torch.tensor(row["sequence_mask"], dtype=torch.uint8)
        if "species" in self.features:
            d["species"] = torch.tensor(
                [self.spec_map[e] for e in row["Species"].split("_")], dtype=torch.long
            )
        if "antivenom" in self.features:
            d["antivenom"] = torch.as_tensor(self.antivenom_map[row["Antivenom"]])
        if "protfam" in self.features:
            d["protfam"] = torch.tensor(self.protfam_map[row["ProteinFam"]], dtype=torch.long)
        if "pos_start" in self.features:
            d["pos_start"] = torch.as_tensor(row["Kmer_Position_start"])

        if self.state in ["Train", "Train select", "Val select"]:
            signal = torch.as_tensor([row["Signal"]])
            return d, signal
        return d

In [11]:
# Seed all random number generators (default seed) before any data loader or model is created.
make_reproduceable()

In [12]:
# Smoke test: build the full training set with every feature and look at the
# shapes of one batch.
train_data = Dataset(data, features = ["kmer", "kmer_bert", "antivenom", "species", "protfam", "seq", "seq_mask", "pos_start"], state = "Train")
train_data_loader = DataLoader(train_data, shuffle=True, batch_size=64, num_workers=2)
# Use the built-in next(): an iterator's ``.next()`` method is not part of the
# Python 3 iterator protocol and is missing from recent PyTorch loaders.
x, y = next(iter(train_data_loader))

print(f"K_mer shape: {x['kmer'].shape}")
print(f"K_mer bert shape: {x['kmer_bert'].shape}")
print(f"antivenom shape: {x['antivenom'].shape}")
print(f"species shape: {x['species'].shape}")
print(f"protfam shape: {x['protfam'].shape}")
print(f"seq shape: {x['seq'].shape}")
print(f"seq mask shape: {x['seq_mask'].shape}")
print(f"pos start shape: {x['pos_start'].shape}")
print(f"target shape: {y.shape}")

# Training and Evaluation Functions

In [13]:
def train_func(
    train_data_loader,
    val_data_loader,
    features,
    model,
    loss_fn,
    optimizer,
    num_epochs,
    device,
    writer,
    early_stopping=5,
):
    """Train ``model`` and validate it after every epoch, with early stopping.

    Args:
        train_data_loader: Loader yielding ``(inputs_dict, target)`` batches.
        val_data_loader: Loader with the same batch layout, used for validation.
        features: Keys of the input dict that are moved to ``device`` and
            passed to the model.
        model: Module called as ``model(inputs_dict)``.
        loss_fn: Callable ``loss_fn(prediction, target)``.
        optimizer: Optimizer over the model parameters.
        num_epochs: Maximum number of epochs.
        device: Device the batches are moved to.
        writer: Object with ``add_scalar(tag, value, step)`` used for logging.
        early_stopping: Training stops once the number of epochs without
            improvement of the mean validation loss exceeds this value.
    """
    total_batches = len(train_data_loader)
    total_batches_val = len(val_data_loader)

    # Early-stopping state has to live across epochs; resetting it inside the
    # epoch loop would keep the patience counter at zero forever.
    best_val_loss = np.inf
    waiting = 0

    n_iter = 0
    for epoch in range(num_epochs):
        free_memory()
        train_loss = []  # per-epoch, so "epoch loss" really is this epoch's mean
        tqdm_bar = tqdm(train_data_loader, desc=f"epoch {epoch}", position=0)
        model.train()
        for batch_number, (X, y) in enumerate(tqdm_bar):
            y = y.float().to(device)
            X = {k: X[k].to(device) for k in features}

            optimizer.zero_grad()
            pred = model(X)
            loss = loss_fn(pred, y)
            loss.backward()

            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            loss = loss.item()
            train_loss.append(loss)

            writer.add_scalar("loss/train", loss, n_iter)
            n_iter += 1

            if batch_number % 25 == 0:
                tqdm_bar.set_postfix(
                    {
                        "train": f"{batch_number}/{total_batches} loss: {loss:.3} epoch loss: {np.mean(train_loss):.3}",
                    },
                )

        val_tqdm_bar = tqdm(
            val_data_loader, desc=f"epoch {epoch}", position=0, leave=True,
        )
        val_loss = []
        model.eval()
        with torch.no_grad():
            for batch_number, (X, y) in enumerate(val_tqdm_bar):
                y = y.float().to(device)
                X = {k: X[k].to(device) for k in features}

                pred = model(X)
                val_loss.append(loss_fn(pred, y).item())

                if batch_number % 25 == 0:
                    val_tqdm_bar.set_postfix(
                        {
                            "valid": f"{batch_number}/{total_batches_val} val loss: {np.mean(val_loss):.3}"
                        },
                    )

        new_val_loss = np.mean(val_loss)
        # Log the real validation loss, once per epoch, at the current
        # training step so both curves share the same x axis.
        writer.add_scalar("loss/validation", new_val_loss, n_iter)

        if new_val_loss > best_val_loss:
            waiting += 1
        else:
            best_val_loss = new_val_loss
            waiting = 0  # an improvement restarts the patience window

        if waiting > early_stopping:
            print("Early Stopping")
            break

In [56]:
def predict_test(data_loader, paths, device, features=None):
    """Run every saved model over ``data_loader`` and collect its predictions.

    Args:
        data_loader: Loader yielding dicts of input tensors (no targets).
        paths: Paths of models saved with ``torch.save(model, path)``.
        device: Device used for inference.
        features: Keys of each batch to move to ``device`` and pass to the
            model. ``None`` (default) forwards every key of the batch, so the
            function no longer relies on a global ``features`` variable.

    Returns:
        Dict mapping ``str(i)`` to the concatenated predictions of the i-th
        model, as a NumPy array.
    """
    sub = dict()
    for i, path in enumerate(paths):
        # NOTE(security): torch.load unpickles the file - only load
        # checkpoints you created yourself.
        # map_location lets GPU-trained checkpoints load on the target device.
        model = torch.load(path, map_location=device).to(device)
        model.eval()  # make sure dropout / batch-norm run in inference mode
        tqdm_bar = tqdm(data_loader, desc="Inference", position=0, leave=True)

        preds = []
        with torch.no_grad():
            for X in tqdm_bar:
                keys = X.keys() if features is None else features
                X = {k: X[k].to(device) for k in keys}
                pred = model(X)
                preds.append(pred.cpu().numpy())

        sub[str(i)] = np.concatenate(preds)
    return sub

In [28]:
def kfold_run(
    Train,
    features,
    Model,
    params,
    nsplits,
    loss_fn,
    num_epochs,
    device,
    batch_size,
    lr=5e-3,
    early_stopping=5):
    """Train one model per fold of a grouped K-fold split and save each of them.

    Rows sharing a ``Toxin_UniprotID`` always land in the same fold. Each
    trained model is written to ``model_fold<k>.pth`` (k starts at 1).

    Args:
        Train: Data object whose ``train`` frame has a ``Toxin_UniprotID``
            column; it is also the source of every fold's ``Dataset``.
        features: Feature names handed to ``Dataset`` and ``train_func``.
        Model: Model class, instantiated as ``Model(params)``.
        params: Hyper-parameter dict given to ``Model``.
        nsplits: Number of folds.
        loss_fn: Loss callable.
        num_epochs: Maximum epochs per fold.
        device: Training device.
        batch_size: Batch size of both loaders.
        lr: AdamW learning rate.
        early_stopping: Patience forwarded to ``train_func``.
    """
    groups = Train.train["Toxin_UniprotID"]

    kf = GroupKFold(n_splits=nsplits)

    for fold, (tr_ix, val_ix) in enumerate(kf.split(groups, groups=groups)):
        # Build the folds from the ``Train`` argument rather than from a
        # notebook-level global.
        train_data = Dataset(Train, features, selected_train=tr_ix, state="Train select")
        val_data = Dataset(Train, features, selected_val=val_ix, state="Val select")
        train_data_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=0)
        val_data_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size, num_workers=0)
        print(f"-----------------FOLD_{fold+1}-----------------")
        model = Model(params).to(device)
        writer = SummaryWriter()
        try:
            writer.add_graph(model, {k: v.to(device) for k, v in next(iter(train_data_loader))[0].items()})
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
            train_func(train_data_loader, val_data_loader, features, model, loss_fn, optimizer, num_epochs, device, writer, early_stopping)
        finally:
            # Flush and release the event file even when training fails.
            writer.close()
        torch.save(model, f"model_fold{fold+1}.pth")

In [17]:
class SimpleSeqModel(nn.Module):
    """Embedding + two stacked LSTMs over the k-mer, joined with categorical features.

    Args:
        params: Dict with ``"K_mer_emb_size"``, ``"antivenom_emb_size"`` and
            ``"protfam_emb_size"``.

    ``forward`` expects a dict with ``"kmer"``, ``"antivenom"`` and
    ``"protfam"`` index tensors and returns one value per sample.
    """

    def __init__(self, params):
        super().__init__()
        self.K_mer_emb_size = params["K_mer_emb_size"]
        # Vocabulary sizes are hard-coded; presumably "number of distinct
        # values + 1" because the mappers start their ids at 1 - TODO confirm.
        self.K_mer_nunique = 21
        self.antivenom_emb_size = params["antivenom_emb_size"]
        self.antivenom_unique = 9

        self.Kmer_emb_layer = nn.Embedding(
            num_embeddings=self.K_mer_nunique,
            embedding_dim=self.K_mer_emb_size,
        )
        self.Antivenom_emb = nn.Embedding(
            num_embeddings=self.antivenom_unique,
            embedding_dim=self.antivenom_emb_size,
        )

        self.protfam_emb = nn.Embedding(
            num_embeddings=18,
            embedding_dim=params["protfam_emb_size"],
        )
        self.Features = nn.Linear(
            in_features=self.antivenom_emb_size + params["protfam_emb_size"],
            out_features=128,
        )

        self.Lstm_layer_1 = nn.LSTM(
            input_size=self.K_mer_emb_size,
            hidden_size=256,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.Lstm_layer_2 = nn.LSTM(
            input_size=512,  # 2 directions x 256 hidden units of the first LSTM
            hidden_size=256,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
        )

        # Linear_1 .. Output are not used by forward(); they are kept so that
        # seeded weight initialisation and already saved models stay unchanged.
        self.Linear_1 = nn.Linear(
            in_features=self.Lstm_layer_2.hidden_size + self.Features.out_features,
            out_features=256,
        )
        self.relu_1 = nn.ReLU()
        self.Linear_2 = nn.Linear(
            in_features=self.Linear_1.out_features, out_features=128,
        )
        self.relu_2 = nn.ReLU()
        self.Output = nn.Linear(
            in_features=self.Linear_2.out_features, out_features=1,
        )
        self.out = nn.Sequential(
            nn.Linear(in_features=self.Lstm_layer_2.hidden_size + self.Features.out_features, out_features=256),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=256, out_features=128),
            nn.Dropout(0.1),
            nn.ReLU(inplace=True),
            nn.Linear(128, 1)
        )

    def forward(self, inputs):
        """Return the regression head's output for a batch of input dicts."""
        kmer_emb = self.Kmer_emb_layer(inputs["kmer"])
        antivenom_emb = self.Antivenom_emb(inputs["antivenom"])
        protfam_emb = self.protfam_emb(inputs["protfam"])

        emb_features = torch.cat((antivenom_emb, protfam_emb), axis=1)
        features = self.Features(emb_features)

        lstm_1_seq, _ = self.Lstm_layer_1(kmer_emb)
        _, (lstm_2_h, _) = self.Lstm_layer_2(lstm_1_seq)

        # Take the (single) layer's final hidden state by indexing: a bare
        # squeeze() would also drop the batch axis for a batch of one sample.
        lstm_h = lstm_2_h[-1]
        emb = torch.cat((lstm_h, features), axis=1)
        output = self.out(emb)
        return output

In [None]:
"""class SimpleSeqModel(nn.Module):
    def __init__(
        self,
        K_mer_emb_size,
        K_mer_nunique,
        antivenom_emb_size,
        antivenom_unique,
        max_Position_start,
        Position_start_emb_size,
    ): 
        super().__init__()
        self.K_mer_emb_size = K_mer_emb_size       
        self.K_mer_nunique = K_mer_nunique +1               
        self.antivenom_emb_size = antivenom_emb_size
        self.antivenom_unique = antivenom_unique +1   
        
        self.Kmer_emb_layer = nn.Embedding(
            num_embeddings=self.K_mer_nunique,
            embedding_dim=self.K_mer_emb_size,
        )
        self.Antivenom_emb = nn.Embedding(
            num_embeddings=self.antivenom_unique,
            embedding_dim=self.antivenom_emb_size,
        )
    
        self.Position_start_emb = nn.Embedding(
            num_embeddings=max_Position_start,
            embedding_dim=Position_start_emb_size,
        )
        self.Features = nn.Linear(
            in_features=self.antivenom_emb_size + Position_start_emb_size,
            out_features=128,
        )
        
        self.Lstm_layer_1 = nn.LSTM(
            input_size=self.K_mer_emb_size,
            hidden_size=256,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.Lstm_layer_2 = nn.LSTM(
            input_size=512,
            hidden_size=256,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
        )
        
        self.Linear_1 = nn.Linear(
            in_features=self.Lstm_layer_2.hidden_size + self.Features.out_features,
            out_features=512,
        )
        self.relu_1 = nn.ReLU()
        self.Linear_2 = nn.Linear(
            in_features=self.Linear_1.out_features, out_features=256,
        )
        self.relu_2 = nn.ReLU()
        self.Output = nn.Linear(
            in_features=self.Linear_2.out_features, out_features=1,
        )
        
    def forward(self, inputs):
        kmer_emb = self.Kmer_emb_layer(inputs["kmer"])
        antivenom_emb = self.Antivenom_emb(inputs["antivenom"])
        position_start_emb = self.Position_start_emb(inputs["pos_start"])

        emb_features = torch.cat((antivenom_emb, position_start_emb), axis=1)
        features = self.Features(emb_features)
        
        lstm_1_seq, (lstm_1_h, lstm1_c) = self.Lstm_layer_1(kmer_emb)
        lstm_2_seq, (lstm_2_h, lstm2_c) = self.Lstm_layer_2(lstm_1_seq)

        lstm_h = torch.squeeze(lstm_2_h)
        emb = torch.cat((lstm_h, features), axis=1)
        linear_1 = self.relu_1(self.Linear_1(emb))
        linear_2 = self.relu_2(self.Linear_2(linear_1))
        output = self.Output(linear_2)
        return output
        
        """

In [27]:
# Hyper-parameters and input features of the first (embedding + LSTM) model.
params = dict(
    K_mer_emb_size=512,
    protfam_emb_size=64,
    antivenom_emb_size=64,
)
features = [
    "kmer",
    "antivenom",
    "protfam",
]

In [29]:
# Five-fold training of the embedding + stacked-LSTM baseline.
kfold_run(
    Train=data,
    features=features,
    Model=SimpleSeqModel,
    params=params,
    nsplits=5,
    num_epochs=10,
    batch_size=512,
    early_stopping=5,
    loss_fn=nn.MSELoss(),
    device=torch.device("cuda"),
)

In [30]:
# Checkpoints written by kfold_run, one per fold (1..5).
paths = [f"model_fold{fold}.pth" for fold in range(1, 6)]

In [32]:
# Predict the test set with every fold model (order preserved: no shuffling).
test_data = Dataset(data=data, features=features, state="Test")
test_data_loader = DataLoader(
    test_data,
    batch_size=512,
    shuffle=False,
    num_workers=0,
)
simplesub = predict_test(
    data_loader=test_data_loader,
    paths=paths,
    device=torch.device("cuda"),
)

In [35]:
# Blend the five fold predictions with equal weights (0.2 each), plot their
# distribution and write the submission file.
submission = pd.DataFrame(columns=["ID", "Signal"])
submission["ID"] = data.test["ID"]
submission["Signal"] = (
    simplesub["0"] + simplesub["1"] + simplesub["2"] + simplesub["3"] + simplesub["4"]
) * 0.2
plt.hist(submission["Signal"])
submission.to_csv("firstmodelubmission.csv", index=False)

In [42]:
class SimpleSeqBertModel(nn.Module):
    """Stacked-LSTM regressor fed with pre-computed per-character BERT vectors.

    Same layout as ``SimpleSeqModel`` but without a trainable k-mer embedding:
    ``forward`` reads ``inputs["kmer_bert"]`` directly, whose last dimension
    must be 1024 (the first LSTM's ``input_size``).

    Args:
        params: Dict with ``"antivenom_emb_size"`` and ``"protfam_emb_size"``.
    """

    def __init__(self, params):
        super().__init__()
        # Vocabulary sizes are hard-coded; presumably "number of distinct
        # values + 1" because the mappers start their ids at 1 - TODO confirm.
        self.K_mer_nunique = 21
        self.antivenom_emb_size = params["antivenom_emb_size"]
        self.antivenom_unique = 9

        self.Antivenom_emb = nn.Embedding(
            num_embeddings=self.antivenom_unique,
            embedding_dim=self.antivenom_emb_size,
        )

        self.protfam_emb = nn.Embedding(
            num_embeddings=18,
            embedding_dim=params["protfam_emb_size"],
        )
        self.Features = nn.Linear(
            in_features=self.antivenom_emb_size + params["protfam_emb_size"],
            out_features=128,
        )

        self.Lstm_layer_1 = nn.LSTM(
            input_size=1024,
            hidden_size=256,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.Lstm_layer_2 = nn.LSTM(
            input_size=512,  # 2 directions x 256 hidden units of the first LSTM
            hidden_size=256,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
        )

        # Linear_1 .. Output are not used by forward(); they are kept so that
        # seeded weight initialisation and already saved models stay unchanged.
        self.Linear_1 = nn.Linear(
            in_features=self.Lstm_layer_2.hidden_size + self.Features.out_features,
            out_features=256,
        )
        self.relu_1 = nn.ReLU()
        self.Linear_2 = nn.Linear(
            in_features=self.Linear_1.out_features, out_features=128,
        )
        self.relu_2 = nn.ReLU()
        self.Output = nn.Linear(
            in_features=self.Linear_2.out_features, out_features=1,
        )
        self.out = nn.Sequential(
            nn.Linear(in_features=self.Lstm_layer_2.hidden_size + self.Features.out_features, out_features=256),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=256, out_features=128),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(128, 1)
        )

    def forward(self, inputs):
        """Return the regression head's output for a batch of input dicts."""
        kmer_emb = inputs["kmer_bert"]
        antivenom_emb = self.Antivenom_emb(inputs["antivenom"])
        protfam_emb = self.protfam_emb(inputs["protfam"])

        emb_features = torch.cat((antivenom_emb, protfam_emb), axis=1)
        features = self.Features(emb_features)

        lstm_1_seq, _ = self.Lstm_layer_1(kmer_emb)
        _, (lstm_2_h, _) = self.Lstm_layer_2(lstm_1_seq)

        # Take the (single) layer's final hidden state by indexing: a bare
        # squeeze() would also drop the batch axis for a batch of one sample.
        lstm_h = lstm_2_h[-1]
        emb = torch.cat((lstm_h, features), axis=1)
        output = self.out(emb)
        return output

In [43]:
# Settings of the BERT-feature model: no trainable k-mer embedding is needed.
params = dict(
    protfam_emb_size=64,
    antivenom_emb_size=64,
)
features = [
    "kmer_bert",
    "antivenom",
    "protfam",
]

In [44]:
# Five-fold training of the LSTM model fed with pre-computed BERT vectors.
kfold_run(
    Train=data,
    features=features,
    Model=SimpleSeqBertModel,
    params=params,
    nsplits=5,
    num_epochs=10,
    batch_size=512,
    early_stopping=5,
    loss_fn=nn.MSELoss(),
    device=torch.device("cuda"),
)

In [61]:
class GRU_LSTM_CNN(nn.Module):
    """Regressor with a GRU branch and an LSTM branch over the embedded k-mer.

    Each branch runs a bidirectional recurrent layer, a strided 1-D
    convolution, then max- and mean-pooling over the sequence axis. The pooled
    branches are merged with the antivenom / protein-family embeddings and
    reduced to one value per sample.

    Args:
        params: Dict with ``"K_mer_emb_size"``, ``"antivenom_emb_size"`` and
            ``"protfam_emb_size"``.
    """

    def __init__(self, params):
        super().__init__()
        emb_dim = params["K_mer_emb_size"]
        av_dim = params["antivenom_emb_size"]
        fam_dim = params["protfam_emb_size"]

        self.K_mer_emb_size = emb_dim
        self.K_mer_nunique = 21
        self.antivenom_emb_size = av_dim
        self.antivenom_unique = 9

        # Sequence side: embedding, dropout, then the two recurrent branches.
        self.Kmer_emb_layer = nn.Embedding(self.K_mer_nunique, emb_dim)
        self.dropout1 = nn.Dropout2d(p=0.2)
        self.gru = nn.GRU(emb_dim, 64, bidirectional=True, batch_first=True)
        self.cv1 = nn.Conv1d(128, 32, 3, stride=2)

        self.lstm = nn.LSTM(emb_dim, 64, bidirectional=True, batch_first=True)
        self.cv2 = nn.Conv1d(128, 32, 3, stride=2)

        # 2 branches x (max + mean) x 32 conv channels = 4 * 32 inputs.
        self.fc1 = nn.Sequential(
            nn.Linear(4 * 32, 512),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(512, 128),
        )

        # Categorical side.
        self.Antivenom_emb = nn.Embedding(self.antivenom_unique, av_dim)
        self.protfam_emb = nn.Embedding(18, fam_dim)
        self.Features = nn.Linear(av_dim + fam_dim, 128)

        # Head over the concatenated 128 + 128 features.
        self.fc2 = nn.Sequential(
            nn.Linear(256, 512),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(512, 256),
            nn.Dropout(0.1),
            nn.ReLU(inplace=True),
            nn.Linear(256, 64),
            nn.Dropout(0.05),
            nn.ReLU(inplace=True),
            nn.Linear(64, 1),
        )

    @staticmethod
    def _pooled_branch(recurrent, conv, embedded):
        """Run one recurrent + conv branch and pool it over the sequence axis."""
        states, _ = recurrent(embedded)
        # Conv1d wants channels on axis 1; move them back afterwards.
        convolved = conv(states.permute(0, 2, 1)).permute(0, 2, 1)
        strongest = convolved.max(dim=1).values
        average = convolved.mean(dim=1)
        return torch.cat([strongest, average], dim=1)

    def forward(self, inputs):
        """Return one prediction per sample for a batch of input dicts."""
        embedded = self.dropout1(self.Kmer_emb_layer(inputs["kmer"]))

        gru_part = self._pooled_branch(self.gru, self.cv1, embedded)
        lstm_part = self._pooled_branch(self.lstm, self.cv2, embedded)
        sequence_repr = self.fc1(torch.cat([gru_part, lstm_part], dim=1))

        categorical = torch.cat(
            (
                self.Antivenom_emb(inputs["antivenom"]),
                self.protfam_emb(inputs["protfam"]),
            ),
            dim=1,
        )
        categorical_repr = self.Features(categorical)

        merged = torch.cat([categorical_repr, sequence_repr], dim=1)
        return self.fc2(merged)


In [63]:
# Five-fold training of the first GRU/LSTM + CNN model.
params = dict(
    K_mer_emb_size=512,
    protfam_emb_size=64,
    antivenom_emb_size=64,
)
features = [
    "kmer",
    "antivenom",
    "protfam",
]
kfold_run(
    Train=data,
    features=features,
    Model=GRU_LSTM_CNN,
    params=params,
    nsplits=5,
    num_epochs=10,
    batch_size=512,
    early_stopping=5,
    loss_fn=nn.MSELoss(),
    device=torch.device("cuda"),
)

In [48]:
class TimeDistributed(nn.Module):
    """Apply ``module`` to every step of a sequence by folding the leading axes.

    Inputs with at most two dimensions go straight through ``module``.
    Otherwise all leading axes are flattened into one, ``module`` is applied
    on the last axis, and the result is reshaped back to three dimensions.

    Args:
        module: Module applied on the last axis.
        batch_first: Selects how the output is reshaped: the input's first
            axis is restored when true, its second axis when false.
    """

    def __init__(self, module, batch_first=False):
        super().__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):
        """Return ``module`` applied step-wise to ``x``."""
        if x.dim() <= 2:
            return self.module(x)

        flattened = x.contiguous().view(-1, x.size(-1))
        result = self.module(flattened)
        out_features = result.size(-1)

        if self.batch_first:
            return result.contiguous().view(x.size(0), -1, out_features)
        return result.view(-1, x.size(1), out_features)
    
class GRU_LSTM_CNN(nn.Module):
    """Deeper GRU/LSTM + CNN regressor with batch-norm and skip connections.

    Compared with the first version, the recurrent layers have three levels,
    the embedded k-mer is batch-normalised, and each branch concatenates the
    recurrent output with the embedded k-mer before its convolution.

    Args:
        params: Dict with ``"K_mer_emb_size"``, ``"antivenom_emb_size"`` and
            ``"protfam_emb_size"``.
    """

    def __init__(self, params):
        super(GRU_LSTM_CNN, self).__init__()
        self.K_mer_emb_size = params["K_mer_emb_size"]
        self.K_mer_nunique = 21
        self.antivenom_emb_size = params["antivenom_emb_size"]
        self.antivenom_unique = 9

        rnn_hidden = 64
        # Width after concatenating the bidirectional recurrent output with
        # the embedded k-mer. Derived from the embedding size (instead of the
        # former literal 128+512) so that any "K_mer_emb_size" works.
        skip_width = 2 * rnn_hidden + self.K_mer_emb_size

        self.Kmer_emb_layer = nn.Embedding(
            num_embeddings=self.K_mer_nunique,
            embedding_dim=self.K_mer_emb_size,
        )
        self.init_batchnorm = TimeDistributed(
            nn.BatchNorm1d(self.K_mer_emb_size, momentum=0.01), batch_first=True
        )
        self.dropout1 = nn.Dropout2d(p=0.2)
        self.gru = nn.GRU(self.K_mer_emb_size, rnn_hidden, num_layers=3, bidirectional=True, batch_first=True)
        self.batchnorm1 = TimeDistributed(
            nn.BatchNorm1d(skip_width, momentum=0.01), batch_first=True
        )
        self.cv1 = nn.Conv1d(skip_width, 32, 4, stride=1)

        self.lstm = nn.LSTM(self.K_mer_emb_size, rnn_hidden, num_layers=3, bidirectional=True, batch_first=True)
        self.batchnorm2 = TimeDistributed(
            nn.BatchNorm1d(skip_width, momentum=0.01), batch_first=True
        )
        self.cv2 = nn.Conv1d(skip_width, 32, 4, stride=1)

        # 2 branches x (max + mean) x 32 conv channels = 4 * 32 inputs.
        self.fc1 = nn.Sequential(
            nn.Linear(4 * 32, 512),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(512, 128)
        )

        self.Antivenom_emb = nn.Embedding(
            num_embeddings=self.antivenom_unique,
            embedding_dim=self.antivenom_emb_size,
        )

        self.protfam_emb = nn.Embedding(
            num_embeddings=18,
            embedding_dim=params["protfam_emb_size"],
        )
        self.Features = nn.Sequential(
            nn.Linear(in_features=self.antivenom_emb_size + params["protfam_emb_size"], out_features=256),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128)
        )

        self.fc2 = nn.Sequential(
            nn.Linear(256, 512),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(512, 256),
            nn.Dropout(0.15),
            nn.ReLU(inplace=True),
            nn.Linear(256, 64),
            nn.Dropout(0.1),
            nn.ReLU(inplace=True),
            nn.Linear(64, 1))

    def forward(self, inputs):
        """Return one prediction per sample for a batch of input dicts."""
        k_mer = self.init_batchnorm(self.Kmer_emb_layer(inputs["kmer"]))
        k_mer = self.dropout1(k_mer)

        # GRU branch: recurrent output + skip connection -> conv -> pooling.
        x1, _ = self.gru(k_mer)
        x1 = self.batchnorm1(torch.cat([x1, k_mer], dim=2))
        x1 = x1.permute(0, 2, 1)  # Conv1d expects channels on axis 1
        x1 = self.cv1(x1)
        x1 = x1.permute(0, 2, 1)
        maxpoolx1, _ = torch.max(x1, axis=1)
        avgpoolx1 = torch.mean(x1, axis=1)
        x1 = torch.cat([maxpoolx1, avgpoolx1], dim=1)

        # LSTM branch, same layout.
        x2, _ = self.lstm(k_mer)
        x2 = self.batchnorm2(torch.cat([x2, k_mer], dim=2))
        x2 = x2.permute(0, 2, 1)
        x2 = self.cv2(x2)
        x2 = x2.permute(0, 2, 1)
        maxpoolx2, _ = torch.max(x2, axis=1)
        avgpoolx2 = torch.mean(x2, axis=1)
        x2 = torch.cat([maxpoolx2, avgpoolx2], dim=1)

        x = torch.cat([x1, x2], dim=1)
        x = self.fc1(x)

        antivenom_emb = self.Antivenom_emb(inputs["antivenom"])
        protfam_emb = self.protfam_emb(inputs["protfam"])

        emb_features = torch.cat((antivenom_emb, protfam_emb), axis=1)
        features = self.Features(emb_features)

        features = torch.cat([features, x], dim=1)

        y = self.fc2(features)
        return y


In [34]:
# Five-fold training of the second GRU/LSTM + CNN model with a lower learning
# rate, fewer epochs and a shorter patience.
params = dict(
    K_mer_emb_size=512,
    protfam_emb_size=64,
    antivenom_emb_size=64,
)
features = [
    "kmer",
    "antivenom",
    "protfam",
]
kfold_run(
    Train=data,
    features=features,
    Model=GRU_LSTM_CNN,
    params=params,
    nsplits=5,
    num_epochs=8,
    batch_size=512,
    lr=1e-3,
    early_stopping=2,
    loss_fn=nn.MSELoss(),
    device=torch.device("cuda"),
)

In [36]:
# Predict the test set with the five fold checkpoints (order preserved).
paths = [f"model_fold{fold}.pth" for fold in range(1, 6)]
test_data = Dataset(data=data, features=features, state="Test")
test_data_loader = DataLoader(
    test_data,
    batch_size=512,
    shuffle=False,
    num_workers=0,
)
simplesub = predict_test(
    data_loader=test_data_loader,
    paths=paths,
    device=torch.device("cuda"),
)

In [37]:
# Blend the five fold predictions with equal weights (0.2 each), plot their
# distribution and write the submission file.
submission = pd.DataFrame(columns=["ID", "Signal"])
submission["ID"] = data.test["ID"]
submission["Signal"] = (
    simplesub["0"] + simplesub["1"] + simplesub["2"] + simplesub["3"] + simplesub["4"]
) * 0.2
plt.hist(submission["Signal"])
submission.to_csv("firstmodelubmission.csv", index=False)

In [49]:
# Second run of the same five-fold GRU/LSTM + CNN training configuration.
params = dict(
    K_mer_emb_size=512,
    protfam_emb_size=64,
    antivenom_emb_size=64,
)
features = [
    "kmer",
    "antivenom",
    "protfam",
]
kfold_run(
    Train=data,
    features=features,
    Model=GRU_LSTM_CNN,
    params=params,
    nsplits=5,
    num_epochs=8,
    batch_size=512,
    lr=1e-3,
    early_stopping=2,
    loss_fn=nn.MSELoss(),
    device=torch.device("cuda"),
)

In [50]:
# Predict the test set with the five fold checkpoints (order preserved).
paths = [f"model_fold{fold}.pth" for fold in range(1, 6)]
test_data = Dataset(data=data, features=features, state="Test")
test_data_loader = DataLoader(
    test_data,
    batch_size=512,
    shuffle=False,
    num_workers=0,
)
simplesub = predict_test(
    data_loader=test_data_loader,
    paths=paths,
    device=torch.device("cuda"),
)

In [51]:
# Blend the five fold predictions with equal weights (0.2 each), plot their
# distribution and write the submission file.
submission = pd.DataFrame(columns=["ID", "Signal"])
submission["ID"] = data.test["ID"]
submission["Signal"] = (
    simplesub["0"] + simplesub["1"] + simplesub["2"] + simplesub["3"] + simplesub["4"]
) * 0.2
plt.hist(submission["Signal"])
submission.to_csv("firstmodelubmission.csv", index=False)