In [None]:
%load_ext tensorboard

In [None]:
import random

import numpy as np 
import pandas as pd
import torch
from torch import nn 
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

In [None]:
# Load the training rows from the local folds file; the `fold` column it carries
# is what the train/validation split cell filters on.
train_df = pd.read_csv("./folds/train_folds.csv")

In [None]:
# Load the test set directly from its public URL (needs network access).
test_df = pd.read_csv("https://storage.googleapis.com/umojahack2022/test.csv")

In [None]:
def get_seq_column_map(train, test, col):
    """Build an element -> integer index vocabulary for a sequence column.

    Each value of ``train[col]`` and ``test[col]`` is iterated element by
    element (character by character for strings). The distinct elements are
    ordered by ``np.unique`` and numbered consecutively starting at 0.

    Args:
        train: Table-like object indexable by ``col`` (e.g. a DataFrame).
        test: Second table-like object with the same column.
        col: Name of the column holding the sequences.

    Returns:
        dict mapping each distinct sequence element to its index.
    """
    tokens = [
        token
        for frame in (train, test)
        for sequence in frame[col]
        for token in sequence
    ]
    return {token: index for index, token in enumerate(np.unique(tokens))}

def get_column_map(train, test, col):
    """Build a value -> integer index map for a categorical column.

    The values of ``train[col]`` followed by ``test[col]`` are de-duplicated
    in order of first appearance and numbered consecutively starting at 0.

    Args:
        train: DataFrame containing ``col``.
        test: DataFrame containing ``col``.
        col: Name of the categorical column.

    Returns:
        dict mapping each distinct value to its index.
    """
    unique_values = pd.concat([train[col], test[col]]).unique().tolist()
    return {value: index for index, value in enumerate(unique_values)}

In [None]:
# Vocabulary over the individual elements of the "Toxin_Kmer" sequences
# (built from train and test together so both share the same indices).
amino_acid_map = get_seq_column_map(train_df, test_df, "Toxin_Kmer")
print("unique amino acid map",len(amino_acid_map))

# Index map over the distinct "Antivenom" values seen in train or test.
antivenom_map = get_column_map(train_df, test_df, "Antivenom")
print("unique Antivenom map", len(antivenom_map))

In [None]:
# Display the sequence-element -> index vocabulary.
amino_acid_map

In [None]:
# Display the antivenom -> index map.
antivenom_map

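We split the data into a training and a validation set: the fold selected by `USE_FOLD` is held out for validation and all remaining folds are used for training.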

In [None]:
# Fold held out for validation; rows from every other fold are used for training.
USE_FOLD = 0

# reset_index(drop=True) renumbers each split from 0 and discards the old index.
train_split_df = train_df[train_df['fold'] != USE_FOLD].reset_index(drop=True)
val_split_df = train_df[train_df['fold'] == USE_FOLD].reset_index(drop=True)

We select the compute device: the GPU (for example the one provided by Colab) when CUDA is available, otherwise the CPU.

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device : {device}")

We convert our data into a torch `Dataset`.
All datasets that represent a map from keys to data samples should subclass
`Dataset`. All subclasses should overwrite `__getitem__`, supporting fetching a data sample for a given key:

In [None]:
class AntivenomChallengeDataSet(Dataset):
    """Map-style dataset turning one DataFrame row into model-ready tensors.

    Each sample is a dict with the keys ``"K_mer"``, ``"antivenom"``,
    ``"position_start"`` and ``"position_end"``. When ``is_train`` is true the
    sample is returned together with its label as ``(inputs, label)``.
    """

    def __init__(
        self,
        amino_acid_map,
        antivenom_map,
        data,
        is_train,
        label_name=None,
    ):
        """Store the lookup tables and the source frame.

        Args:
            amino_acid_map: dict mapping each element of a "Toxin_Kmer"
                sequence to an integer index.
            antivenom_map: dict mapping each "Antivenom" value to an integer index.
            data: DataFrame with the columns "Toxin_Kmer", "Antivenom",
                "Kmer_Position_start" and "Kmer_Position_end" (plus the label
                column when ``is_train`` is true).
            is_train: Whether ``__getitem__`` should also return the label.
            label_name: Name of the label column; required when ``is_train``
                is true.

        Raises:
            ValueError: If ``is_train`` is true but ``label_name`` is None.
        """
        # Fail fast: without this check the mistake only surfaces later as an
        # opaque KeyError on row[None] inside __getitem__.
        if is_train and label_name is None:
            raise ValueError("label_name must be provided when is_train is True")

        self.amino_acid_map = amino_acid_map
        self.antivenom_map = antivenom_map
        self.data = data
        self.is_train = is_train
        self.label_name = label_name

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Returns:
            ``inputs`` (dict of tensors) when ``is_train`` is false, otherwise
            ``(inputs, label)`` where ``label`` is a one-element tensor.
        """
        row = self.data.iloc[idx]

        # Encode the sequence element by element with the shared vocabulary.
        kmer_seq = torch.as_tensor(
            [self.amino_acid_map[element] for element in row["Toxin_Kmer"]]
        )
        antivenom = torch.as_tensor(self.antivenom_map[row["Antivenom"]])
        position_start = torch.as_tensor(row["Kmer_Position_start"])
        position_end = torch.as_tensor(row["Kmer_Position_end"])

        inputs = {
            "K_mer": kmer_seq,
            "antivenom": antivenom,
            "position_start": position_start,
            "position_end": position_end,
        }

        if self.is_train:
            # The label is wrapped in a list so it is a one-element tensor.
            return inputs, torch.as_tensor([row[self.label_name]])
        return inputs

In [None]:
# Training and validation datasets return (inputs, label) pairs, with "Signal" as the label.
train_dataset = AntivenomChallengeDataSet(
    amino_acid_map=amino_acid_map,
    antivenom_map=antivenom_map,
    data=train_split_df,
    is_train=True,
    label_name="Signal",
)

val_dataset = AntivenomChallengeDataSet(
    amino_acid_map=amino_acid_map,
    antivenom_map=antivenom_map,
    data=val_split_df,
    is_train=True,
    label_name="Signal",
)

# The test dataset returns the inputs only (is_train=False, no label column).
test_dataset = AntivenomChallengeDataSet(
    amino_acid_map=amino_acid_map,
    antivenom_map=antivenom_map,
    data=test_df,
    is_train=False,
)

In [None]:
# Sanity check: show the first training sample as an (inputs, label) pair.
train_dataset[0]

In [None]:
# Seed the Python, NumPy and PyTorch RNGs before any loader or model is created,
# so shuffling, weight initialisation and dropout repeat across
# "Restart Kernel -> Run All" (GPU kernels may still add some non-determinism).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# DataLoader settings shared by the loaders created below.
batch_size = 64
num_workers = 0  # load batches in the main process
shuffle = True  # applied to the training loader only
drop_last = False  # keep the final, possibly smaller, training batch

Now we create our PyTorch data loaders. These combine a dataset and a sampler, and provide an iterable over the given dataset.

In [None]:
# Training loader: shuffling and drop_last follow the settings defined above.
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers,
    drop_last=drop_last,
    sampler=None,
    pin_memory =False,

)

# Validation loader: fixed order, every sample evaluated.
val_data_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False,  # we do not want to drop the last batch during evaluation
    pin_memory =False,

)

# Test loader: shuffle=False keeps predictions in the same row order as test_df.
test_data_loader= DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False,
    pin_memory =False,
)

In [None]:
# Pull one batch to sanity-check tensor shapes. The builtin next() is used
# because DataLoader iterators in current PyTorch releases have no .next() method.
x, y = next(iter(train_data_loader))

print(f"K_mer shape: {x['K_mer'].shape}")
print(f"antivenom shape: {x['antivenom'].shape}")
print(f"target shape: {y.shape}")

## Define the model
For this example we will build an LSTM architecture. It is your task to come up with more performant architectures to improve the scores.

In [None]:
class SeqModel(nn.Module):
    """LSTM regressor over a K-mer sequence plus antivenom / start-position embeddings.

    The K-mer indices are embedded and encoded by a bidirectional LSTM followed
    by a unidirectional LSTM; the final hidden state of the second LSTM is
    concatenated with a linear projection of the antivenom and start-position
    embeddings and passed through a small MLP that outputs one value per sample.
    """

    def __init__(
        self,
        K_mer_emb_size,
        K_mer_nunique,
        antivenom_emb_size,
        antivenom_unique,
        max_Position_start,
        Position_start_emb_size,
    ):
        """Create the layers.

        Args:
            K_mer_emb_size: Embedding dimension for K-mer sequence elements.
            K_mer_nunique: Number of distinct K-mer sequence elements.
            antivenom_emb_size: Embedding dimension for the antivenom id.
            antivenom_unique: Number of distinct antivenom ids.
            max_Position_start: Number of rows in the start-position embedding
                table (must exceed the largest start position fed to the model).
            Position_start_emb_size: Embedding dimension for the start position.
        """
        super().__init__()
        self.K_mer_emb_size = K_mer_emb_size
        self.K_mer_nunique = K_mer_nunique
        self.antivenom_emb_size = antivenom_emb_size
        self.antivenom_unique = antivenom_unique

        self.Kmer_emb_layer = nn.Embedding(
            num_embeddings=self.K_mer_nunique,
            embedding_dim=self.K_mer_emb_size,
        )
        self.Antivenom_emb = nn.Embedding(
            num_embeddings=self.antivenom_unique,
            embedding_dim=self.antivenom_emb_size,
        )

        self.Position_start_emb = nn.Embedding(
            num_embeddings=max_Position_start,
            embedding_dim=Position_start_emb_size,
        )
        # Joint projection of the two non-sequence embeddings.
        self.Features = nn.Linear(
            in_features=self.antivenom_emb_size + Position_start_emb_size,
            out_features=128,
        )

        self.Lstm_layer_1 = nn.LSTM(
            input_size=self.K_mer_emb_size,
            hidden_size=256,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        # The first LSTM is bidirectional, so its per-step output is
        # 2 * hidden_size = 512 wide.
        self.Lstm_layer_2 = nn.LSTM(
            input_size=2 * self.Lstm_layer_1.hidden_size,
            hidden_size=256,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
        )

        self.Linear_1 = nn.Linear(
            in_features=self.Lstm_layer_2.hidden_size + self.Features.out_features,
            out_features=512,
        )
        self.relu_1 = nn.ReLU()
        self.Linear_2 = nn.Linear(
            in_features=self.Linear_1.out_features, out_features=256,
        )
        self.relu_2 = nn.ReLU()
        self.Output = nn.Linear(
            in_features=self.Linear_2.out_features, out_features=1,
        )
        self.dropout = nn.Dropout(0.3)

    def forward(self, inputs):
        """Compute one prediction per sample.

        Args:
            inputs: dict with integer tensors ``"K_mer"`` (batch, seq_len),
                ``"antivenom"`` (batch,) and ``"position_start"`` (batch,).
                Any other key (e.g. ``"position_end"``) is ignored.

        Returns:
            Tensor of shape (batch, 1).
        """
        kmer_emb = self.Kmer_emb_layer(inputs["K_mer"])
        antivenom_emb = self.Antivenom_emb(inputs["antivenom"])
        position_start_emb = self.Position_start_emb(inputs["position_start"])

        emb_features = torch.cat((antivenom_emb, position_start_emb), dim=1)
        features = self.Features(emb_features)

        lstm_1_seq, _ = self.Lstm_layer_1(kmer_emb)
        _, (lstm_2_h, _) = self.Lstm_layer_2(lstm_1_seq)

        # lstm_2_h is (1, batch, hidden). Squeeze ONLY the leading layer axis:
        # a bare torch.squeeze would also drop the batch axis when batch == 1
        # and break the concatenation below.
        lstm_h = lstm_2_h.squeeze(0)
        emb = torch.cat((lstm_h, features), dim=1)
        emb = self.dropout(emb)
        linear_1 = self.relu_1(self.Linear_1(emb))
        linear_2 = self.relu_2(self.Linear_2(linear_1))
        output = self.Output(linear_2)
        return output

Now that the model architecture is defined, we are going to instantiate our model. To size the embedding layer that encodes the start position we need `max_Position_start`: the largest `Kmer_Position_start` found in the train and test datasets, plus one (because embedding indices start at 0):


In [None]:
# Largest start position across train and test, +1 so that the maximum value
# itself is a valid index into the start-position embedding table.
max_Position_start = pd.concat([train_df[["Kmer_Position_start"]], test_df[["Kmer_Position_start"]]]).Kmer_Position_start.max()+1

print(f"Max Position_start : {max_Position_start}")

In [None]:
# Training schedule: maximum number of epochs and the early-stopping
# threshold passed to train_func.
num_epochs = 50
early_stopping = 10

# Embedding table sizes come from the vocabularies and the max start position.
model = SeqModel(
    K_mer_emb_size=512,
    K_mer_nunique=len(amino_acid_map),
    antivenom_emb_size=64,
    antivenom_unique=len(antivenom_map),
    max_Position_start=max_Position_start,
    Position_start_emb_size=64,
)

# Huber loss, averaged over the elements of each batch.
loss_fn = nn.HuberLoss(reduction='mean')

model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


# TensorBoard writer; the model graph is recorded using the inputs of one
# training batch moved to the model's device.
writer = SummaryWriter()
writer.add_graph(model, {k: v.to(device) for k, v in next(iter(train_data_loader))[0].items()})

### Training the model
We define a simple training loop


In [None]:
def train_func(
    train_data_loader,
    val_data_loader,
    model,
    loss_fn,
    optimizer,
    num_epochs,
    device,
    early_stopping=5,
):
    """Train ``model`` with per-epoch validation and early stopping.

    Training loss is logged to TensorBoard per batch ("loss/train") and the
    mean validation loss once per epoch ("loss/validation"), both through the
    notebook-level ``writer``.

    Args:
        train_data_loader: Loader yielding ``(inputs_dict, target)`` batches.
        val_data_loader: Loader yielding ``(inputs_dict, target)`` batches.
        model: Module called as ``model(inputs_dict)``.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        device: Device the batches are moved to.
        early_stopping: Training stops once the validation loss has failed to
            improve for more than this many consecutive epochs.
    """
    total_batches = len(train_data_loader)
    total_batches_val = len(val_data_loader)

    # Early-stopping state must outlive a single epoch, so it is initialised
    # once, outside the epoch loop.
    best_val_loss = np.inf
    epochs_without_improvement = 0

    n_iter = 0
    for epoch in range(num_epochs):
        # Reset per epoch so the reported "epoch loss" covers this epoch only.
        train_loss = []
        tqdm_bar = tqdm(train_data_loader, desc=f"epoch {epoch}", position=0)
        model.train()
        for batch_number, (X, y) in enumerate(tqdm_bar):
            y = y.to(device, dtype=torch.float32)
            X = {k: X[k].to(device) for k in X}

            optimizer.zero_grad()
            pred = model(X)
            loss = loss_fn(pred, y)
            loss.backward()

            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            loss = loss.item()
            train_loss.append(loss)

            writer.add_scalar("loss/train", loss, n_iter)
            n_iter += 1

            if batch_number % 25 == 0:
                tqdm_bar.set_postfix(
                    {
                        "train": f"{batch_number}/{total_batches} loss: {loss:.3} epoch loss: {np.mean(train_loss):.3}",
                    },
                )

        val_tqdm_bar = tqdm(
            val_data_loader, desc=f"epoch {epoch}", position=0, leave=True,
        )
        val_loss = []
        model.eval()
        with torch.no_grad():
            for batch_number, (X, y) in enumerate(val_tqdm_bar):
                y = y.to(device, dtype=torch.float32)
                X = {k: X[k].to(device) for k in X}

                pred = model(X)
                val_loss.append(loss_fn(pred, y).item())

                if batch_number % 25 == 0:
                    val_tqdm_bar.set_postfix(
                        {
                            "valid": f"{batch_number}/{total_batches_val} val loss: {np.mean(val_loss):.3}"
                        },
                    )

        new_val_loss = np.mean(val_loss)
        # Log the actual epoch validation loss, aligned with the training step counter.
        writer.add_scalar("loss/validation", new_val_loss, n_iter)

        if new_val_loss > best_val_loss:
            epochs_without_improvement += 1
        else:
            best_val_loss = new_val_loss
            epochs_without_improvement = 0

        if epochs_without_improvement > early_stopping:
            break

In [None]:
# Run training with the model, loss, optimizer and schedule configured above.
train_func(
    train_data_loader=train_data_loader,
    val_data_loader=val_data_loader,
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=num_epochs,
    device=device,
    early_stopping=early_stopping,
)

In [None]:
# Persist the whole model object (not just a state_dict) as it stands after
# training; predict_test reloads this file with torch.load.
torch.save(model, "model2.pth")

### Sample baseline Submission
Finally we will prepare a baseline submission to Zindi 


In [None]:
def predict_test(data_loader, path):
    """Load a saved model and return its predictions for every batch.

    Args:
        data_loader: Loader yielding dicts of input tensors (no labels).
        path: File written by ``torch.save(model, path)``.

    Returns:
        NumPy array with the per-batch model outputs concatenated along axis 0.
    """
    # NOTE(security): torch.load unpickles arbitrary objects; only load
    # checkpoints you created yourself. On PyTorch >= 2.6 the default
    # weights_only=True rejects a fully pickled model, so pass
    # weights_only=False there for trusted files.
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    model = torch.load(path, map_location=device).to(device)
    # Inference must run in eval mode so dropout is disabled.
    model.eval()

    preds = []
    with torch.no_grad():
        for X in tqdm(data_loader, desc="Inference", position=0, leave=True):
            X = {k: X[k].to(device) for k in X}
            preds.append(model(X).cpu().numpy())

    return np.concatenate(preds)

In [None]:
# Predict on the test loader with the model saved after training.
test_pred = predict_test(test_data_loader,"model2.pth")

In [None]:
# .copy() makes an independent frame, so adding the prediction column does not
# raise pandas' SettingWithCopyWarning on a slice of test_df.
sample_submission = test_df[["ID"]].copy()
# Flatten the (n_rows, 1) predictions into one value per test row.
sample_submission["Signal"] = test_pred.reshape((-1))
sample_submission.to_csv("./submissions/sub2.csv",index=False)

That is it! Now we can upload the submission file (`./submissions/sub2.csv`) to Zindi! As a final step, let's look at it.

In [None]:
# Preview the first rows of the submission.
sample_submission.head()

In [None]:
# Histogram of the predicted "Signal" values.
sample_submission["Signal"].hist()