<a href="https://colab.research.google.com/github/casblaauw/BertOGlyc/blob/main/ProtBert_NetOGlyc_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<b> Work in progress!</b>

In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from google.colab import files, drive

In [None]:
# Mount Google Drive into the Colab filesystem; the paths used later in this
# notebook all live below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Steps to build:
- Receive input as list of strings of some kind (list, series, ???)
- Filter out sequences longer than 4000 residues and display a message
- Add padding
- Get embeddings
- Run predictor model
- Turn scores into predictions

<b> Define and load in model </b>

In [None]:
class Net(nn.Module):
    """Two-layer 1D convolutional network producing per-position class scores.

    The network expects input of shape (batch_size, in_channels, seq_len) and
    returns raw scores of shape (batch_size, out_channels, seq_len). Both
    convolutions use "same" padding, so the sequence length is preserved.

    The default arguments reproduce the original fixed architecture
    (1024 -> 32 -> 2 channels, kernel size 7, dropout 0.25).

    Args:
        in_channels: Number of input features per position.
        hidden_channels: Number of channels produced by the first convolution.
        out_channels: Number of output scores per position.
        kernel_size: Width of both convolution kernels. Must be a positive odd
            number so that a padding of ``kernel_size // 2`` keeps seq_len
            unchanged.
        dropout_p: Dropout probability applied after the first convolution.

    Raises:
        ValueError: If ``kernel_size`` is not a positive odd integer.
    """

    def __init__(self, in_channels=1024, hidden_channels=32, out_channels=2,
                 kernel_size=7, dropout_p=0.25):
        super().__init__()
        if kernel_size < 1 or kernel_size % 2 == 0:
            raise ValueError(
                f"kernel_size must be a positive odd integer, got {kernel_size}"
            )
        # Symmetric padding that keeps the output length equal to the input length.
        padding = kernel_size // 2

        # The attribute names (conv1, dropout, conv2) determine the state_dict
        # keys, so they must not be renamed or saved parameters stop loading.
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=hidden_channels,
                               kernel_size=kernel_size, padding=padding)
        self.dropout = nn.Dropout(p=dropout_p)
        self.conv2 = nn.Conv1d(in_channels=hidden_channels, out_channels=out_channels,
                               kernel_size=kernel_size, padding=padding)

    def forward(self, x):
        """Compute per-position scores.

        Args:
            x: Tensor of shape (batch_size, in_channels, seq_len).

        Returns:
            Tensor of shape (batch_size, out_channels, seq_len) with raw
            (un-normalised) scores.
        """
        # ---- Layer 1: (batch, in_channels, seq_len) -> (batch, hidden_channels, seq_len)
        x = self.conv1(x)

        # ---- Process first layer's output (dropout is a no-op in eval mode)
        x = self.dropout(x)
        x = F.relu(x)

        # ---- Layer 2: (batch, hidden_channels, seq_len) -> (batch, out_channels, seq_len)
        x = self.conv2(x)

        return x

model = Net()

In [None]:
# Use the first GPU when the runtime offers one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Alternative parameter file, kept for quick switching:
# params_path = '/content/drive/MyDrive/NetOGlyc/model_params_beta099999.pth'
params_path = '/content/drive/MyDrive/NetOGlyc/model_params_loss30.pth'

# NOTE(review): torch.load unpickles the file, so only load parameter files
# from a trusted source.
params = torch.load(params_path, map_location=device)
model.load_state_dict(params)

# Switch to inference mode, then move the model to the chosen device. Both
# calls return the module, so the trailing expression displays its repr.
model.eval().to(device)

cpu


Net(
  (conv1): Conv1d(1024, 32, kernel_size=(7,), stride=(1,), padding=(3,))
  (dropout): Dropout(p=0.25, inplace=False)
  (conv2): Conv1d(32, 2, kernel_size=(7,), stride=(1,), padding=(3,))
)

<b> Load embeddings to test </b>

In [None]:
# Temp data generation for testing: load the embeddings of a single gene and
# turn them into a batch of one for the model.
zip_path = '/content/drive/MyDrive/NetOGlyc/embeddings_npy.zip'
gene = 'O00391'
# The archive handle is called `npz`, not `zip`: binding it to `zip` would
# shadow the builtin zip() for the rest of the kernel session.
with np.load(zip_path) as npz:
  gene_embeddings = npz[f"embeddings_{gene}"]

# Transpose and add a leading batch axis of size 1.
# Presumably the stored array is (seq_len, features), giving
# (1, features, seq_len) here -- TODO confirm against the archive.
# Converting the array directly avoids torch.Tensor([ndarray]), which goes
# through the slow list-of-ndarrays conversion path.
data = torch.tensor(gene_embeddings.T, dtype=torch.float32).unsqueeze(0)

In [None]:
# Load in info: one row per gene, indexed by the 'gene' column.
# 'sequence' and 'label' are read as strings explicitly. A label column that
# consists of digits only could otherwise be inferred as numeric, which would
# drop leading zeros and make the per-character split below fail.
info = pd.read_csv(
    '/content/drive/MyDrive/NetOGlyc/embeddings_info.txt',
    sep='\t',
    index_col='gene',
    dtype={'sequence': str, 'label': str},
)
# Split each string into one list element per character; labels become ints.
info['sequence'] = info['sequence'].apply(list)
info['label'] = info['label'].apply(lambda x: [int(flag) for flag in x])

<b> Test model performance </b>

In [None]:
# Individual gene labels: run the model on every gene in `info`, one at a time.
# After the loop, `embeddings`, `scores`, `preds` and `labels` hold the values
# of the last gene processed.
with torch.no_grad():
  # Bind the archive to `npz` rather than `zip` so the builtin zip() stays usable.
  with np.load(zip_path) as npz:
    for gene in info.index:
      # Get embeddings as a float32 batch of one, on the same device as the
      # model (the model was moved to `device`, so CPU inputs would fail on GPU).
      embeddings = torch.tensor(npz[f"embeddings_{gene}"].T, dtype=torch.float32)
      embeddings = embeddings.unsqueeze(0).to(device)

      # Make predictions and get true values.
      scores = model(embeddings)
      # argmax over the class axis; squeeze(0) drops only the batch axis, so a
      # length-1 sequence still yields a 1-D array. .cpu() is required before
      # .numpy() when running on a GPU.
      preds = torch.argmax(scores, 1).squeeze(0).cpu().numpy()
      labels = np.array(info.loc[gene, 'label'])

In [None]:
## Check performance on entire training set
## Biased, as the model is trained on 80% of this, so don't use for finetuning.
## Still reveals large-scale model performance trends.

# Initialise performance tracking objects
results = {}      # gene -> [labels, preds, pos_pred, neg_pred, overall_pred]
tp_counter = 0    # predicted sites that are labelled as sites
pos_counter = 0   # all predicted sites
tn_counter = 0    # predicted non-sites that are labelled as non-sites
neg_counter = 0   # all predicted non-sites
sites_bool = []   # one entry per labelled site: was it predicted as a site?

# Start predicting
with torch.no_grad():
  # Bind the archive to `npz` rather than `zip` so the builtin zip() stays usable.
  with np.load(zip_path) as npz:
    for gene in info.index:
      # Get embeddings as a float32 batch of one, on the same device as the
      # model (the model was moved to `device`, so CPU inputs would fail on GPU).
      embeddings = torch.tensor(npz[f"embeddings_{gene}"].T, dtype=torch.float32)
      embeddings = embeddings.unsqueeze(0).to(device)

      # Make predictions and get true values.
      # squeeze(0) drops only the batch axis; .cpu() is required before
      # .numpy() when running on a GPU.
      scores = model(embeddings)
      preds = torch.argmax(scores, 1).squeeze(0).cpu().numpy()
      labels = np.array(info.loc[gene, 'label'])
      if labels.shape != preds.shape:
        # Fail with a clear message instead of an obscure boolean-index error below.
        raise ValueError(
            f"Gene {gene}: label shape {labels.shape} does not match prediction shape {preds.shape}"
        )

      # Compare preds to ground truth
      overall_pred = labels == preds
      pos_pred = labels[preds == 1] == preds[preds == 1]   # correctness of each predicted site
      neg_pred = labels[preds == 0] == preds[preds == 0]   # correctness of each predicted non-site
      sites_found = (labels[labels == 1] == preds[labels == 1]).tolist()

      # Per-gene summary; the "x/y" fragments are built first so they can be centred.
      pos_summary = f'{pos_pred.sum()}/{len(pos_pred)}'
      neg_summary = f'{neg_pred.sum()}/{len(neg_pred)}'
      found_summary = f'{sum(sites_found)}/{len(sites_found)}'
      overall_summary = f'{overall_pred.sum()}/{len(overall_pred)}'
      print(f"Gene {gene}: {pos_summary:^5} site preds correct, {neg_summary:^9} non-site (+padding) correct. {found_summary:^5} sites recovered. Overall: {overall_summary:^9}.")

      # Save results
      results[gene] = [labels, preds, pos_pred, neg_pred, overall_pred]
      tp_counter += int(pos_pred.sum())
      pos_counter += len(pos_pred)
      tn_counter += int(neg_pred.sum())
      neg_counter += len(neg_pred)
      sites_bool += sites_found

print('-'*50)
print(f"Total: {tp_counter}/{pos_counter} site preds correct, {tn_counter}/{neg_counter} non-site preds correct. {sum(sites_bool)}/{len(sites_bool)} total sites recovered.")

In [None]:
## Other loop: puts the number of true and predicted sites into perspective
## against the number of S/T residues in each sequence.
# When interpreting, keep in mind:
# - There is no guarantee that the predicted sites contain all true sites
#   (although the model does seem to capture them pretty consistently when
#   site predictions are highly weighted).
# - There is no guarantee that all predicted sites are S/T either, but in the
#   author's experience that is almost always the case; it appears to be
#   extracted from the embeddings.

for key, gene_results in results.items():
  print(key)
  residues = info.loc[key, 'sequence']
  residue_arr = np.array(residues)
  # The first two stored entries are the true labels and the predictions.
  true_labels, pred_labels = gene_results[0], gene_results[1]

  # Residues and positions of the true sites.
  site_res = residue_arr[true_labels == 1]
  site_loc = np.where(true_labels == 1)
  # Residues and positions of the predicted sites.
  pred_res = residue_arr[pred_labels == 1]
  pred_loc = np.where(pred_labels == 1)

  # Total number of serine and threonine residues in the sequence.
  res_count = sum(residues.count(residue) for residue in ('S', 'T'))
  print(f"True sites: {len(site_res)}, pred sites: {len(pred_res)}, S/T count: {res_count}")