# Embedings precomputing

### Install dependancies and mount GDrive


In [22]:
# Check if running in Google Colab to run dumb-colab speciefic code
if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    COLAB = True
else:
    print('Not running on CoLab')
    COLAB=False


Not running on CoLab


In [47]:
# Other than torch dependencies
# ! pip install fair-esm # Switched to Ankh embeddings
! pip install GPUtil
! pip install pynvml
! pip install ankh
! pip install tqdm



In [81]:
import torch
print("Pytorch " + torch.__version__)
import esm
import pandas as pd
import numpy as np
import time
from GPUtil import showUtilization as gpu_usage

import ankh

from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import Trainer, TrainingArguments, EvalPrediction
from datasets import load_dataset

from sklearn import metrics
from scipy import stats
from functools import partial
from tqdm.auto import tqdm


# Create device agnostic code
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

Pytorch 1.13.1


'cuda'

### Load data from TSV


In [27]:
if COLAB:
    # Mount GDrive for Colab
    from google.colab import drive
    drive.mount('/content/drive')
    # Navigate Colab
    %cd /content/drive/MyDrive/Colab Notebooks/esm
    %ls

In [88]:
# Download dataset ~56MB
import requests

url = 'https://raw.githubusercontent.com/Dart-ilder/Cas_deep_search/main/esm/emb_pregen/cas_dataset_kira.tsv'
response = requests.get(url)

with open('./cas_dataset_kira.tsv', 'w') as file:
    file.write(response.text)

with open('./cas_dataset_kira.tsv') as file:
    cas_voc = pd.read_csv(file, delimiter="\t", comment='=')
cas_voc.head(3)

# Model download

In [None]:
model, tokenizer = ankh.load_base_model()
model.eval()
model.to(DEVICE)

In [None]:
cas_voc.head(3)

In [29]:
def cache_clear(): # In colab to clear GPU cache you need to wait some time after deleting tensor
    if COLAB:
        time.sleep(0.02)
    torch.cuda.empty_cache()
    
def gpu_util(): # To monitor how much more can we load GPU with data
    if DEVICE == "cuda":
        return torch.cuda.memory_reserved(DEVICE)/torch.cuda.get_device_properties(DEVICE).total_memory
    if DEVICE == "cpu":
        return 0

In [None]:
protein_sequences = list(cas_voc["Prot"].values)
protein_sequences = protein_sequences[:100]
inputs = tokenizer.batch_encode_plus(protein_sequences, 
                                        add_special_tokens=True, 
                                        padding=True,
                                        is_split_into_words=False, 
                                        return_tensors="pt")



In [None]:
inputs = inputs.to(DEVICE)
model.to(DEVICE)
gpu_usage()
with torch.no_grad():
    embeddings = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])[0]
gpu_usage()

# ESM leftovers. Legacy


In [None]:
# Load ESM-2 model
torch.cuda.empty_cache()
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()  # disables dropout for deterministic results

# device = "cpu" # My GPU doesn't have enough VRAM
# model.to(device)
device

### Preprocessing data and tokenisation

In [38]:
# Prepare data into format [ (label, seq), ]. We also cut * end of protein sequence symbol
# I leave Loci_id as an unicue identificator of an entry and Gene_family as a target label
data = list()
for id, seq in cas_voc.iterrows():
    data.append((f">{seq.Gene_id}|{seq.Gene_family}|{seq.Loci_id}", seq.Prot))


In [39]:
# Prepare data
# Takes in data in format of [ (label, seq), ] list. Applies tokens preprocessing
# Returns: only_lables_batched, only_seqs_batched, seq_tokenized_batched 
data = data[:]
batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1) # returns lengths of tokenized seqs without padding

#batch_tokens = batch_tokens.to(device) # batch_converter automatically detects and moves data to gpu. But mine has too little VRAM
#print(np.array(batch_tokens.to("cpu")).shape)


In [42]:
batch_tokens.shape

<function Tensor.type>

### Run the model and get sequence representations

In [None]:
# Extract per-residue representations (on CPU)
# Makes sence to extract only lasta layer representations. For 650M model it's layer 33
model = model.to(device)
batch_tokens_slice = batch_tokens.to(device)
with torch.inference_mode():
    results = model(batch_tokens, repr_layers=[33])
token_representations = results["representations"][33].to("cpu") # 1280 dimentional (for 650M model) representations for each residue in each data entry
token_representations.cpu()
print(np.array(token_representations).shape)


OutOfMemoryError: ignored

In [None]:
# Generate per-sequence representations via averaging
# NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
sequence_representations = []
print(np.array(token_representations).shape)
print(batch_lens)
for i, tokens_len in enumerate(batch_lens):
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))
print(np.array(sequence_representations).shape)

(20, 1376, 1280)
tensor([ 222,  103,  294, 1376,  190,  350,  325,  437, 1084,  328,  323, 1126,
         418,  296,  339,  205,  230,  319,  243, 1125])
(20,)


  print(np.array(sequence_representations).shape)
  print(np.array(sequence_representations).shape)
