In [1]:
! pip install fair-esm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
print(torch.__version__)
import esm
import pandas as pd
import numpy as np
# Create device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

2.0.0+cu118


'cuda'

In [3]:

%cd /content/drive/MyDrive/Colab Notebooks/esm
%ls



/content/drive/MyDrive/Colab Notebooks/esm
cas_dataset_kira.tsv  cas_voc.fasta  esm.ipynb


In [4]:
with open('./cas_dataset_kira.tsv') as file:
    cas_voc = pd.read_csv(file, delimiter="\t", comment='=')
cas_voc.head()

Unnamed: 0,Loci_id,Strand,Genome_assembly,Chromosome,N1,Protein_id,B2,Gene_id,Gene_family,Type,Species,Start,End,Seq,Prot
0,RN99_05230,-,GCA_001296125.1,CP012714.1,991.0,ALF19893_1,+,cd09644,csn2,CAS-II-A,Fusobacterium_nucleatum_sub_vincentii_ChDC_F8_...,1100111,1100773,TTAGAAAATTTCACATAAATCATTATCTATAACAATTAAATTATCA...,MTFQYKGFNFKIDFEEKNIFSLIVENKRAYRKIIEDLVNNSNIEDG...
1,RN99_05235,-,GCA_001296125.1,CP012714.1,992.0,ALF20727_1,+,mkCas0206,cas2,CAS-II-A,Fusobacterium_nucleatum_sub_vincentii_ChDC_F8_...,1100770,1101075,TCATAAAACCACAAGCCTTTCATCTGTTTCTAAAAATGTCCCTTTT...,MRMLLFFDLPSVTNSDLKEYRKFRKFLIENGFSMLQESVYSKLLLH...
2,RN99_05240,-,GCA_001296125.1,CP012714.1,993.0,ALF19894_1,+,cd09720,cas1,CAS-II-A,Fusobacterium_nucleatum_sub_vincentii_ChDC_F8_...,1101080,1101958,CTATAACTCATCTTGAAAAAATCTCACTAATGATAAATCATTTGAG...,MSGWRVIIVTGRGKLDLRYNSISIRRDNGTDFIHIGEVNTLILETT...
3,RN99_05245,-,GCA_001296125.1,CP012714.1,994.0,ALF19895_1,+,mkCas0193,cas9,CAS-II-A,Fusobacterium_nucleatum_sub_vincentii_ChDC_F8_...,1101985,1106109,TTATAGTTTAATTTTCTTTACAAAAAGCCCTGTAACTGATTCTTCT...,MKKQKFSDYYLGFDIGTNSVGWCVTDLDYNVLRFNKKDMWGSRLFD...
4,Tel_12180,-,GCA_001447805.1,CP013099.1,2369.0,ALP53829_1,+,pfam09618,cas6f,CAS-I-F,Candidatus_Tenderia_electrophaga,2660425,2660991,TCAAAACCAAGGAATGGTGGCTTCGTTACTCAGACCGTAGGTGTTG...,MNRYQNIKILPDPEFPAPMLINALFAKLHRALVALQSREIGVSFPK...


In [28]:
# Load ESM-2 model
torch.cuda.empty_cache()
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()  # disables dropout for deterministic results

# device = "cpu" # My GPU doesn't have enough VRAM
# model.to(device)
device

'cuda'

In [29]:
# Prepare data into format [ (label, seq), ]. We also cut * end of protein sequence symbol
# I leave Loci_id as an unicue identificator of an entry and Gene_family as a target label
data = list()
for id, seq in cas_voc.iterrows():
    data.append((f">{seq.Gene_id}|{seq.Gene_family}|{seq.Loci_id}", seq.Prot))


In [30]:
# Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4)
# Example:
''' data = [
    ("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"),
    ("protein2", "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARAAAAAAAAAAAAAAAAAAAAAAAAAAKGVIEIVSGASRGIRLLQEE"),
    ("protein2 with mask","KALTARQQEVFDLIRD<mask>ISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
    ("protein3",  "K A <mask> I S Q"),
    ("pr4", "")
] '''
# Takes in data in format of [ (label, seq), ] list. Applies tokes preprocessing
# Returns: only_lables_batched, only_seqs_batched, seq_tokenized_batched 
data = data[:]
batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1) # returns lengths of tokenized seqs without padding

#batch_tokens = batch_tokens.to(device) # batch_converter automatically detects and moves data to gpu. But mine has too little VRAM
#print(np.array(batch_tokens.to("cpu")).shape)


In [31]:
batch_tokens.shape

torch.Size([40, 1376])

In [33]:
# Run the model and get sequence representations

# Extract per-residue representations (on CPU)
# Makes sence to extract only lasta layer representations. For 650M model it's layer 33
model = model.to(device)
batch_tokens_slice = batch_tokens.to(device)
with torch.inference_mode():
    results = model(batch_tokens, repr_layers=[33])
token_representations = results["representations"][33].to("cpu") # 1280 dimentional (for 650M model) representations for each residue in each data entry
token_representations.cpu()
print(np.array(token_representations).shape)


OutOfMemoryError: ignored

In [21]:
# Generate per-sequence representations via averaging
# NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
sequence_representations = []
print(np.array(token_representations).shape)
print(batch_lens)
for i, tokens_len in enumerate(batch_lens):
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))
print(np.array(sequence_representations).shape)

(20, 1376, 1280)
tensor([ 222,  103,  294, 1376,  190,  350,  325,  437, 1084,  328,  323, 1126,
         418,  296,  339,  205,  230,  319,  243, 1125])
(20,)


  print(np.array(sequence_representations).shape)
  print(np.array(sequence_representations).shape)


In [27]:
import gc
torch.cuda.empty_cache()
gc.collect()

8

In [16]:
del batch_tokens