<a href="https://colab.research.google.com/github/Fulmenius/Predicting-antibody-escape-with-ML/blob/main_script/models/ProtT5_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install torch torchvision torchaudio transformers sentencepiece accelerate --extra-index-url https://download.pytorch.org/whl/cu116

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/, https://download.pytorch.org/whl/cu116


In [None]:
import pandas as pd

# ACE2 test split. The path lives under /content/drive, so this presumes that
# Google Drive is already mounted in the runtime.
ACE2_test = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/data/ACE2_test_data.csv"
)


In [None]:
# Every split is stored in the same folder; keep that location in one place so
# it can be changed without editing each read call.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/data"


def load_split(target, split):
    """Read ``<DATA_DIR>/<target>_<split>_data.csv`` into a DataFrame.

    Args:
        target: Dataset prefix used in the file name, e.g. ``"LY16"``.
        split: Split name used in the file name, ``"train"`` or ``"test"``.

    Returns:
        The CSV contents as a ``pandas.DataFrame``.
    """
    return pd.read_csv(f"{DATA_DIR}/{target}_{split}_data.csv")


ACE2_train = load_split("ACE2", "train")

# Training splits, one frame per antibody.
LY16_train = load_split("LY16", "train")
LY555_train = load_split("LY555", "train")
REGN33_train = load_split("REGN33", "train")
REGN87_train = load_split("REGN87", "train")

# Matching test splits.
LY16_test = load_split("LY16", "test")
LY555_test = load_split("LY555", "test")
REGN33_test = load_split("REGN33", "test")
REGN87_test = load_split("REGN87", "test")

In [None]:
# Preview the first five rows to check which columns were loaded.
ACE2_test.head(n=5)

Unnamed: 0.1,Unnamed: 0,junction_aa,consensus_count,Label,Distance
0,478748,KNEQFNCYGPLPQYGFQRTYGLGY,1,0,7
1,543305,KNEGFNCYMPLNEYGFWRTWGRGY,1,0,7
2,128414,KNKGFNCYRPLWEYGFFRTSGVGW,1,1,8
3,369022,KNPGFNCYIPIRNYGFFTTVGQGW,2,1,10
4,314227,KNKGFNCYPPLQQYGFWTTTGSGW,1,1,8


In [None]:
from transformers import T5Tokenizer, T5EncoderModel
import torch
import pandas as pd
import numpy as np
import re

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model (both come from the same checkpoint name)
MODEL_NAME = 'Rostlab/prot_t5_xl_half_uniref50-enc'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
model = T5EncoderModel.from_pretrained(MODEL_NAME).to(device)

# Half precision is only used on the GPU. On the CPU the weights are cast to
# float32 explicitly instead of relying on whatever dtype the loader defaulted
# to, as the checkpoint itself is stored in half precision.
model = model.half() if device.type == 'cuda' else model.float()

# The model is only used for inference, so make eval mode explicit.
model = model.eval()

def process_chunk(chunk):
    """Embed one batch of sequences and return one vector per input row.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        chunk: DataFrame (slice) with a ``junction_aa`` column holding the
            amino-acid strings and a ``Label`` column.

    Returns:
        DataFrame with two columns: ``embeddings`` (one 1-D float32 numpy array
        per row, the mean of the encoder's last hidden state over the residue
        positions of that sequence) and ``Labels`` (copied from
        ``chunk["Label"]``). An empty ``chunk`` yields an empty frame with the
        same two columns.
    """
    # Nothing to tokenize: return an empty frame with the expected columns.
    if len(chunk) == 0:
        return pd.DataFrame({"embeddings": [], "Labels": []})

    sequence_examples = chunk["junction_aa"].tolist()

    # replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
    sequence_examples = [" ".join(re.sub(r"[UZOB]", "X", sequence)) for sequence in sequence_examples]

    # tokenize sequences and pad up to the longest sequence in the batch
    ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")

    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    # generate embeddings
    with torch.no_grad():
        embedding_rpr = model(input_ids=input_ids, attention_mask=attention_mask)

        # Only real residues may contribute to the per-sequence average:
        # padding positions (attention_mask == 0) and the end-of-sequence token
        # added by add_special_tokens=True are masked out. A plain mean over
        # the sequence axis would mix both into every embedding.
        residue_mask = attention_mask.bool()
        if tokenizer.eos_token_id is not None:
            residue_mask &= input_ids != tokenizer.eos_token_id

        # Average in float32 so the reduction does not lose precision when the
        # model runs in half precision.
        weights = residue_mask.unsqueeze(-1).float()
        hidden = embedding_rpr.last_hidden_state.float()
        # clamp() avoids a division by zero for a sequence without residues.
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)

    # one numpy vector per sequence, for use with pandas
    embeddings = list(pooled.cpu().numpy())

    # create new dataframe with embeddings and labels
    return pd.DataFrame({
        "embeddings": embeddings,
        "Labels": chunk["Label"].tolist()
    })


In [None]:
from tqdm import tqdm

def process_df(df, chunk_size=20):
    """Embed every row of ``df`` by running ``process_chunk`` batch by batch.

    Args:
        df: DataFrame in the format expected by ``process_chunk``.
        chunk_size: Number of consecutive rows handed to ``process_chunk`` per
            call; must be positive.

    Returns:
        The per-chunk results concatenated in input order, with a fresh
        0..n-1 index (each chunk result starts its own index at 0, so keeping
        those would produce repeated index labels). An empty ``df`` yields an
        empty frame with the ``embeddings`` and ``Labels`` columns.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # pd.concat raises on an empty list, so handle the empty frame up front.
    if len(df) == 0:
        return pd.DataFrame({"embeddings": [], "Labels": []})

    # Positional slicing via .iloc keeps the batching independent of df's index.
    starts = range(0, len(df), chunk_size)
    processed_chunks = [
        process_chunk(df.iloc[start:start + chunk_size])
        for start in tqdm(starts, desc="Processing chunks")
    ]

    return pd.concat(processed_chunks, ignore_index=True)

In [None]:
# Embed a 1000-row subsample of the ACE2 test split. The fixed random_state
# makes the draw repeatable, so re-running the notebook embeds the same rows.
ACE2_transformed = process_df(ACE2_test.sample(n=1000, random_state=42), 20)

Processing chunks: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]


In [None]:
# Sanity check: shape of the first per-sequence embedding vector.
ACE2_transformed["embeddings"].iat[0].shape

(1024,)

In [None]:
# Object cells are written to CSV via their string form, and numpy abbreviates
# arrays with more than 1000 elements using "...". The embedding vectors are
# therefore converted to plain lists first, otherwise most values are lost in
# the file. assign() builds a new frame, so ACE2_transformed keeps its arrays.
# index=False matches the other embedding files written by this notebook.
ACE2_transformed.assign(
    embeddings=ACE2_transformed["embeddings"].apply(lambda x: np.asarray(x).tolist())
).to_csv('/content/drive/MyDrive/Colab Notebooks/data/ACE2_embeddings_1000_test.csv', index=False)

In [None]:
# Per-antibody frames in the order LY16, LY555, REGN33, REGN87; the train and
# test lists must keep the same order because they are paired with one shared
# list of names later on.
antibodies_train = [
    LY16_train,
    LY555_train,
    REGN33_train,
    REGN87_train,
]
antibodies_test = [
    LY16_test,
    LY555_test,
    REGN33_test,
    REGN87_test,
]

In [None]:
antibodies_train_embeddings = []
antibodies_test_embeddings = []
names = ["LY16", "LY555", "REGN33", "REGN87"]

SUBSAMPLE_SIZE = 1000  # rows drawn from each dataset
SUBSAMPLE_SEED = 42    # fixed so that re-running the cell embeds the same rows

# Train and test go through exactly the same steps; only the input list, the
# list collecting the results and the split name in the file name differ.
for datasets, collected, split in (
    (antibodies_train, antibodies_train_embeddings, "train"),
    (antibodies_test, antibodies_test_embeddings, "test"),
):
    for dataset, name in zip(datasets, names):
        embedded = process_df(dataset.sample(n=SUBSAMPLE_SIZE, random_state=SUBSAMPLE_SEED), 20)
        collected.append(embedded)
        # Convert numpy arrays in the 'embeddings' column to lists before saving to CSV
        # (the frame stored in `collected` is this same object, so it holds lists too)
        embedded['embeddings'] = embedded['embeddings'].apply(lambda x: x.tolist())
        embedded.to_csv('/content/drive/MyDrive/Colab Notebooks/data/' + name + "_embedding_1000_" + split + ".csv", index=False)

Processing chunks: 100%|██████████| 50/50 [00:03<00:00, 13.69it/s]
Processing chunks: 100%|██████████| 50/50 [00:03<00:00, 13.66it/s]
Processing chunks: 100%|██████████| 50/50 [00:03<00:00, 14.05it/s]
Processing chunks: 100%|██████████| 50/50 [00:03<00:00, 13.03it/s]
Processing chunks: 100%|██████████| 50/50 [00:04<00:00, 11.27it/s]
Processing chunks: 100%|██████████| 50/50 [00:03<00:00, 13.81it/s]
Processing chunks: 100%|██████████| 50/50 [00:03<00:00, 13.66it/s]
Processing chunks: 100%|██████████| 50/50 [00:03<00:00, 13.01it/s]


In [None]:
antibodies_train_embeddings, antibodies_test_embeddings = [], []
names = ["LY16", "LY555", "REGN33", "REGN87"]

# One entry per split: (input frames, list collecting the results, file suffix).
# All train datasets are processed first, then all test datasets.
full_runs = (
    (antibodies_train, antibodies_train_embeddings, "_embedding_full_train.csv"),
    (antibodies_test, antibodies_test_embeddings, "_embedding_full_test.csv"),
)

for datasets, collected, suffix in full_runs:
    for dataset, name in zip(datasets, names):
        # Embed every row of the dataset (no subsampling here).
        embedded = process_df(dataset, 20)
        collected.append(embedded)
        # Turn each numpy vector into a plain list so the CSV keeps all values.
        embedded['embeddings'] = embedded['embeddings'].apply(lambda x: x.tolist())
        embedded.to_csv('/content/drive/MyDrive/Colab Notebooks/data/' + name + suffix, index=False)

Processing chunks: 100%|██████████| 1345/1345 [01:44<00:00, 12.92it/s]
Processing chunks: 100%|██████████| 755/755 [01:01<00:00, 12.31it/s]
Processing chunks: 100%|██████████| 6011/6011 [08:05<00:00, 12.37it/s]
Processing chunks: 100%|██████████| 1505/1505 [02:06<00:00, 11.86it/s]
Processing chunks: 100%|██████████| 150/150 [00:12<00:00, 11.61it/s]
Processing chunks: 100%|██████████| 84/84 [00:06<00:00, 12.53it/s]
Processing chunks: 100%|██████████| 668/668 [00:56<00:00, 11.92it/s]
Processing chunks: 100%|██████████| 168/168 [00:14<00:00, 11.98it/s]
