<a href="https://colab.research.google.com/github/Anacatlisboa23/Tese/blob/main/ESM_(Evolutionary_Scale_Modeling).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ESM (Evolutionary Scale Modeling)

In [1]:
# Install the ESM package into the kernel's environment (%pip targets the running kernel).
# Pinned to the version this notebook was executed with so re-runs stay reproducible.
%pip install fair-esm==2.0.0


Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0


In [2]:
import torch
import esm
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from tqdm import tqdm
from multiprocessing import Pool, cpu_count


In [3]:
# Read the dataset CSV into a DataFrame
data = pd.read_csv("/content/FINAL (1).csv")

In [4]:
# The dataset is large, so only a small stratified sample is used for this experiment.

from sklearn.model_selection import train_test_split

label_column = 'Function'

subset_size = 0.05  # 5% of the original dataset
# The "test" split is kept as the sample; stratifying on the label preserves the
# class proportions. A fixed random_state makes the sample identical on every
# run, so downstream embeddings and outputs are reproducible.
_, dados = train_test_split(
    data,
    test_size=subset_size,
    stratify=data[label_column],
    random_state=42,
)

In [5]:

# Load the pretrained ESM-1b model and its alphabet
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
# Put the model in evaluation mode: disables dropout so that the embeddings
# computed below are deterministic (as recommended in the ESM quick-start).
model.eval()
batch_converter = alphabet.get_batch_converter()

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm1b_t33_650M_UR50S-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S-contact-regression.pt


In [6]:
# Convert a peptide sequence into a single mean-pooled ESM embedding
def embed_peptide_sequence_esm(sequence):
    """Return the mean layer-33 ESM representation of ``sequence``.

    The sequence is tokenised with the module-level ``batch_converter`` and
    passed through the module-level ``model`` with gradient tracking disabled.
    The layer-33 token representations at positions ``1 .. len(sequence)`` are
    then averaged over the position axis with NumPy.

    Parameters
    ----------
    sequence : str
        The peptide sequence to embed.

    Returns
    -------
    numpy.ndarray
        Mean of the selected token representations along axis 0.
    """
    # The batch converter takes a list of (label, sequence) pairs; only the
    # token tensor is needed here.
    _, _, tokens = batch_converter([("protein1", sequence)])

    with torch.no_grad():
        output = model(tokens, repr_layers=[33])

    # Index 0 is the leading special token, so it is skipped; the slice keeps
    # the len(sequence) positions that follow it.
    first, last = 1, len(sequence) + 1
    per_token = output["representations"][33][0, first:last].numpy()

    # Average over all tokens of the sequence
    return np.mean(per_token, axis=0)


In [7]:
# Helper used as the worker function for parallel processing
def process_sequence(seq):
    """Embed a single sequence by delegating to ``embed_peptide_sequence_esm``.

    Defined at module level so it can be passed to ``Pool.imap``.
    """
    embedding = embed_peptide_sequence_esm(seq)
    return embedding

In [8]:
# Split a DataFrame into consecutive chunks
def split_dataframe(df, chunk_size):
    """Cut ``df`` into consecutive slices of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : sliceable object with a length (a DataFrame in this notebook)
    chunk_size : int
        Step between slice starts, and the maximum length of each slice.

    Returns
    -------
    list
        The slices in order; the last one may be shorter. Empty when ``df``
        has length 0.
    """
    chunks = []
    for start in range(0, len(df), chunk_size):
        stop = start + chunk_size
        chunks.append(df[start:stop])
    return chunks

In [9]:
# Process one chunk: embed its sequences and append PCA-reduced embeddings
def process_chunk(df_chunk, n_components=50, random_state=0):
    """Embed every sequence of a chunk and attach PCA-reduced embeddings.

    The ``'Sequence'`` column is embedded in parallel with ``process_sequence``
    (one worker per CPU), the embeddings are stacked row-wise, reduced with PCA
    and returned next to the original columns as ``PC_1 .. PC_k``.

    NOTE: the PCA is fitted inside this function, i.e. separately on every
    chunk. Components obtained from different chunks therefore do not share a
    common basis; they are only directly comparable when all rows are processed
    in a single chunk.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        Must contain a ``'Sequence'`` column.
    n_components : int, default 50
        Requested number of principal components. It is capped at
        ``min(n_rows, n_features)`` because PCA cannot extract more components
        than that; a chunk with fewer rows than ``n_components`` (typically the
        last one) would otherwise raise a ``ValueError``. Such a chunk yields
        fewer ``PC_*`` columns.
    random_state : int or None, default 0
        Seed forwarded to ``PCA`` so that its randomized solver, when selected,
        gives the same result on every run.

    Returns
    -------
    pandas.DataFrame
        ``df_chunk`` with a fresh 0..n-1 index, followed by the ``PC_*`` columns.
    """
    sequences = df_chunk['Sequence'].tolist()
    with Pool(cpu_count()) as pool:
        embeddings = list(tqdm(pool.imap(process_sequence, sequences), total=len(sequences)))

    embeddings_matrix = np.vstack(embeddings)

    # Never ask PCA for more components than the data can support.
    effective_components = min(n_components, *embeddings_matrix.shape)
    pca = PCA(n_components=effective_components, random_state=random_state)
    reduced_embeddings = pca.fit_transform(embeddings_matrix)

    reduced_embeddings_df = pd.DataFrame(reduced_embeddings, columns=[f'PC_{i+1}' for i in range(reduced_embeddings.shape[1])])
    # Both frames need the same 0..n-1 index for the column-wise concat to align row by row.
    return pd.concat([df_chunk.reset_index(drop=True), reduced_embeddings_df], axis=1)


In [12]:
# Number of rows per chunk
chunk_size = 1000

# Split the DataFrame into chunks
# (alternative kept for reference: chunk the full dataset instead of the sample)
#df_chunks = split_dataframe(data, chunk_size)
df_chunks = split_dataframe(dados, chunk_size) # the 5% stratified sample (subset_size = 0.05)

In [13]:
# Run every chunk through process_chunk, then stack the per-chunk results
# into a single table with a fresh 0..n-1 index
df_processed_chunks = list(map(process_chunk, df_chunks))
df_final = pd.concat(df_processed_chunks, axis=0).reset_index(drop=True)

100%|██████████| 728/728 [31:34<00:00,  2.60s/it]


In [14]:
# index=False: df_final's index is just the 0..n-1 range produced by
# reset_index(drop=True); writing it would add an extra unnamed column that
# comes back as "Unnamed: 0" when the file is read again.
df_final.to_csv("/content/FINAL_0.5_ESMb1.csv", index=False)

In [15]:
# Show the resulting table as the cell output
df_final

Unnamed: 0.1,Unnamed: 0,Sequence,Function,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,...,PC_41,PC_42,PC_43,PC_44,PC_45,PC_46,PC_47,PC_48,PC_49,PC_50
0,7307.0,MKKEVLLDGVKCAGCANTVQERFSAIEGVESVEVDLATKKAVLESQ...,non-AMP,-2.674324,2.519575,-0.612338,0.667265,-0.631979,-0.395262,0.204265,...,-0.362485,-0.140925,0.520332,-0.390600,0.638793,0.204886,-0.116771,-0.105083,-0.406770,0.507198
1,,GLEKRKRKFFNKIKFK,antibacterian group ESKAPE,1.330316,-2.619189,0.317850,0.182221,-1.073149,0.298117,-0.683268,...,0.122030,0.160346,-0.085324,-0.130178,-0.025545,-0.236075,0.130499,-0.092831,0.015581,-0.237114
2,,TTLTLHNLCPYPVWWLVTPNNGGFPIIDNTPVVLG,antibacterian group ESKAPE,-1.240406,0.695341,-0.461211,0.340580,0.680983,-0.695358,0.734466,...,-0.066311,0.336644,0.127019,-0.548323,-0.060573,-0.174036,-0.117946,-0.523621,-0.098581,-0.202020
3,6606.0,MDNGISQGAKLALKGEIQGNIISPYIFTETDPESVLAKEESFGPIL...,non-AMP,-1.985490,1.808194,-0.443566,1.031801,-0.467157,-0.532164,-0.224011,...,-0.591940,0.079630,0.204924,-0.030245,0.160973,-0.410326,0.106417,0.024906,0.000013,-0.329337
4,9137.0,MEIKMSYYHILIEVNDHISTIEQTRDIELFDIIELKPYLHSILLPY...,non-AMP,0.750147,-0.577800,-2.127717,-1.136671,0.127896,1.431369,-0.318355,...,-0.025148,0.245148,-0.229667,-0.145633,0.253054,0.351364,-0.138383,-0.021034,-0.124646,0.085957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,,FLPIVKKLLRQMF,antibacterian group ESKAPE,-1.272942,1.077833,2.475266,-1.780239,-0.336054,-0.566830,-0.344362,...,0.116095,-0.038327,0.014759,0.260599,0.156275,0.037586,0.114580,0.283540,-0.083782,0.249746
724,,VWRKWRRFWKR,antibacterian group ESKAPE,1.275946,-2.344171,1.472109,0.830356,0.798239,-0.012569,-1.681425,...,-0.193737,0.150957,0.085082,0.114268,-0.126475,-0.038716,-0.044470,-0.091786,-0.076254,0.048115
725,5320.0,MFDEDQHRATHDVAPSFIFKGFLISTNDQEYYYSETQIVDGVKQYL...,non-AMP,0.782304,-0.848615,-1.934166,-1.023245,0.503399,0.760285,0.093044,...,0.062515,-0.019718,0.071569,-0.227723,0.231112,-0.155339,-0.120899,0.050898,-0.032577,-0.051188
726,,SKVWRHWRRFWHRAHRLH,antibacterian group ESKAPE,1.640936,-2.627699,0.747579,1.128816,0.987672,-0.479797,-0.926586,...,0.097961,0.042749,0.061135,0.414840,-0.182114,-0.004878,0.081419,0.276158,0.115000,-0.199617
