In [10]:

from urllib import request
from tqdm import tqdm
import os, sys
from collections import OrderedDict
import numpy as np
import csv

class ProteinSequence:
    """
    Simple object for storing protein info
    """
    def __init__(self, sequence: str=None, protein_id: str=None, filename: str=None, info: str=None):
        self.info = info
        self.seq = sequence
        self.protein_id = protein_id
        self.len = None
        self.ligand_idxs = [] # Datapoint line idxs in papyrus
        self.n_ligands = 0  # Number of datapoints in papyrus
    
    def __len__(self):
        return len(self.seq)

    def __repr__(self):
        return f"Protein Seq: {self.protein_id}, length: \
                {len(self)}, ligands: {self.n_ligands}"


def get_names(target_proteins):
    """
    Returns a dictionary of protein names for the target proteins (from Fasta headers)
    """
    folder = '/home/andrius/datasets/foldedPapyrus/proteins'
    proteins = [load_protein_sequence(pid, folder) for pid in target_proteins]
    headers = [p.info for p in proteins]
    name_strings = [h.split('|')[2].split('OS')[0] for h in headers]
    tags = [n.split(' ')[0] for n in name_strings]
    names = [' '.join([n.split(' ')[1:]]) for n in name_strings]
    target_proteins.sort()
    tag_dict  = OrderedDict(dict(zip(target_proteins, tags)))
    name_dict = OrderedDict(dict(zip(target_proteins, names)))
    
    return name_dict, tag_dict

#def download_fasta(protein_id, output_folder):
#    """
#    Retrieves the fasta file from Uniprot based on protein ID
#    """
#    try:
#        # remote_url = f'https://uniprot.org/uniprot/{protein_id}.fasta'
#        remote_url = f'https://rest.uniprot.org/uniprotkb/{protein_id}.fasta'
#        local_file = f'{output_folder}/{protein_id}.fasta'
#        # open(local_file, 'a').close()
#        request.urlretrieve(remote_url, local_file)
#    except:
#        print(f'Fasta file for {protein_id} could not be downloaded.')

def download_fasta(protein_id, output_folder):
    """
    Retrieves the fasta file from Uniprot based on protein ID using the REST API
    """
    try:
        # Use the correct UniProt REST API URL
        remote_url = f'https://rest.uniprot.org/uniprotkb/{protein_id}.fasta'
        local_file = f'{output_folder}/{protein_id}.fasta'
        
        # Download the file
        request.urlretrieve(remote_url, local_file)
        
        # Verify the file was downloaded and contains data
        if os.path.getsize(local_file) == 0:
            print(f'Fasta file for {protein_id} is empty. Protein may not exist in UniProt.')
            os.remove(local_file)  # Remove empty file
            return False
            
    except Exception as e:
        print(f'Fasta file for {protein_id} could not be downloaded. Error: {str(e)}')
        # Remove any partially downloaded file
        if os.path.exists(local_file):
            os.remove(local_file)
        return False
    
    return True


def read_fasta(file_path):
    """
    Reads a .fasta file and returns a Protein Sequence object
    """
    try:

        with open(file_path, 'r') as fasta:
            content = fasta.readlines()
            header = content[0]
            sequence = content[1:]
            stripped = [line.strip('\n') for line in sequence]
            sequence = ''.join(stripped)
            protein_id = header.split('|')[1].strip()
        
        protein_seq = ProteinSequence(sequence=sequence, protein_id=protein_id, info=header)
        return protein_seq
    except:
        return None

def load_protein_sequence(protein_id, data_folder) -> ProteinSequence:
    """
    If needed download the .fasta file based on protein ID and return a 
    ProteinSequence object
    """

    # Check if downloaded
    filename = f'{data_folder}/{protein_id}.fasta'
    if not os.path.isfile(filename):
        download_fasta(protein_id, data_folder)
        
    protein_seq = read_fasta(filename)
    return protein_seq

def get_papyrus_proteins(papyrus_file, output_folder, start=0, end=61085165):
    """
    Scans through the whole papyrus dataset, and downloads all required .fasta files from 
    uniprot, based on protein accesion IDs (attribute[9] in papyrus)

    Returns a dict of ProteinSequence objects indexed via protein_ids
    """
    if not os.path.isfile(papyrus_file):
        print('Papyrus file not found.')
        sys.exit()

    end = 61085165 if not end else end
    
    with open(papyrus_file, 'r') as papyrus:

        header = papyrus.readline()
        proteins = {}

        for idx in tqdm(range(start, end)):
            entry = papyrus.readline()
            attributes = entry.split('\t')
            protein_id = attributes[9]

            if not proteins.get(protein_id):
                p_sequence = load_protein_sequence(protein_id, data_folder=output_folder)
                proteins[protein_id] = p_sequence
            else:
                proteins[protein_id].n_ligands += 1
    return proteins

def get_proteins(pids, output_folder):
    """
    Downloads all required .fasta files from 
    uniprot, based on protein accesion IDs (attribute[9] in papyrus)

    Returns a dict of ProteinSequence objects indexed via protein_ids
    """
    proteins = {}
    for protein_id in tqdm(pids):
        p_sequence = load_protein_sequence(protein_id, data_folder=output_folder)
        proteins[protein_id] = p_sequence
    return proteins


def get_families(papyrus_targets_file: str, targets: list):

    with open(papyrus_targets_file, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader)
        families = []
        target_families = {}
        for line in reader:
            target = line[0].strip('_WT')
            classes = line[5].split('->')
            family = classes[1] if len(classes) > 1 else classes[0]
            if family == '':
                family = 'Unknown'
            families += [family]
            target_families[target] = family

    families = list(set(families))
    ## Create a dictionary of families and their members

    family_members = {}
    for protein in targets:
        family = target_families[protein]
        if family_members.get(family) is None:
            family_members[family] = [protein]
        else:
            family_members[family] += [protein]

    ## Rename some families
    modified_families = OrderedDict()
    modified_families['Other'] = []
    modified_families['G protein-coupled receptor'] = []
    cutoff = 25 # Number of families to keep, rest will be grouped into 'Other'

    for family in families[:cutoff]:
        if 'Other' in family:
            modified_families['Other'] += family_members[family]
        # Rename all GPCRs
        elif 'protein-coupled' in family:
            modified_families['G protein-coupled receptor'] += family_members[family]
        else:
            modified_families[family] = family_members[family]

    # All small families are now in 'Other' category
    for family in families[cutoff:]:
        modified_families['Other'] += family_members[family]

    #######
    # Create a dictionary of families that is ordered by the number of members
    families, family_members = zip(*modified_families.items())
    n_members = [len(v) for v in family_members]
    # Get sort order
    order = np.argsort(n_members)[::-1]
    n_members.sort()
    fams = list(families)
    families = [fams[i] for i in order]
    targets = [family_members[i] for i in order]
    ordered_families = OrderedDict(zip(families, n_members[::-1]))
    families = list(family_members.keys())

    return ordered_families

def parse_family_data(papyrus_targets_file: str):

    with open(papyrus_targets_file, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader)
        families = []
        target_families = {}
        for line in reader:
            target = line[0].strip('_WT')
            classes = line[5].split('->')
            family = classes[1] if len(classes) > 1 else classes[0]
            if family == '':
                family = 'Unknown'
            families += [family]
            target_families[target] = family

    families = list(set(families))
    ## Create a dictionary of families and their members

    family_members = {}
    for protein in targets:
        family = target_families[protein]
        if family_members.get(family) is None:
            family_members[family] = [protein]
        else:
            family_members[family] += [protein]

    ## Rename some families
    modified_families = OrderedDict()
    modified_families['Other'] = []
    modified_families['G protein-coupled receptor'] = []
    cutoff = 25 # Number of families to keep, rest will be grouped into 'Other'

    for family in families[:cutoff]:
        if 'Other' in family:
            modified_families['Other'] += family_members[family]
        # Rename all GPCRs
        elif 'protein-coupled' in family:
            modified_families['G protein-coupled receptor'] += family_members[family]
        else:
            modified_families[family] = family_members[family]

    # All small families are now in 'Other' category
    for family in families[cutoff:]:
        modified_families['Other'] += family_members[family]

    #######
    # Create a dictionary of families that is ordered by the number of members
    families, family_members = zip(*modified_families.items())
    n_members = [len(v) for v in family_members]
    # Get sort order
    order = np.argsort(n_members)[::-1]
    n_members.sort()
    fams = list(families)
    families = [fams[i] for i in order]
    targets = [family_members[i] for i in order]
    ordered_families = OrderedDict(zip(families, n_members[::-1]))
    families = list(family_members.keys())

    return ordered_families

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
target_df = pd.read_csv('prot_orgs.csv', sep=',')

In [4]:
target_df.iloc[50]

name2_entry         SLCO2B3
human_uniprot_id        NaN
mouse_uniprot_id        NaN
rat_uniprot_id          NaN
Name: 50, dtype: object

In [5]:
targets = target_df['human_uniprot_id'].tolist()
print(len(targets))

52


In [6]:
os.makedirs('proteins', exist_ok=True)
p = load_protein_sequence(targets[0], data_folder='proteins')
print(p)

Protein Seq: P05177, length:                 516, ligands: 0


In [12]:
os.makedirs('serra', exist_ok=True)

proteins = get_proteins(targets, output_folder='serra')

  0%|          | 0/52 [00:00<?, ?it/s]

100%|██████████| 52/52 [00:11<00:00,  4.44it/s]

Fasta file for nan could not be downloaded. Error: HTTP Error 400: Bad Request





In [14]:
sequences = [p.seq if p is not None else None for p in proteins.values()]
print(sequences)
print(len(sequences))

['MALSQSVPFSATELLLASAIFCLVFWVLKGLRPRVPKGLKSPPEPWGWPLLGHVLTLGKNPHLALSRMSQRYGDVLQIRIGSTPVLVLSRLDTIRQALVRQGDDFKGRPDLYTSTLITDGQSLTFSTDSGPVWAARRRLAQNALNTFSIASDPASSSSCYLEEHVSKEAKALISRLQELMAGPGHFDPYNQVVVSVANVIGAMCFGQHFPESSDEMLSLVKNTHEFVETASSGNPLDFFPILRYLPNPALQRFKAFNQRFLWFLQKTVQEHYQDFDKNSVRDITGALFKHSKKGPRASGNLIPQEKIVNLVNDIFGAGFDTVTTAISWSLMYLVTKPEIQRKIQKELDTVIGRERRPRLSDRPQLPYLEAFILETFRHSSFLPFTIPHSTTRDTTLNGFYIPKKCCVFVNQWQVNHDPELWEDPSEFRPERFLTADGTAINKPLSEKMMLFGMGKRRCIGEVLAKWEIFLFLAILLQQLEFSVPPGVKVDLTPIYGLTMKHARCEHVQARLRFSIN', 'MELSVLLFLALLTGLLLLLVQRHPNTHDRLPPGPRPLPLLGNLLQMDRRGLLKSFLRFREKYGDVFTVHLGPRPVVMLCGVEAIREALVDKAEAFSGRGKIAMVDPFFRGYGVIFANGNRWKVLRRFSVTTMRDFGMGKRSVEERIQEEAQCLIEELRKSKGALMDPTFLFQSITANIICSIVFGKRFHYQDQEFLKMLNLFYQTFSLISSVFGQLFELFSGFLKYFPGAHRQVYKNLQEINAYIGHSVEKHRETLDPSAPKDLIDTYLLHMEKEKSNAHSEFSHQNLNLNTLSLFFAGTETTSTTLRYGFLLMLKYPHVAERVYREIEQVIGPHRPPELHDRAKMPYTEAVIYEIQRFSDLLPMGVPHIVTQHTSFRGYIIPKDTEVFLILSTALHDPHYFEKPDAFNPDHFLDANGALKKTEAFIPFSLGKRICLGEGIARAELFLFFTTILQNFSMASPVAPEDIDLTPQECGVG

In [15]:
target_df['sequence'] = sequences

In [16]:
target_df.head()

Unnamed: 0,name2_entry,human_uniprot_id,mouse_uniprot_id,rat_uniprot_id,sequence
0,CYP1A2,P05177,P00186,P04799,MALSQSVPFSATELLLASAIFCLVFWVLKGLRPRVPKGLKSPPEPW...
1,CYP2B6,P20813,,,MELSVLLFLALLTGLLLLLVQRHPNTHDRLPPGPRPLPLLGNLLQM...
2,CYP2C9,P11712,,,MDSLVVLVLCLSCLLLLSLWRQSSGRGKLPPGPTPLPVIGNILQIG...
3,CYP2C19,P33261,,,MDPFVVLVLCLSCLLLLSIWRQSSGRGKLPPGPTPLPVIGNILQID...
4,CYP2D6,P10635,,,MGLEALVPLAVIVAIFLLLVDLMHRRQRWAARYPPGPLPLPGLGNL...


In [None]:
target_df.tail()   

Unnamed: 0,name2_entry,human_uniprot_id,mouse_uniprot_id,rat_uniprot_id,sequence
46,SMPDL3A,Q92484,P70158,Q641Z7,MALVRALVCCLLTAWHCRSGLGLPVAPAGGRNPPPAIGQFWHVTDL...
47,GABPA,Q06546,Q00422,A0A8I6GML2,MTKREAEELIEIEIDGTEKAECTEESIVEQTYAPAECVSQAIDINE...
48,HRH1,P35367,P70174,P31390,MSLPNSSCLLEDKMCEGNKTTMASPQLMPLVVVLSTICLVTVGLNL...
49,SLCO1B1,Q9Y6L6,,,MDQNQHLNKTAEAQPSENKKTRYCNGLKMFLAALSLSFIAKTLGAI...
51,SLCO2B1,O94956,Q8BXB6,Q9JHI3,MGPRIGPAGEVPQVPDKETKATMGTENTPGGKASPDPQDVRPSVFH...


In [23]:
import torch
import esm
import time
import numpy as np
import warnings

# Optional: Suppress a common warning from a specific ESM version about a deprecated function
warnings.filterwarnings("ignore", category=UserWarning, module='esm.pretrained')

def get_esm_embeddings(protein_sequences, model_name="esm2_t33_650M_UR50D", 
                       repr_layer=None, include_bos_eos=False,
                       truncation_seq_length=None, device=None):
    """
    Extracts protein embeddings using an ESM model.

    Args:
        protein_sequences (list of str or str): A list of protein sequences (e.g., ["MKTV...", "MSK..."])
                                                 or a single protein sequence string.
        model_name (str): Name of the ESM model to use.
                          Examples: "esm2_t6_8M_UR50D", "esm2_t12_35M_UR50D",
                                    "esm2_t30_150M_UR50D", "esm2_t33_650M_UR50D",
                                    "esm1b_t33_650M_UR50S" (ESM-1b)
        repr_layer (int, optional): The layer from which to extract representations.
                                    If None, defaults to the last layer.
        include_bos_eos (bool): Whether to include the embeddings for BOS (Beginning Of Sequence)
                                and EOS (End Of Sequence) tokens in per-residue embeddings.
                                For sequence-level embeddings, these are typically excluded before averaging.
        truncation_seq_length (int, optional): If set, sequences longer than this will be truncated.
                                               The ESM models have a context window (e.g., 1024 for ESM-1b,
                                               ESM-2 can often handle longer based on memory).
                                               If None, no explicit truncation by this function, but model
                                               might have its own limits.
        device (str, optional): "cuda" for GPU, "cpu" for CPU. If None, autodetects.

    Returns:
        dict: A dictionary with keys:
            'per_residue_embeddings' (list of np.ndarray):
                List of per-residue embeddings. Each element is a NumPy array of shape
                (seq_len, embedding_dim). seq_len depends on `include_bos_eos`.
            'sequence_embeddings' (list of np.ndarray):
                List of sequence-level embeddings (mean-pooled over residues).
                Each element is a NumPy array of shape (embedding_dim,).
            'model_name' (str): The name of the model used.
            'representation_layer' (int): The layer number from which embeddings were extracted.
    """
    if isinstance(protein_sequences, str):
        protein_sequences = [protein_sequences]

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load ESM model
    try:
        model, alphabet = esm.pretrained.load_model_and_alphabet(model_name)
    except Exception as e:
        print(f"Error loading model {model_name}. Available models from esm.pretrained.ESM_PRETRAINED_MODEL_WEIGHTS:")
        # print(esm.pretrained.ESM_PRETRAINED_MODEL_WEIGHTS) # Might be too verbose
        print(f"Common choices: esm2_t33_650M_UR50D, esm1b_t33_650M_UR50S, esm2_t6_8M_UR50D")
        raise e

    model = model.to(device)
    model.eval()  # Set model to evaluation mode

    if repr_layer is None:
        repr_layer = model.num_layers # Default to the last layer
    
    print(f"Extracting embeddings from layer: {repr_layer}")
    print(f"Model has {model.num_layers} layers and embedding dimension {model.embed_dim}.")


    batch_converter = alphabet.get_batch_converter()

    per_residue_embeddings_list = []
    sequence_embeddings_list = []

    # Prepare data for batching
    # The batch_converter expects a list of tuples (name, sequence)
    data_for_batching = []
    for i, seq in enumerate(protein_sequences):
        seq_id = f"protein_{i+1}"
        if truncation_seq_length is not None and len(seq) > truncation_seq_length:
            print(f"Warning: Sequence {seq_id} (length {len(seq)}) is longer than truncation_seq_length ({truncation_seq_length}). Truncating.")
            seq = seq[:truncation_seq_length]
        data_for_batching.append((seq_id, seq))
    
    # Batch conversion
    # This adds BOS and EOS tokens, and handles padding for sequences of different lengths in a batch
    batch_labels, batch_strs, batch_tokens = batch_converter(data_for_batching)
    batch_tokens = batch_tokens.to(device)

    with torch.no_grad():
        # Extract per-residue representations
        # `results` is a dictionary with various outputs
        results = model(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
        
        # token_representations shape: (batch_size, seq_len_with_bos_eos, embed_dim)
        token_representations = results["representations"][repr_layer]

    # Process each sequence in the batch
    for i, protein_seq_str in enumerate(batch_strs):
        # token_repr shape: (seq_len_with_bos_eos, embed_dim)
        token_repr = token_representations[i] 

        # Per-residue embeddings
        if include_bos_eos:
            # Includes BOS and EOS tokens
            per_residue_emb = token_repr.cpu().numpy()
        else:
            # Exclude BOS and EOS token embeddings
            # BOS is at index 0, EOS is at len(protein_seq_str) + 1
            per_residue_emb = token_repr[1 : len(protein_seq_str) + 1].cpu().numpy()
        per_residue_embeddings_list.append(per_residue_emb)

        # Sequence-level embedding (mean pooling over actual residues)
        # We use token_repr[1 : len(protein_seq_str) + 1] to average only over actual amino acid residues
        sequence_emb = token_repr[1 : len(protein_seq_str) + 1].mean(0).cpu().numpy()
        sequence_embeddings_list.append(sequence_emb)
        
    return {
        "per_residue_embeddings": per_residue_embeddings_list,
        "sequence_embeddings": sequence_embeddings_list,
        "model_name": model_name,
        "representation_layer": repr_layer
    }

# --- Example Usage ---
def process_sequence(seq, model_name="esm2_t12_35M_UR50D", repr_layer=12, include_bos_eos=False):
    print("--- Processing a list of sequences with default ESM-2 model ---")
    start_time = time.time()

    print("\n--- Example with GPU (if available) and larger ESM-2 model ---")
    if torch.cuda.is_available():
        start_time = time.time()
        try:
            embeddings_data_gpu = get_esm_embeddings(
                seq, # Just one sequence for quicker GPU demo
                model_name="esm2_t33_650M_UR50D", # Larger model
                # repr_layer will default to last layer (33)
                device="cuda"
            )
            end_time = time.time()
            print(f"Time taken on GPU: {end_time - start_time:.2f} seconds")
            print(f"Model used: {embeddings_data_gpu['model_name']}")
            print(f"Representation layer: {embeddings_data_gpu['representation_layer']}")
            per_res_emb_g = embeddings_data_gpu["per_residue_embeddings"][0]
            seq_emb_g = embeddings_data_gpu["sequence_embeddings"][0]
            print(f"  Per-residue embedding shape: {per_res_emb_g.shape}")
            print(f"  Sequence embedding shape: {seq_emb_g.shape}")

        except Exception as e:
            print(f"An error occurred during GPU processing: {e}")
    else:
        print("CUDA (GPU) not available. Skipping GPU example.")

    return seq_emb_g

In [25]:
# Drop SLC
# print(target_df.loc[50])


In [26]:
# Import tqdm for displaying progress bars during iteration
from tqdm import tqdm
emb_dict = {}

for i, row in tqdm(target_df.iterrows()):

    pid = i
    seq = row['sequence']
    if seq is None:
        print(f"Skipping {pid} due to missing sequence.")
        continue
    emb = process_sequence(seq)
    emb_dict[pid] = emb

0it [00:00, ?it/s]

--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /home/serramelendezcsm/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /home/serramelendezcsm/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt
Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 100.06 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (516, 1280)
  Sequence embedding shape: (1280,)


1it [01:40, 100.10s/it]

--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


2it [01:44, 43.57s/it] 

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.99 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (491, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


3it [01:47, 25.25s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.44 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (490, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


4it [01:51, 16.70s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.61 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (490, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


5it [01:55, 12.08s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.89 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (497, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


6it [01:58,  9.13s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (503, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


7it [02:01,  7.28s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.46 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (1338, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


8it [02:05,  6.06s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.45 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (1333, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


9it [02:08,  5.23s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.41 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (527, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


10it [02:12,  4.66s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (520, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


11it [02:15,  4.27s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (501, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


12it [02:18,  3.99s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.36 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (375, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


13it [02:22,  3.80s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.36 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (292, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


14it [02:25,  3.67s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.36 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (331, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


15it [02:28,  3.58s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.36 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (532, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


16it [02:32,  3.52s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (848, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


17it [02:35,  3.48s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.36 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (352, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


18it [02:39,  3.44s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.36 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (434, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


19it [02:42,  3.42s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.36 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (295, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


20it [02:45,  3.40s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.35 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (222, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


21it [02:49,  3.42s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.48 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (435, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


22it [02:52,  3.40s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.35 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (201, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


23it [02:56,  3.42s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.46 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (1159, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


24it [02:59,  3.46s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.55 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (2016, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


25it [03:03,  3.44s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (598, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


26it [03:06,  3.42s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (178, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


27it [03:09,  3.40s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (481, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


28it [03:13,  3.40s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (630, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda
Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.


29it [03:16,  3.45s/it]

Time taken on GPU: 3.58 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (2240, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


30it [03:20,  3.43s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (620, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


31it [03:23,  3.43s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.40 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (617, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


32it [03:26,  3.41s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (227, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


33it [03:30,  3.41s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.40 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (572, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


34it [03:33,  3.40s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.38 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (465, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


35it [03:37,  3.39s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (477, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


36it [03:40,  3.39s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (413, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


37it [03:43,  3.38s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.38 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (164, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


38it [03:47,  3.38s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (360, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


39it [03:50,  3.38s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.38 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (460, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


40it [03:53,  3.38s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (466, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


41it [03:57,  3.42s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.50 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (590, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


42it [04:00,  3.41s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.38 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (450, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


43it [04:04,  3.40s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (479, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


44it [04:07,  3.39s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.38 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (502, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


45it [04:10,  3.39s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (505, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


46it [04:14,  3.39s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (468, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


47it [04:17,  3.38s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.37 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (453, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


48it [04:21,  3.38s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.38 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (454, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


49it [04:24,  3.38s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.38 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (487, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


50it [04:27,  3.39s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (691, 1280)
  Sequence embedding shape: (1280,)
--- Processing a list of sequences with default ESM-2 model ---

--- Example with GPU (if available) and larger ESM-2 model ---
Using device: cuda


51it [04:31,  5.32s/it]

Extracting embeddings from layer: 33
Model has 33 layers and embedding dimension 1280.
Time taken on GPU: 3.39 seconds
Model used: esm2_t33_650M_UR50D
Representation layer: 33
  Per-residue embedding shape: (709, 1280)
  Sequence embedding shape: (1280,)





In [None]:
#with open('pcmol_targets.txt', 'r') as f:
#    pcmol_targets = f.readlines()

In [None]:
## Check overlap between serra and pcmol targets
#targets_set = set(target_df['human_uniprot_id'].tolist())
#pcmol_targets_set = set([line.strip() for line in pcmol_targets])
#overlap = targets_set.intersection(pcmol_targets_set)
#print(f'Length of targets: {len(targets_set)}, Length of Pcmol targets: {len(pcmol_targets_set)}')
#print(f"Overlap between current and Pcmol targets: {len(overlap)}")

Length of targets: 51, Length of Pcmol targets: 0
Overlap between current and Pcmol targets: 0


In [None]:
## Check overlap between serra and pcmol targets
#targets_set = set(target_df['human_uniprot_id'].tolist())
#pcmol_targets_set = set([line.strip() for line in pcmol_targets])
#overlap = targets_set.intersection(pcmol_targets_set)
#print(f'Length of targets: {len(targets_set)}, Length of Pcmol targets: {len(pcmol_targets_set)}')
#print(f"Overlap between current and Pcmol targets: {len(overlap)}")

Length of targets: 51, Length of Pcmol targets: 0
Overlap between current and Pcmol targets: 0


In [None]:
## Check overlap between serra and pcmol targets
#targets_set = set(target_df['human_uniprot_id'].tolist())
#pcmol_targets_set = set([line.strip() for line in pcmol_targets])
#overlap = targets_set.intersection(pcmol_targets_set)
#print(f'Length of targets: {len(targets_set)}, Length of Pcmol targets: {len(pcmol_targets_set)}')
#print(f"Overlap between current and Pcmol targets: {len(overlap)}")

Length of targets: 51, Length of Pcmol targets: 0
Overlap between current and Pcmol targets: 0


In [31]:
protein_ids = np.array(list(emb_dict.keys()))
embeddings = np.array(list(emb_dict.values()))
np.save('embeddings.npy', embeddings)

In [32]:
target_df.to_csv('targets_w_sequences.csv', index=False)

In [33]:
embeddings[0]

array([-0.02037282, -0.09056199, -0.07410282, ..., -0.12021448,
       -0.04532661,  0.11132984], dtype=float32)

In [34]:
from sklearn.decomposition import PCA
import numpy as np

# Convert dictionary to array for PCA
embeddings_array = np.array(list(emb_dict.values()))
print(embeddings_array.shape)
# embeddings_array = embeddings_array.squeeze(1)

# Initialize PCA with 256 components
pca = PCA(n_components=2)

# Fit and transform the embeddings
reduced_embeddings = pca.fit_transform(embeddings_array)

# Create a dictionary mapping protein IDs to their reduced embeddings
reduced_emb_dict = {pid: emb for pid, emb in zip(emb_dict.keys(), reduced_embeddings)}

# Print some information about the dimensionality reduction
print(f"Original embedding shape: {embeddings_array.shape}")
print(f"Reduced embedding shape: {reduced_embeddings.shape}")
print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_):.3f}")



(51, 1280)
Original embedding shape: (51, 1280)
Reduced embedding shape: (51, 2)
Explained variance ratio: 0.545


In [35]:
target_df.head()

Unnamed: 0,name2_entry,human_uniprot_id,mouse_uniprot_id,rat_uniprot_id,sequence
0,CYP1A2,P05177,P00186,P04799,MALSQSVPFSATELLLASAIFCLVFWVLKGLRPRVPKGLKSPPEPW...
1,CYP2B6,P20813,,,MELSVLLFLALLTGLLLLLVQRHPNTHDRLPPGPRPLPLLGNLLQM...
2,CYP2C9,P11712,,,MDSLVVLVLCLSCLLLLSLWRQSSGRGKLPPGPTPLPVIGNILQIG...
3,CYP2C19,P33261,,,MDPFVVLVLCLSCLLLLSIWRQSSGRGKLPPGPTPLPVIGNILQID...
4,CYP2D6,P10635,,,MGLEALVPLAVIVAIFLLLVDLMHRRQRWAARYPPGPLPLPGLGNL...


In [36]:
papyrus_targets.head() 

NameError: name 'papyrus_targets' is not defined

In [None]:
papyrus_targets.head() 

NameError: name 'papyrus_targets' is not defined

In [None]:
papyrus_targets.head() 

NameError: name 'papyrus_targets' is not defined

In [None]:
embeddings[0]

array([-0.02037282, -0.09056199, -0.07410282, ..., -0.12021448,
       -0.04532661,  0.11132984], dtype=float32)

In [38]:
emb_df.head()

NameError: name 'emb_df' is not defined

In [None]:
emb_df = pd.DataFrame(reduced_emb_dict)
emb_df = emb_df.transpose()
emb_df['accession']=emb_df.index
emb_df['accession']=emb_df['accession'].astype(int)

## Targets dataframe
papyrus_targets = pd.read_csv('/home/andrius/datasets/molecules/subsets/all/targets.csv')
## Rename target_id to accession
target_df.rename(columns={'target_id': 'accession'}, inplace=True)
print(targets.columns)
## Add 'c3' column from targets to poses_grouped

# klifs = pd.read_csv('klifs.csv')
# print(klifs.columns)

## Rename target_df.human_uniprot_id to accession
target_df.rename(columns={'human_uniprot_id': 'accession'}, inplace=True)
papyrus_targets.rename(columns={'target_id': 'accession'}, inplace=True)
print(targets.columns)

# targ_df = target_df.merge(papyrus_targets[['accession', 'c3']], on='accession', how='left')


## Remove NaN from pchembl_value_Meao
# 
# 
emb_df = emb_df.merge(targ_df, on='accession', how='left')



Index(['accession', 'UniProtID', 'mols_in_papyrus', 'mols_in_papyrus_pp',
       'name', 'Status', 'Organism', 'Classification', 'Length', 'Sequence',
       'TID', 'family', 'c1', 'c2', 'c3', 'c4', 'c5', 'c1_', 'c1_*', 'orphan'],
      dtype='object')
Index(['accession', 'UniProtID', 'mols_in_papyrus', 'mols_in_papyrus_pp',
       'name', 'Status', 'Organism', 'Classification', 'Length', 'Sequence',
       'TID', 'family', 'c1', 'c2', 'c3', 'c4', 'c5', 'c1_', 'c1_*', 'orphan'],
      dtype='object')


ValueError: You are trying to merge on int64 and object columns for key 'accession'. If you wish to proceed you should use pd.concat

In [44]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
plt.figure(figsize=(10, 7)) # Adjust figure size for better legend placement if needed
    
sns.set_style('white')


sns.scatterplot(data=emb_df, x=0, y=1, hue='c3', markers='', palette='tab20', linewidth=1, alpha=0.8, edgecolor='k', s=100)
plt.legend(bbox_to_anchor=(1,1), fontsize=20)
plt.xlabel('PC1', fontweight='bold', fontsize=20)
plt.gca().spines['top'].set_visible(True)
plt.gca().spines['right'].set_visible(True)
plt.gca().spines['bottom'].set_visible(True)

plt.gca().spines['left'].set_visible(True)
plt.gca().spines['top'].set_linewidth(2)
plt.gca().spines['right'].set_linewidth(2)
plt.gca().spines['bottom'].set_linewidth(2)
plt.gca().spines['left'].set_linewidth(2)

plt.ylabel('PC2', fontweight='bold', fontsize=20)
plt.xticks([])
plt.yticks([])
plt.show()

ValueError: Could not interpret value `c3` for `hue`. An entry with this name does not appear in `data`.

<Figure size 1000x700 with 0 Axes>

In [15]:
import umap
import umap.aligned_umap
import umap.validation

umap_reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)

# Perform UMAP on the 'embeddings' data
# Assuming 'embeddings' is a variable (e.g., NumPy array) containing the high-dimensional data
# and 'emb_df' is the DataFrame used for the PCA plot, containing the 'c3' column.

# Initialize UMAP. Using n_components=2 for a 2D plot.
# random_state is for reproducibility. Other parameters like n_neighbors and min_dist can be tuned.

# Fit UMAP to the embeddings and transform the data
umap_results = umap_reducer.fit_transform(embeddings)

# Create a pandas DataFrame with the UMAP results
df_umap = pd.DataFrame(data=umap_results, columns=['UMAP1', 'UMAP2'])

# Add the 'c3' column from the PCA plot's DataFrame for coloring
# This assumes that the rows in 'embeddings' correspond to the rows in 'emb_df'
df_umap['c3'] = emb_df['c3'].values

# Create the scatter plot, similar to the PCA plot

# sns.set_style('white') # Already set in the previous cell, but can be re-applied
sns.scatterplot(data=df_umap, x='UMAP1', y='UMAP2', hue='c3', markers='o', palette='tab20')
plt.legend(bbox_to_anchor=(1,1)) # Match PCA plot legend style
plt.xlabel('UMAP1') # Label for UMAP
plt.ylabel('UMAP2') # Label for UMAP

# Apply similar spine styling as the PCA plot
ax = plt.gca()
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)
ax.spines['top'].set_linewidth(2)
ax.spines['right'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ax.spines['left'].set_linewidth(2)

# Apply similar tick styling (empty ticks) as the PCA plot
plt.xticks([])
plt.yticks([])

plt.show()


AttributeError: module 'umap' has no attribute 'UMAP'