# Identificar los mismos ligandos en diferentes familias de proteínas

In [None]:
# Mount Google Drive (Colab-only helper) so that paths under /content/drive
# are available to the cells that read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install Biopython

Collecting Biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m2.4/3.3 MB[0m [31m72.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Biopython
Successfully installed Biopython-1.85


In [None]:
#@title Abrir el dataframe

import os
import pandas as pd

# Folder and file name of the harmonised dataframe without buried entries.
input_folder = "/content/drive/MyDrive/TFM/T1/LIGANDO_ENTERRADO"
input_file = os.path.join(input_folder, "df_final.csv")

# NOTE(review): with its default settings pandas parses the literal string
# "NA" as a missing value. If the CSV stores the PDB sodium ligand ID "NA",
# it will arrive here as NaN; confirm against the file.
df_harm = pd.read_csv(input_file, sep=',')

# Report the number of rows followed by the column index.
print(len(df_harm), df_harm.columns)

83805 Index(['PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id', 'Ligand_id',
       'Ligand_InChi', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas'],
      dtype='object')


In [None]:
#@title Crear diccionario con ligando y clasificación

# Map every ligand ID to the list of distinct protein classifications it is
# found with, keeping both ligands and classifications in order of first
# appearance in df_harm.
#
# groupby() skips rows whose Ligand_id is missing, which is what the former
# `ligando == ligando` NaN test did. Grouping by value also removes the
# dependency on df_harm having a default 0..n-1 index (the previous loop used
# the enumerate() position as a .loc label) and avoids one .loc lookup per row.
familias_por_ligando = (
    df_harm.groupby('Ligand_id', sort=False)['Classification'].unique()
)
grupos_ligandos_familias = {
    ligando: familias.tolist()
    for ligando, familias in familias_por_ligando.items()
}

In [None]:
#@title Crear el dataframe de los grupos ligando-familia proteína

# Flatten the ligand -> classifications mapping into one record per
# (ligand, classification) pair and load the records into a dataframe.
data = [
    {"Ligand_id": ligando, "Classification": clasificacion}
    for ligando, clasificaciones in grupos_ligandos_familias.items()
    for clasificacion in clasificaciones
]

df_grupos = pd.DataFrame(data)
print(df_grupos)

      Ligand_id    Classification
0           AIC  MEMBRANE PROTEIN
1           AIC         HYDROLASE
2           AIC     VIRAL PROTEIN
3           0JM  MEMBRANE PROTEIN
4           1RG  MEMBRANE PROTEIN
...         ...               ...
41252       DZO       TRANSFERASE
41253       G95       TRANSFERASE
41254       G96       TRANSFERASE
41255       G98       TRANSFERASE
41256       VRA       TRANSFERASE

[41257 rows x 2 columns]


In [None]:
#@title Identificar cuantos ligandos hay en total

# Distinct ligand IDs present in the ligand/classification table.
unique = {ligand_id for ligand_id in df_grupos['Ligand_id']}
print(f"En el dataframe hay: {len(unique)} ligandos únicos")

En el dataframe hay: 32219 ligandos únicos


In [None]:
#@title Identificar ligandos unidos a proteínas de más de una familia

# df_grupos holds one row per (ligand, classification) pair, so the number of
# rows per ligand is the number of classifications it appears with.
conteos = df_grupos['Ligand_id'].value_counts()

# Keep the IDs of ligands that appear with more than one classification.
aparece_en_varias = conteos.gt(1)
no_solo = conteos.loc[aparece_en_varias].index.tolist()

print(f"Total ligandos con más de una familia de proteína: {len(no_solo)}")

Total ligandos con más de una familia de proteína: 3363


In [None]:
#@title Crear el dataframe con los ligandos y las familias de proteínas

# Restrict the ligand/classification table to the ligands listed in no_solo
# and display the result as the cell output.
es_multifamilia = df_grupos['Ligand_id'].isin(no_solo)
df_grupos_filtrado = df_grupos.loc[es_multifamilia]
df_grupos_filtrado

Unnamed: 0,Ligand_id,Classification
0,AIC,MEMBRANE PROTEIN
1,AIC,HYDROLASE
2,AIC,VIRAL PROTEIN
4,1RG,MEMBRANE PROTEIN
5,1RG,HYDROLASE
...,...,...
40560,23U,BLOOD CLOTTING
40640,W4A,SPLICING
40641,W4A,TRANSFERASE
40810,8YB,TRANSCRIPTION


In [None]:
#@title Crear el dataframe con ligandos que se unen a más de una familia de proteína

# Ligand IDs kept by the previous filter (IDs repeat, which isin() tolerates).
lista_ligandos = df_grupos_filtrado['Ligand_id']

# Row count before filtering.
print(len(df_harm))

# Keep only the rows of the full dataframe whose ligand is in that list.
ligando_seleccionado = df_harm['Ligand_id'].isin(lista_ligandos)
df_harm_1 = df_harm.loc[ligando_seleccionado]

# Row count after filtering.
print(len(df_harm_1))

83805
47307


In [None]:
#@title Contar el número de entradas del dataframe por ligando

# Number of rows of df_harm_1 per ligand, as a two-column dataframe
# (Ligand_id, Count) ordered from most to least frequent.
df_counts = (
    df_harm_1['Ligand_id']
    .value_counts()
    .rename_axis('Ligand_id')
    .reset_index(name='Count')
)
print(df_counts)

     Ligand_id  Count
0          MPD   1663
1          GDP   1107
2          SAH   1036
3          ANP    766
4          AMP    666
...        ...    ...
3358       RJW      2
3359       G93      2
3360       77X      2
3361       P6U      2
3362       K1A      2

[3363 rows x 2 columns]


In [None]:
#@title Identificar ligandos presentes en más familias de proteínas

# Number of rows of df_grupos_filtrado per ligand, i.e. how many different
# classifications each ligand appears with, as a (Ligand_id, Count) dataframe.
df_counts_diff = (
    df_grupos_filtrado['Ligand_id']
    .value_counts()
    .rename_axis('Ligand_id')
    .reset_index(name='Count')
)
print(df_counts_diff)

     Ligand_id  Count
0          MPD     82
1          IPA     68
2          GDP     64
3          MRD     54
4          ANP     50
...        ...    ...
3358       6OT      2
3359       WN4      2
3360       AXG      2
3361       H0D      2
3362       NVY      2

[3363 rows x 2 columns]


In [None]:
#@title Uniformar nombre de las columnas

# Give each count column a distinct name so both tables can later be merged
# onto the same dataframe without a column-name clash.
df_counts = df_counts.rename({'Count': 'Count_general'}, axis='columns')
df_counts_diff = df_counts_diff.rename({'Count': 'Count_diff'}, axis='columns')

In [None]:
# Attach both per-ligand counts to the ligand/classification table.
#
# The cell reassigns df_grupos_filtrado from itself, so any previously merged
# count columns are dropped first; without that, running the cell a second
# time would produce duplicated columns with _x/_y suffixes.
# validate='many_to_one' makes pandas raise if a count table ever contains a
# ligand more than once, which would silently multiply rows in a left merge.
df_grupos_filtrado = (
    df_grupos_filtrado
    .drop(columns=['Count_general', 'Count_diff'], errors='ignore')
    .merge(df_counts, on='Ligand_id', how='left', validate='many_to_one')
    .merge(df_counts_diff, on='Ligand_id', how='left', validate='many_to_one')
)

In [None]:
!pip install pandarallel

Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill>=0.3.1 (from pandarallel)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.5-py3-none-any.whl size=16674 sha256=bda6243dad6cc66e5d836dcbca2b8eb52a27e5f8dcc743044acaee3b91a7e9a9
  Stored in directory: /root/.cache/pip/wheels/b9/c6/5a/829298789e94348b81af52ab42c19d49da007306bbcc983827
Successfully built pandarallel
Installing collected packages: dill, pandarallel
Successfully installed dill-0.3.9 pandarallel-1.6.5


In [None]:
# Work on an independent copy so that adding columns to df_ligandos does not
# modify df_grupos_filtrado.
df_ligandos = df_grupos_filtrado.copy(deep=True)

In [None]:
#@title Añadir nombre del ligando

import requests
def add_ligand_name(row, timeout=30):
  """Look up the name of a ligand through the RCSB GraphQL endpoint.

  Args:
    row: Mapping-like object (e.g. a dataframe row) exposing the ligand
      identifier under the 'Ligand_id' key.
    timeout: Seconds to wait for the HTTP request before giving up.

  Returns:
    The value of the `name` field in the response, or None when the request
    fails, the status code is not 200, the body is not valid JSON, or the
    response carries no chem_comp entry for the identifier.
  """
  ligand = row['Ligand_id']

  query = """
  query molecule ($id: String!) {
      chem_comp(comp_id:$id){
          chem_comp {
              id
              name
          }
        }
      }
  """

  url = 'https://data.rcsb.org/graphql'
  payload = {"query": query, "variables": {"id": ligand}}

  # A single failed lookup must not abort an apply over thousands of rows, so
  # transport errors are reported as "no name", like a non-200 status already
  # was. The timeout keeps a stalled connection from blocking a worker forever.
  try:
    response = requests.post(url, json=payload, timeout=timeout)
  except requests.RequestException:
    return None

  if response.status_code != 200:
    return None

  try:
    body = response.json()
  except ValueError:
    return None

  # Any level of the answer may be null (for instance when the identifier is
  # not found), so each level is unwrapped defensively instead of indexed.
  entry = (body.get('data') or {}).get('chem_comp') or {}
  details = entry.get('chem_comp') or {}
  return details.get('name')


In [None]:
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=2, progress_bar=True)

# df_ligandos has one row per (ligand, classification) pair, so the same
# ligand ID occurs on several rows. Each distinct ID is looked up once and the
# result is mapped back onto every row, instead of sending one HTTP request
# per row.
ligandos_unicos = df_ligandos[['Ligand_id']].drop_duplicates()
nombres_unicos = ligandos_unicos.parallel_apply(add_ligand_name, axis=1)
nombre_por_ligando = dict(zip(ligandos_unicos['Ligand_id'], nombres_unicos))
df_ligandos['Ligand_name'] = df_ligandos['Ligand_id'].map(nombre_por_ligando)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6201), Label(value='0 / 6201'))), …

In [None]:
import os
# Destination of the ligand/family table.
# The project root was spelled "TMF" here while the input dataframe is read
# from ".../MyDrive/TFM/T1/..."; the spelling now matches the input path.
# NOTE(review): confirm no other notebook reads this file from a "TMF" folder.
output_path = "/content/drive/MyDrive/TFM/T1/GRUPOS_LIGANDOS_FAMILIAS"

# to_csv() does not create missing directories, so make sure the folder exists.
os.makedirs(output_path, exist_ok=True)

output_file = os.path.join(output_path, "df_ligandos_familia.csv")
df_ligandos.to_csv(output_file, sep = ',', index= False)

In [None]:
# Sanity check: number of distinct ligand IDs in the final table.
unique_id = {ligand_id for ligand_id in df_ligandos['Ligand_id']}
print(len(unique_id))

3363
