# Classyfire

Voy a utilizar la API **Classyfire** para obtener la taxonomia de los ligandos en el dataframe.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title Instalar librerías

!pip install rdkit
!pip install Biopython
!pip install pandarallel

In [None]:
#@title Abrir el dataframe

import os
import pandas as pd

input_folder = "/content/drive/MyDrive/TFM/T2/Comprobar_entorno_proteico_sitios"
input_file = os.path.join(input_folder, "df_final_filtro.csv")
df_harm = pd.read_csv(input_file, sep = ',')
print(df_harm.shape[0],df_harm.columns)

70658 Index(['index', 'PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id',
       'Ligand_id', 'Ligand_InChi', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas', 'Ligand_smiles',
       'Mol_Weight', 'n_atoms', 'Nombre', 'Classification_y', 'Count_general',
       'Count_diff'],
      dtype='object')


In [None]:
#@title Obtener lista SMILES

inchi_key_tot = df_harm['Ligand_InChi']
inchi_key_tot_list = list(set(inchi_key_tot))
print(len(inchi_key_tot_list))

2874


In [None]:
inchi_key_tot_list

In [None]:
inchi_key_tot_list = "/content/drive/MyDrive/TFM/T2/classyfire/lista_icnhi_key.txt"

with open(inchi_key_tot_list, "w", encoding= "utf-8") as f:
  for inchi in inchi_key_tot_list:
    f.write(inchi + "\n")

In [None]:
#@title Definir función para calcular el código InChI

from rdkit import Chem
from rdkit.Chem import inchi

def smiles_to_inchi(row):
    try:
        mol = Chem.MolFromSmiles(row['Ligand_smiles'])
        if mol:
            return inchi.MolToInchi(mol)
        else:
            return None
    except:
        return None


In [None]:
#@title Renombrar columna InChI con InChI_key

df_harm = df_harm.rename(columns={'Ligand_InChi':'Ligand_InChI_key'})

In [None]:
df_harm.columns

Index(['PDB_entry_id', 'Classification_x', 'Organism', 'Uniprot_id',
       'Ligand_id', 'Ligand_InChI_key', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas', 'Ligand_smiles',
       'Mol_Weight', 'n_atoms', 'Nombre', 'Classification_y', 'Count_general',
       'Count_diff'],
      dtype='object')

In [None]:
#@title Aplicar la función al dataframe

from pandarallel import pandarallel

pandarallel.initialize(nb_workers= 2, progress_bar= True)

df_harm['Ligand_InChI'] = df_harm.parallel_apply(smiles_to_inchi, axis = 1)

In [None]:
list_inchi = df_harm['Ligand_InChI'].tolist()

In [None]:
#@title Identificar columnas vacías

if df_harm['Ligand_InChI'].isnull().any():
    print("Hay valores vacíos en la columna")
else:
    print("No hay valores vacíos en la columna")

num_vacios = df_harm['Ligand_InChI'].isnull().sum()
print(f"Hay {num_vacios} valores vacíos en la columna.")

Hay valores vacíos en la columna


In [None]:
filas_vacias = df_harm[df_harm['Ligand_InChI'].isnull()]
print(filas_vacias)


In [None]:
df_harm_copy = df_harm.copy()

In [None]:
#@title Arreglo manual del InChI faltante

import pandas as pd

# Asegurarnos de que no hay espacios extra
df_harm['Ligand_id'] = df_harm['Ligand_id'].str.strip()
df_harm['Ligand_InChI'] = df_harm['Ligand_InChI'].str.strip()

# Detectar InChI vacíos ('' o NaN)
is_inchi_empty = df_harm['Ligand_InChI'].isnull() | (df_harm['Ligand_InChI'] == '')

# Crear máscara precisa
mask = (df_harm['Ligand_id'] == 'FC6') & (is_inchi_empty)

# Solo cambiar esas filas
df_harm.loc[mask, 'Ligand_InChI'] = 'InChI=1S/6CN.Fe/c6*1-2;'



In [None]:
#@title Comprobar si hay valores vacíos

if df_harm['Ligand_InChI'].isnull().any():
    print("Hay valores vacíos en la columna")
else:
    print("No hay valores vacíos en la columna")

No hay valores vacíos en la columna


In [None]:
df_output_path = "/content/drive/MyDrive/TFM/T2/classyfire/df_binana_inchi.csv"
df_harm.to_csv(df_output_path, sep = ',', index = True)

In [None]:
#@title Obtener lista inchi

inchi_tot = df_harm_copy['Ligand_InChi'].tolist()
inchi_tot_list = list(set(inchi_tot))
print(len(inchi_tot_list))

In [None]:
#@title Abrir el dataframe

import os
import pandas as pd

input_folder = "/content/drive/MyDrive/TFM/T2/classyfire"
input_file = os.path.join(input_folder, "df_binana_inchi.csv")
df_harm = pd.read_csv(input_file, sep = ',')
print(df_harm.shape[0],df_harm.columns)

71001 Index(['Unnamed: 0', 'PDB_entry_id', 'Classification_x', 'Organism',
       'Uniprot_id', 'Ligand_id', 'Ligand_InChI_key', 'Experimental_method',
       'Resolution', 'Adding_Classification', 'Affinity', 'Coordenadas',
       'Ligand_smiles', 'Mol_Weight', 'n_atoms', 'Nombre', 'Classification_y',
       'Count_general', 'Count_diff', 'Ligand_InChI'],
      dtype='object')


In [None]:
inchikey = df_harm['Ligand_InChI_key'].tolist()

In [None]:
smiles = df_harm['Ligand_smiles'].tolist()

In [None]:
#smiles

In [None]:
!git clone https://github.com/JamesJeffryes/pyclassyfire.git
%cd pyclassyfire
!pip install .


In [None]:
#@title Intentar el código con un ejemplo

from pyclassyfire import client
import time
import json

# Define el SMILES
smiles = "CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=O)N2[C@H]1C(O)=O"

# Envía la consulta
query_id = client.structure_query(smiles, label="ejemplo_smiles")

# Espera a que la clasificación esté lista
while True:
    result = client.get_results(query_id)
    result_json = json.loads(result)
    if result_json.get("classification_status") == "Done":
        break
    time.sleep(5)

# Muestra la clasificación
print(json.dumps(result_json, indent=2))


In [None]:
entity = result_json['entities'][0]

kingdom    = entity.get('kingdom', {}).get('name', 'NA')
superclass = entity.get('superclass', {}).get('name', 'NA')
_class     = entity.get('class', {}).get('name', 'NA')
subclass   = entity.get('subclass', {}).get('name', 'NA')

print("Kingdom:", kingdom)
print("Superclass:", superclass)
print("Class:", _class)
print("Subclass:", subclass)


Kingdom: Organic compounds
Superclass: Organoheterocyclic compounds
Class: Lactams
Subclass: Beta lactams


In [None]:
#@title Definir el dataframe con los ligandos únicos para realizar la clasificación

df = df_harm[['Ligand_id','Ligand_smiles']]
df = df.drop_duplicates(subset='Ligand_smiles').reset_index(drop=True)
df.shape[0]

In [None]:
#@title Aplicar Classyfire sobre el dataframe. Primer intento sobre los SMILES

# Importar librerías
from pyclassyfire import client
import pandas as pd
import time
import json
import os
import requests

# === Configuración ===
output_dir = '/content/drive/MyDrive/TFM/T2/classyfire/por_bloques'
os.makedirs(output_dir, exist_ok=True)

BLOCK_SIZE = 50  # más pequeño para reducir riesgo de bloqueo
WAIT_BETWEEN_QUERIES = 6  # segundos entre compuestos
WAIT_ON_429 = 90          # esperar si hay 429
WAIT_BETWEEN_BLOCKS = 90  # segundos entre bloques

# Cargar SMILES únicos
df_all = df[['Ligand_smiles']].drop_duplicates().reset_index(drop=True)
total = len(df_all)

# Control de errores
errores = []

# Clasificar por bloques
start_idx = 0
block_num = 1

while start_idx < total:
    df_block = df_all.iloc[start_idx:start_idx+BLOCK_SIZE].copy()
    df_block['Kingdom'] = pd.NA
    df_block['Superclass'] = pd.NA
    df_block['Class'] = pd.NA
    df_block['Subclass'] = pd.NA

    for i, row in df_block.iterrows():
        smiles = row['Ligand_smiles']
        print(f"[Block {block_num}] Clasificando: {smiles}")

        # Enviar consulta con control de errores
        try:
            query_id = client.structure_query(smiles, label=f"ligando_{start_idx + i}")
            time.sleep(WAIT_BETWEEN_QUERIES)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                print("⏸️ Esperando por límite de solicitudes (429)...")
                time.sleep(WAIT_ON_429)
                errores.append(smiles)
                continue
            else:
                print(f"❌ Otro error HTTP: {e}")
                errores.append(smiles)
                continue
        except Exception as e:
            print(f"❌ Error general con {smiles}: {e}")
            errores.append(smiles)
            continue

        # Esperar clasificación con validación
        success = False
        for _ in range(30):  # intenta durante ~1 minuto
            try:
                result = client.get_results(query_id)
                result_json = json.loads(result)
                if result_json.get("classification_status") == "Done":
                    success = True
                    break
                else:
                    print(f"⏳ Esperando: {result_json.get('classification_status')}")
            except Exception as e:
                print(f"⚠️ Error al obtener resultado: {e}")
            time.sleep(2)

        if not success:
            print(f"❌ No se obtuvo clasificación para: {smiles}")
            errores.append(smiles)
            continue

        # Extraer resultados desde 'entities' con validación de 'None'
        if 'entities' in result_json and len(result_json['entities']) > 0:
            entity = result_json['entities'][0]
            df_block.at[i, 'Kingdom']    = entity.get('kingdom', {}).get('name', 'NA') if entity.get('kingdom') else 'NA'
            df_block.at[i, 'Superclass'] = entity.get('superclass', {}).get('name', 'NA') if entity.get('superclass') else 'NA'
            df_block.at[i, 'Class']      = entity.get('class', {}).get('name', 'NA') if entity.get('class') else 'NA'
            df_block.at[i, 'Subclass']   = entity.get('subclass', {}).get('name', 'NA') if entity.get('subclass') else 'NA'
        else:
            print(f"⚠️ No se encontró información en 'entities' para: {smiles}")
            errores.append(smiles)
            continue

    # Guardar bloque
    out_file = os.path.join(output_dir, f'classyfire_block_{block_num}.csv')
    df_block.to_csv(out_file, index=False)
    print(f"💾 Guardado: {out_file}")

    start_idx += BLOCK_SIZE
    block_num += 1

    # Esperar entre bloques
    print(f"🛑 Esperando {WAIT_BETWEEN_BLOCKS} segundos antes del siguiente bloque...")
    time.sleep(WAIT_BETWEEN_BLOCKS)

# Guardar errores
if errores:
    with open(os.path.join(output_dir, "errores_smiles.txt"), 'w') as f:
        for s in errores:
            f.write(s + "\n")
    print(f"❗ Se guardaron {len(errores)} SMILES fallidos para revisar o reintentar luego.")
else:
    print("✅ Todos los SMILES fueron procesados sin errores 429 permanentes.")

In [None]:
#@title Aplicar Classyfire sobre el dataframe. Intento sobre los SMILES, saltar los ya calculados

from pyclassyfire import client
import pandas as pd
import time
import json
import os
import requests
from tqdm.notebook import tqdm


# === Configuración ===
output_dir = '/content/drive/MyDrive/TFM/T2/classyfire/por_bloques'
os.makedirs(output_dir, exist_ok=True)

BLOCK_SIZE = 50
WAIT_BETWEEN_QUERIES = 6
WAIT_ON_429 = 90
WAIT_BETWEEN_BLOCKS = 90

# Cargar SMILES únicos
df_all = df[['Ligand_smiles']].drop_duplicates().reset_index(drop=True)
total = len(df_all)

# Control de errores y progreso
errores = []
progresados = []

# Clasificar por bloques
start_idx = 0
block_num = 1

# Progreso visual con tqdm
with tqdm(total=total, desc="🔄 Progreso total", unit="molécula") as pbar:
    while start_idx < total:
        out_file = os.path.join(output_dir, f'classyfire_block_{block_num}.csv')

        if os.path.exists(out_file):
            print(f"✅ Bloque {block_num} ya existe. Saltando...")
            start_idx += BLOCK_SIZE
            block_num += 1
            pbar.update(min(BLOCK_SIZE, total - pbar.n))
            continue

        df_block = df_all.iloc[start_idx:start_idx+BLOCK_SIZE].copy()
        df_block['Kingdom'] = pd.NA
        df_block['Superclass'] = pd.NA
        df_block['Class'] = pd.NA
        df_block['Subclass'] = pd.NA

        for i, row in df_block.iterrows():
            smiles = row['Ligand_smiles']
            print(f"[Block {block_num}] Clasificando: {smiles}")

            try:
                query_id = client.structure_query(smiles, label=f"ligando_{start_idx + i}")
                time.sleep(WAIT_BETWEEN_QUERIES)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    print("⏸️ Esperando por límite de solicitudes (429)...")
                    time.sleep(WAIT_ON_429)
                    errores.append(smiles)
                    continue
                else:
                    print(f"❌ Otro error HTTP: {e}")
                    errores.append(smiles)
                    continue
            except Exception as e:
                print(f"❌ Error general con {smiles}: {e}")
                errores.append(smiles)
                continue

            # Esperar resultado
            success = False
            for _ in range(30):
                try:
                    result = client.get_results(query_id)
                    result_json = json.loads(result)
                    if result_json.get("classification_status") == "Done":
                        success = True
                        break
                    else:
                        print(f"⏳ Esperando: {result_json.get('classification_status')}")
                except Exception as e:
                    print(f"⚠️ Error al obtener resultado: {e}")
                time.sleep(2)

            if not success:
                print(f"❌ No se obtuvo clasificación para: {smiles}")
                errores.append(smiles)
                continue

            if 'entities' in result_json and len(result_json['entities']) > 0:
                entity = result_json['entities'][0]
                df_block.at[i, 'Kingdom']    = entity.get('kingdom', {}).get('name', 'NA') if entity.get('kingdom') else 'NA'
                df_block.at[i, 'Superclass'] = entity.get('superclass', {}).get('name', 'NA') if entity.get('superclass') else 'NA'
                df_block.at[i, 'Class']      = entity.get('class', {}).get('name', 'NA') if entity.get('class') else 'NA'
                df_block.at[i, 'Subclass']   = entity.get('subclass', {}).get('name', 'NA') if entity.get('subclass') else 'NA'
                progresados.append(start_idx + i)
                pbar.update(1)
            else:
                print(f"⚠️ No se encontró información en 'entities' para: {smiles}")
                errores.append(smiles)
                continue

        df_block.to_csv(out_file, index=False)
        print(f"💾 Guardado: {out_file}")

        start_idx += BLOCK_SIZE
        block_num += 1

        print(f"🛑 Esperando {WAIT_BETWEEN_BLOCKS} segundos antes del siguiente bloque...")
        time.sleep(WAIT_BETWEEN_BLOCKS)

# Guardar errores
if errores:
    with open(os.path.join(output_dir, "errores_smiles.txt"), 'w') as f:
        for s in errores:
            f.write(s + "\n")
    print(f"❗ Se guardaron {len(errores)} SMILES fallidos.")
else:
    print("✅ Todos los SMILES fueron procesados correctamente.")

In [None]:
#@title Unir los bloques

import os
import pandas as pd

input_dir = '/content/drive/MyDrive/TFM/T2/classyfire/por_bloques'
output_file = '/content/drive/MyDrive/TFM/T2/classyfire/classyfire_unificado.csv'

dfs = []
for filename in sorted(os.listdir(input_dir)):
    if filename.endswith('.csv') and filename.startswith('classyfire_block_'):
        filepath = os.path.join(input_dir, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)

df_total = pd.concat(dfs, ignore_index=True)

df_total.to_csv(output_file, index=False)


In [None]:
df_total = pd.read_csv('/content/drive/MyDrive/TFM/T2/classyfire/classyfire_unificado.csv', sep = ',')
df_total.shape[0]

2869

In [None]:
#@title Obtener lista SMILES

smiles_tot = df_harm['Ligand_smiles']
smiles_tot_list = list(set(smiles_tot))
print(len(smiles_tot_list))

2865


In [None]:
df_total = df_total[df_total['Ligand_smiles'].isin(smiles_tot_list)]
df_total.shape[0]

2865

In [None]:
list_1 =df_total['Superclass'].tolist()

In [None]:
unique = list(set(list_1))

In [None]:
unique = [i for i in unique if i]
print(unique)

[nan, 'Nucleosides, nucleotides, and analogues', 'Alkaloids and derivatives', 'Organic acids and derivatives', 'Organic 1,3-dipolar compounds', 'Organic nitrogen compounds', 'Organoheterocyclic compounds', 'Organosulfur compounds', 'Organic salts', 'Lignans, neolignans and related compounds', 'Benzenoids', 'Hydrocarbons', 'Hydrocarbon derivatives', 'Organophosphorus compounds', 'Organometallic compounds', 'Organohalogen compounds', 'Organic oxygen compounds', 'Phenylpropanoids and polyketides', 'Lipids and lipid-like molecules']


In [None]:
df_total.head()

Unnamed: 0,Ligand_smiles,Kingdom,Superclass,Class,Subclass
0,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Organic compounds,Organoheterocyclic compounds,Lactams,Beta lactams
1,NCc1c[nH]c2N=C(N)NC(=O)c12,Organic compounds,Organoheterocyclic compounds,Pyrrolopyrimidines,"Pyrrolo[2,3-d]pyrimidines"
2,O[C@H]1C[C@@H](O[C@@H]1CO[P](O)(=O)N[P](O)(=O)...,Organic compounds,"Nucleosides, nucleotides, and analogues",Pyrimidine nucleosides,Pyrimidine 2'-deoxyribonucleosides
3,Cc1cc2NC3=C(NC(=O)NC3=O)N(C[C@H](O)[C@H](O)[C@...,Organic compounds,"Nucleosides, nucleotides, and analogues",Flavin nucleotides,
4,NCCc1c[nH]c2ccccc12,Organic compounds,Organoheterocyclic compounds,Indoles and derivatives,Tryptamines and derivatives


In [None]:
df_total_1 = df_total.merge(df_harm[['Ligand_smiles', 'Ligand_id']], on="Ligand_smiles", how="left")


In [None]:
df_total_1.head()

Unnamed: 0,Ligand_smiles,Kingdom,Superclass,Class,Subclass,Ligand_id
0,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Organic compounds,Organoheterocyclic compounds,Lactams,Beta lactams,AIC
1,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Organic compounds,Organoheterocyclic compounds,Lactams,Beta lactams,AIC
2,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Organic compounds,Organoheterocyclic compounds,Lactams,Beta lactams,AIC
3,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Organic compounds,Organoheterocyclic compounds,Lactams,Beta lactams,AIC
4,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Organic compounds,Organoheterocyclic compounds,Lactams,Beta lactams,AIC


In [None]:
#@title Comprobar faltas

df_total['Kingdom'].isna().sum()

np.int64(238)

In [None]:
#@title Comprobar faltas

df_total['Superclass'].isna().sum()

np.int64(238)

In [None]:
df_class = df_total[['Ligand_smiles','Superclass','Class']]

In [None]:
df_class.head()

Unnamed: 0,Ligand_smiles,Superclass,Class
0,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,Organoheterocyclic compounds,Lactams
1,NCc1c[nH]c2N=C(N)NC(=O)c12,Organoheterocyclic compounds,Pyrrolopyrimidines
2,O[C@H]1C[C@@H](O[C@@H]1CO[P](O)(=O)N[P](O)(=O)...,"Nucleosides, nucleotides, and analogues",Pyrimidine nucleosides
3,Cc1cc2NC3=C(NC(=O)NC3=O)N(C[C@H](O)[C@H](O)[C@...,"Nucleosides, nucleotides, and analogues",Flavin nucleotides
4,NCCc1c[nH]c2ccccc12,Organoheterocyclic compounds,Indoles and derivatives


In [None]:
df_combinado = df_harm.merge(df_class, on="Ligand_smiles", how="left")

In [None]:
df_combinado.head()

In [None]:
df_combinado.columns

Index(['index', 'PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id',
       'Ligand_id', 'Ligand_InChI_key', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas', 'Ligand_smiles',
       'Mol_Weight', 'n_atoms', 'Nombre', 'Classification_y', 'Count_general',
       'Count_diff', 'Superclass', 'Class'],
      dtype='object')

In [None]:
df_combinado = df_combinado[['PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id',
       'Ligand_id', 'Ligand_InChI_key', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas', 'Ligand_smiles',
       'Mol_Weight', 'n_atoms', 'Nombre','Count_general',
       'Count_diff', 'Superclass', 'Class']]

In [None]:
df_combinado['Superclass'].value_counts()

Unnamed: 0_level_0,count
Superclass,Unnamed: 1_level_1
"Nucleosides, nucleotides, and analogues",16099
Lipids and lipid-like molecules,11896
Organoheterocyclic compounds,10700
Organic oxygen compounds,9792
Organic acids and derivatives,6557
Benzenoids,4797
Organic nitrogen compounds,2968
Hydrocarbons,1219
Phenylpropanoids and polyketides,1132
Organosulfur compounds,236


In [None]:
#@title Definir la función para la asignación de la clase

def Class_ligand_definition(row):
  Superclass = row['Superclass']
  Class = row['Class']
  if Superclass == "Lipids and lipid-like molecules":
    Superclass = Class
  return Superclass

In [None]:
#@title Aplicar la función al dataframe

from pandarallel import pandarallel

pandarallel.initialize(nb_workers= 2, progress_bar= True)

df_combinado['Ligand_Class'] = df_combinado.parallel_apply(Class_ligand_definition, axis = 1)

In [None]:
#@title Identificar si hay faltas

df_combinado['Superclass'].isna().sum()

np.int64(4987)

In [None]:
df_classyfire_comb = df_combinado[['PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id', 'Ligand_id',
       'Ligand_InChI_key', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas', 'Ligand_smiles',
       'Mol_Weight', 'n_atoms', 'Nombre', 'Count_general', 'Count_diff',
       'Ligand_Class']]

In [None]:
filas_vacias = df_classyfire_comb[df_classyfire_comb['Ligand_Class'].isna()]
print(f"Faltan clasificación para {filas_vacias.shape[0]} instancias")
ligand_vacios = filas_vacias['Ligand_id'].tolist()
ligand_vacios_unique = list(set(ligand_vacios))
print(f"Las clasificaciones faltantes corresponden a {len(ligand_vacios_unique} ligandos"))
filas_vacias_unique = filas_vacias.drop_duplicates(subset="Ligand_id").reset_index(drop=True)
print(filas_vacias_unique.shape[0])
filas_vacias_unique.head()

4987

Se intenta calcular la clasificación a partir del código InChI en lugar que del código SMILES

Primero hay que obtener el código InChI.

In [None]:
#@title Definir función para calcular el código InChI

from rdkit import Chem
from rdkit.Chem import inchi

def smiles_to_inchi(row):
    try:
        mol = Chem.MolFromSmiles(row['Ligand_smiles'])
        if mol:
            return inchi.MolToInchi(mol)
        else:
            return None
    except:
        return None

In [None]:
filas_vacias_unique['INCHI_CODE'] = filas_vacias_unique.parallel_apply(smiles_to_inchi, axis = 1)

In [None]:
filas_vacias_unique['INCHI_CODE'].isna().sum()

np.int64(1)

In [None]:
filas_vacias_unique.head()

Unnamed: 0,PDB_entry_id,Classification,Organism,Uniprot_id,Ligand_id,Ligand_InChI_key,Experimental_method,Resolution,Adding_Classification,Affinity,Coordenadas,Ligand_smiles,Mol_Weight,n_atoms,Nombre,Count_general,Count_diff,Ligand_Class,INCHI_CODE
0,4GHE,OXIDOREDUCTASE,Brevibacterium fuscum,Q45135,4NC,XJNPNXSISMKQEX-UHFFFAOYSA-N,X-RAY DIFFRACTION,1.6,,,"[100.0, 0, 'C', 403]",Oc1ccc(cc1O)[N+]([O-])=O,155.109,11.0,4GHE_4NC_C_403,27,4,,InChI=1S/C6H5NO4/c8-5-2-1-4(7(10)11)3-6(5)9/h1...
1,4GMD,TRANSFERASE,Pseudomonas aeruginosa PAO1,Q9HZN8,ATM,OIFWQOKDSPDILA-XLPZGREQSA-N,X-RAY DIFFRACTION,1.98,INHIBITOR,,"[94.46, 0, 'A', 301]",CC1=CN([C@H]2C[C@H](N=[N+]=[N-])[C@@H](CO[P](O...,347.224,23.0,4GMD_ATM_A_301,16,3,,InChI=1S/C10H14N5O7P/c1-5-3-15(10(17)12-9(5)16...
2,4GP4,OXIDOREDUCTASE,Thermus thermophilus HB8,Q5SJ79,HAS,PDYODZVCODUKFH-ZOMLSHGTSA-L,X-RAY DIFFRACTION,2.8,,,"[98.83, 0, 'A', 603]",Cc1c2n3c(c1C=C)C=C4C(=C(C5=CC6=[N]7[Fe]3(N45)[...,920.973,65.0,4GP4_HAS_A_603,13,2,,
3,4GPC,OXIDOREDUCTASE,Corynebacterium diphtheriae,Q54AI1,BLA,GWZYPXHJIZCRAJ-SRVCBVSDSA-N,X-RAY DIFFRACTION,1.85,,,"[83.6, 0, 'A', 901]",Cc1c([nH]c(/C=C/2N=C(\C=C\3NC(=O)C(=C3C=C)C)C(...,582.657,43.0,4GPC_BLA_A_901,88,10,,InChI=1S/C33H34N4O6/c1-7-20-19(6)32(42)37-27(2...
4,4GQC,OXIDOREDUCTASE,Aeropyrum pernix K1,Q9YA14,DTD,YPGMOWHXEQDBBV-IMJSIDKUSA-N,X-RAY DIFFRACTION,2.0,,,"[99.55, 0, 'C', 202]",O[C@H]1CSSC[C@@H]1O,152.24,8.0,4GQC_DTD_C_202,41,7,,"InChI=1S/C4H8O2S2/c5-3-1-7-8-2-4(3)6/h3-6H,1-2..."


In [None]:
#@title Aplicar Classyfire sobre el dataframe. Intento sobre los InChI, saltar los ya calculados

from pyclassyfire import client
import pandas as pd
import time
import json
import os
import requests
from tqdm.notebook import tqdm

# === Configuración ===
output_dir = '/content/drive/MyDrive/TFM/T2/classyfire/por_bloques_inchi'
os.makedirs(output_dir, exist_ok=True)

BLOCK_SIZE = 50
WAIT_BETWEEN_QUERIES = 6
WAIT_ON_429 = 90
WAIT_BETWEEN_BLOCKS = 90

# Cargar InChI únicos
df_all = filas_vacias_unique[['INCHI_CODE']].drop_duplicates().reset_index(drop=True)
total = len(df_all)

# Control de errores y progreso
errores = []
progresados = []

# Clasificar por bloques
start_idx = 0
block_num = 1

# Progreso visual con tqdm
with tqdm(total=total, desc="🔄 Progreso total (InChI)", unit="molécula") as pbar:
    while start_idx < total:
        out_file = os.path.join(output_dir, f'classyfire_block_inchi_{block_num}.csv')

        if os.path.exists(out_file):
            print(f"✅ Bloque {block_num} ya existe. Saltando...")
            start_idx += BLOCK_SIZE
            block_num += 1
            pbar.update(min(BLOCK_SIZE, total - pbar.n))
            continue

        df_block = df_all.iloc[start_idx:start_idx+BLOCK_SIZE].copy()
        df_block['Kingdom'] = pd.NA
        df_block['Superclass'] = pd.NA
        df_block['Class'] = pd.NA
        df_block['Subclass'] = pd.NA

        for i, row in df_block.iterrows():
            inchi = row['INCHI_CODE']
            print(f"[Block {block_num}] Clasificando InChI: {inchi}")

            try:
                query_id = client.structure_query(inchi, label=f"ligando_{start_idx + i}")
                time.sleep(WAIT_BETWEEN_QUERIES)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    print("⏸️ Esperando por límite de solicitudes (429)...")
                    time.sleep(WAIT_ON_429)
                    errores.append(inchi)
                    continue
                else:
                    print(f"❌ Otro error HTTP: {e}")
                    errores.append(inchi)
                    continue
            except Exception as e:
                print(f"❌ Error general con {inchi}: {e}")
                errores.append(inchi)
                continue

            # Esperar resultado
            success = False
            for _ in range(30):
                try:
                    result = client.get_results(query_id)
                    result_json = json.loads(result)
                    if result_json.get("classification_status") == "Done":
                        success = True
                        break
                    else:
                        print(f"⏳ Esperando: {result_json.get('classification_status')}")
                except Exception as e:
                    print(f"⚠️ Error al obtener resultado: {e}")
                time.sleep(2)

            if not success:
                print(f"❌ No se obtuvo clasificación para: {inchi}")
                errores.append(inchi)
                continue

            if 'entities' in result_json and len(result_json['entities']) > 0:
                entity = result_json['entities'][0]
                df_block.at[i, 'Kingdom']    = entity.get('kingdom', {}).get('name', 'NA') if entity.get('kingdom') else 'NA'
                df_block.at[i, 'Superclass'] = entity.get('superclass', {}).get('name', 'NA') if entity.get('superclass') else 'NA'
                df_block.at[i, 'Class']      = entity.get('class', {}).get('name', 'NA') if entity.get('class') else 'NA'
                df_block.at[i, 'Subclass']   = entity.get('subclass', {}).get('name', 'NA') if entity.get('subclass') else 'NA'
                progresados.append(start_idx + i)
                pbar.update(1)
            else:
                print(f"⚠️ No se encontró información en 'entities' para: {inchi}")
                errores.append(inchi)
                continue

        df_block.to_csv(out_file, index=False)
        print(f"💾 Guardado: {out_file}")

        start_idx += BLOCK_SIZE
        block_num += 1

        print(f"🛑 Esperando {WAIT_BETWEEN_BLOCKS} segundos antes del siguiente bloque...")
        time.sleep(WAIT_BETWEEN_BLOCKS)

# Guardar errores
if errores:
    with open(os.path.join(output_dir, "errores_inchi.txt"), 'w') as f:
        for s in errores:
            f.write(s + "\n")
    print(f"❗ Se guardaron {len(errores)} InChI fallidos.")
else:
    print("✅ Todos los InChI fueron procesados correctamente.")


In [None]:
#@title Unir los bloques

import os
import pandas as pd

input_dir = '/content/drive/MyDrive/TFM/T2/classyfire/por_bloques_inchi'
output_file = '/content/drive/MyDrive/TFM/T2/classyfire/por_bloques_inchi/classyfire_inchi_unificado.csv'

dfs = []
for filename in sorted(os.listdir(input_dir)):
    if filename.endswith('.csv') and filename.startswith('classyfire_block_'):
        filepath = os.path.join(input_dir, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)

df_total_inchi = pd.concat(dfs, ignore_index=True)

df_total_inchi.to_csv(output_file, index=False)


In [None]:
df_total_inchi = pd.read_csv("/content/drive/MyDrive/TFM/T2/classyfire/por_bloques_inchi/classyfire_inchi_unificado.csv", sep = ',')

In [None]:
df_total_inchi.head()

Unnamed: 0,INCHI_CODE,Kingdom,Superclass,Class,Subclass
0,InChI=1S/C6H5NO4/c8-5-2-1-4(7(10)11)3-6(5)9/h1...,Organic compounds,Benzenoids,Phenols,Nitrophenols
1,InChI=1S/C10H14N5O7P/c1-5-3-15(10(17)12-9(5)16...,Organic compounds,Organoheterocyclic compounds,Diazines,Pyrimidines and pyrimidine derivatives
2,,,,,
3,InChI=1S/C33H34N4O6/c1-7-20-19(6)32(42)37-27(2...,Organic compounds,Organoheterocyclic compounds,Tetrapyrroles and derivatives,Bilirubins
4,"InChI=1S/C4H8O2S2/c5-3-1-7-8-2-4(3)6/h3-6H,1-2...",Organic compounds,Organoheterocyclic compounds,Dithianes,


In [None]:
#@title Unir los resultados

df_total_2 = df_total_inchi.merge(filas_vacias_unique[['INCHI_CODE', 'Ligand_id']], on="INCHI_CODE", how="left")
df_total_1 = df_total.merge(df_harm[['Ligand_smiles', 'Ligand_id']], on="Ligand_smiles", how="left")
df_total_3 = pd.concat([df_total_1,df_total_2], ignore_index= True)
df_total_3.head()

In [None]:
df_total_4 = df_total_3.drop_duplicates(subset="Ligand_id").reset_index(drop=True)
df_combinado_1 = df_harm.merge(df_total_4, on="Ligand_id", how="left")

In [None]:
from pandarallel import pandarallel

pandarallel.initialize(nb_workers= 2, progress_bar= True)

df_combinado_1['Ligand_Class'] = df_combinado_1.parallel_apply(Class_ligand_definition, axis = 1)