# SMILES → Tanimoto score computed on all fingerprints used in MolForge
Calcula el Tanimoto coefficient entre dos columnes de **SMILES** fent servir les 15 **fingerprints** utilitzades al paper de MolForge

**Entrada**: CSV amb columnes `SMILES_input` i `SMILES_output_ECFP4`.

**Sortida**: CSV amb columnes `MACCS`, `Avalon`, `RDK4`, `RDK4-L`, `HashAP`, `TT`, `HashTT`, `ECFP0`, `ECFP2`, `ECFP4`, `FCFP2`, `FCFP4`, `AEs`, `ECFP2*`, `ECFP4*`.

## Imports

In [1]:
# Per definir l'arrel del projecte
import os

# Pandas pels dataframes
import pandas as pd
import numpy as np # pels NaN

# Per no mostrar els Warnings de RDKit
from rdkit import RDLogger
from rdkit import DataStructs
RDLogger.DisableLog("rdApp.*")

## Inputs (part a editar)

Arrel del projecte

In [2]:
os.chdir("/export/home/ddiestre/MolForge_Testing")

In [3]:
# Importem la funció smiles_to_fingerprint del nostre sourcecode que utilitza RDKit
from src.smiles_to_fp import smiles_to_fingerprint, get_supported_fingerprints
from src.fingerprints import FpSimilarity

Fingerprints en que transformar els SMILES

In [4]:
fp_type = "ECFP4"

Fitxer de fingerprints preprocessat (path a partir de MolForge_Testing/)

In [5]:
#input_path = "data/MolForge_output/MolForge_MFoutput_2000_ECFP4.csv"
#input_path = "data/MolForge_output/CoCoGraph_MFoutput_2000_novel.csv"
input_path = "data/MolForge_output/CoCoGraph_MFoutput_2000_lt70atoms.csv"

SMILES_in_col_name = "SMILES_input"
SMILES_out_col_name = "SMILES_output_" + fp_type

Fitxer en que guardar l'output (path a partir de MolForge_Testing/)

In [6]:
# Output_path ha de ser una carpeta
#output_path = "data/analysis_output/MolForge_MF_Analysis_2000_ECFP4/"
#output_path = "data/analysis_output/CoCoGraph_MF_Analysis_2000_novel/"
output_path = "data/analysis_output/CoCoGraph_MF_Analysis_2000_lt70atoms/"

# Creem la carpeta de output_path
os.makedirs(output_path, exist_ok=True)

# Columna on guardarem l'output de MolForge en format fp_type (per visualitzar els resultats en format bitvectors)
fp_out_col_name = "fingerprints_output_" + fp_type

## 1. Lectura del fitxer

In [7]:
# Lectura del fitxer
df = pd.read_csv(input_path, sep = ',', index_col = 0)
df.head(5)

Unnamed: 0_level_0,SMILES_input,fingerprints_input_ECFP4,SMILES_output_ECFP4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,O=C(NCCCCc1nc(-c2ccccn2)cs1)[C@@H]1COCCO1,2 80 193 197 221 282 378 422 448 523 552 592 6...,C1COC(CO1)C(=O)NCCCCC2=NC(=CS2)C3=CC=CC=N3
2,O=C(NCCCCn1cnc([N+](=O)[O-])n1)c1ccc(OC(F)(F)F...,8 80 114 145 197 207 265 317 322 378 486 548 5...,C1=CC(=CC=C1C(=O)NCCCCN2C=NC(=N2)[N+](=O)[O-])...
3,O=C(O)C(=O)CC1CCCCC1,2 29 80 223 389 446 484 578 650 715 807 890 92...,C1CCC(CC1)CC(=O)C(=O)CC(=O)C(=O)CC(=O)C(=O)O
4,CC(C)c1ccc([N+](=O)[O-])c(C(C)C)c1N1C(=O)c2c(F...,1 121 146 237 283 314 354 360 437 526 598 633 ...,CC(C)C1=C(C(=C(C=C1)[N+](=O)[O-])C(C)C)N2C(=O)...
5,CC(=O)N1CCc2cc(S(=O)(=O)CCC(=O)N3CCN(c4cccc(C)...,80 231 252 319 350 407 432 481 582 626 650 715...,CC1=C(C(=CC=C1)N2CCN(CC2)C(=O)CCS(=O)(=O)C3=CC...


## 2. Columna SMILES_output → Columna de fingerprints

Aquesta part del codi ens serveix únicament per visualitzar els resultats de MolForge en forma de fingerprints i per identificar les al·lucinacions.

In [8]:
fingerprints = []

# Apliquem el conversor SMILES -> fingerprint
fingerprints = df[SMILES_out_col_name].apply(
    lambda s: smiles_to_fingerprint(s, fp_type=fp_type, n_bits=2048, return_bits=True)
)

# Guardem les fingerprints en el mateix format que l'input de MolForge
df[fp_out_col_name] = fingerprints.apply(
    lambda lst: " ".join(str(x) for x in lst) if isinstance(lst, list) else lst
)

# Visusalitzem el nou dataframe
df.head(5)

Unnamed: 0_level_0,SMILES_input,fingerprints_input_ECFP4,SMILES_output_ECFP4,fingerprints_output_ECFP4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,O=C(NCCCCc1nc(-c2ccccn2)cs1)[C@@H]1COCCO1,2 80 193 197 221 282 378 422 448 523 552 592 6...,C1COC(CO1)C(=O)NCCCCC2=NC(=CS2)C3=CC=CC=N3,2 80 193 197 221 282 378 422 448 523 552 592 6...
2,O=C(NCCCCn1cnc([N+](=O)[O-])n1)c1ccc(OC(F)(F)F...,8 80 114 145 197 207 265 317 322 378 486 548 5...,C1=CC(=CC=C1C(=O)NCCCCN2C=NC(=N2)[N+](=O)[O-])...,8 80 114 145 197 207 265 317 322 378 486 548 5...
3,O=C(O)C(=O)CC1CCCCC1,2 29 80 223 389 446 484 578 650 715 807 890 92...,C1CCC(CC1)CC(=O)C(=O)CC(=O)C(=O)CC(=O)C(=O)O,2 29 59 80 223 389 446 484 550 578 650 715 807...
4,CC(C)c1ccc([N+](=O)[O-])c(C(C)C)c1N1C(=O)c2c(F...,1 121 146 237 283 314 354 360 437 526 598 633 ...,CC(C)C1=C(C(=C(C=C1)[N+](=O)[O-])C(C)C)N2C(=O)...,1 121 146 237 283 314 354 360 437 526 598 633 ...
5,CC(=O)N1CCc2cc(S(=O)(=O)CCC(=O)N3CCN(c4cccc(C)...,80 231 252 319 350 407 432 481 582 626 650 715...,CC1=C(C(=CC=C1)N2CCN(CC2)C(=O)CCS(=O)(=O)C3=CC...,80 231 252 319 350 407 432 481 582 626 650 715...


## 3. Càlcul del Tanimoto coefficient entre cada parell de SMILES per tots els fingerprints

In [9]:
for col in get_supported_fingerprints():
    
    fingerprints_input = []
    # Apliquem el conversor SMILES -> fingerprint
    fingerprints_input = df[SMILES_in_col_name].apply(
        lambda s: smiles_to_fingerprint(s, fp_type=col, n_bits=2048, return_bits=False)
    )

    fingerprints_output = []
    # Apliquem el conversor SMILES -> fingerprint
    fingerprints_output = df[SMILES_out_col_name].apply(
        lambda s: smiles_to_fingerprint(s, fp_type=col, n_bits=2048, return_bits=False)
    )

    # Considerem un SMILES Invàlid com que té Tanimoto 0
    tanimotos_Inv_0 = [
        DataStructs.TanimotoSimilarity(fp_in, fp_out)*100 # multipliquem per 100 per tenir-ho en percentatge
        # guardem 0 en els casos on l'output no és correcte i NaN si l'input és NaN
        if (not pd.isna(fp_out) and fp_out != "InvalidSMILE")
        else (0 if fp_out == "InvalidSMILE" else np.nan)
        for fp_in, fp_out in zip(fingerprints_input, fingerprints_output)
    ]

    # No Considerem els SMILES Invàlids
    tanimotos_Inv_NaN = [
        DataStructs.TanimotoSimilarity(fp_in, fp_out)*100 # multipliquem per 100 per tenir-ho en percentatge
        # guardem NaN en els casos on l'output no és correcte i NaN si l'input és NaN
        if (not pd.isna(fp_out) and fp_out != "InvalidSMILE")
        else (np.nan if fp_out == "InvalidSMILE" else np.nan)
        for fp_in, fp_out in zip(fingerprints_input, fingerprints_output)
    ]

    df[col+"_Inv_0"] = tanimotos_Inv_0
    df[col+"_Inv_NaN"] = tanimotos_Inv_NaN

Guardem el dataframe df complet amb els Tanimotos de cada parell de SMILES

In [10]:
df.to_csv(output_path+"Individual_Tanimotos.csv")
df.head(5)

Unnamed: 0_level_0,SMILES_input,fingerprints_input_ECFP4,SMILES_output_ECFP4,fingerprints_output_ECFP4,MACCS_Inv_0,MACCS_Inv_NaN,Avalon_Inv_0,Avalon_Inv_NaN,RDK4_Inv_0,RDK4_Inv_NaN,...,FCFP2_Inv_0,FCFP2_Inv_NaN,FCFP4_Inv_0,FCFP4_Inv_NaN,AEs_Inv_0,AEs_Inv_NaN,ECFP2*_Inv_0,ECFP2*_Inv_NaN,ECFP4*_Inv_0,ECFP4*_Inv_NaN
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,O=C(NCCCCc1nc(-c2ccccn2)cs1)[C@@H]1COCCO1,2 80 193 197 221 282 378 422 448 523 552 592 6...,C1COC(CO1)C(=O)NCCCCC2=NC(=CS2)C3=CC=CC=N3,2 80 193 197 221 282 378 422 448 523 552 592 6...,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2,O=C(NCCCCn1cnc([N+](=O)[O-])n1)c1ccc(OC(F)(F)F...,8 80 114 145 197 207 265 317 322 378 486 548 5...,C1=CC(=CC=C1C(=O)NCCCCN2C=NC(=N2)[N+](=O)[O-])...,8 80 114 145 197 207 265 317 322 378 486 548 5...,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
3,O=C(O)C(=O)CC1CCCCC1,2 29 80 223 389 446 484 578 650 715 807 890 92...,C1CCC(CC1)CC(=O)C(=O)CC(=O)C(=O)CC(=O)C(=O)O,2 29 59 80 223 389 446 484 550 578 650 715 807...,90.47619,90.47619,50.724638,50.724638,95.454545,95.454545,...,54.545455,54.545455,53.333333,53.333333,54.545455,54.545455,92.857143,92.857143,76.0,76.0
4,CC(C)c1ccc([N+](=O)[O-])c(C(C)C)c1N1C(=O)c2c(F...,1 121 146 237 283 314 354 360 437 526 598 633 ...,CC(C)C1=C(C(=C(C=C1)[N+](=O)[O-])C(C)C)N2C(=O)...,1 121 146 237 283 314 354 360 437 526 598 633 ...,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
5,CC(=O)N1CCc2cc(S(=O)(=O)CCC(=O)N3CCN(c4cccc(C)...,80 231 252 319 350 407 432 481 582 626 650 715...,CC1=C(C(=CC=C1)N2CCN(CC2)C(=O)CCS(=O)(=O)C3=CC...,80 231 252 319 350 407 432 481 582 626 650 715...,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


## 4. Tanimoto promig per Fingerprint emprada per calcular-lo (columnes)

In [11]:
avg_tanimotos_Inv_0 = {col: df[col+"_Inv_0"].mean() for col in get_supported_fingerprints()}
Tanimoto_Inv_0_df = pd.DataFrame([avg_tanimotos_Inv_0], index=[fp_type])

Tanimoto_Inv_0_df

Unnamed: 0,MACCS,Avalon,RDK4,RDK4-L,HashAP,TT,HashTT,ECFP0,ECFP2,ECFP4,FCFP2,FCFP4,AEs,ECFP2*,ECFP4*
ECFP4,98.45479,97.376657,97.722952,97.826797,94.99828,95.417063,95.419267,96.639309,96.283949,95.505415,96.449842,95.865243,96.279684,98.434589,97.024493


In [12]:
avg_tanimotos_Inv_NaN = {col: df[col+"_Inv_NaN"].mean() for col in get_supported_fingerprints()}
Tanimoto_Inv_NaN_df = pd.DataFrame([avg_tanimotos_Inv_NaN], index=[fp_type])

Tanimoto_Inv_NaN_df

Unnamed: 0,MACCS,Avalon,RDK4,RDK4-L,HashAP,TT,HashTT,ECFP0,ECFP2,ECFP4,FCFP2,FCFP4,AEs,ECFP2*,ECFP4*
ECFP4,98.999286,97.915191,98.263401,98.36782,95.52366,95.944759,95.946975,97.173765,96.816439,96.0336,96.98325,96.395418,96.81215,98.978974,97.561079


## 5. Càlcul d'altres paràmetres interessants

### Average tanimoto score

In [13]:
Tanimoto_Inv_0_df["Avg_Tc"] = Tanimoto_Inv_0_df.mean(axis=1)
Tanimoto_Inv_NaN_df["Avg_Tc"] = Tanimoto_Inv_NaN_df.mean(axis=1)

### Percentatge d'al·lucinacions de MolForge

In [14]:
Tanimoto_Inv_0_df["Invalid (%)"] = (df[fp_out_col_name] == "InvalidSMILE").mean() * 100
Tanimoto_Inv_NaN_df["Invalid (%)"] = (df[fp_out_col_name] == "InvalidSMILE").mean() * 100

### Percentatge de string-exacts

In [15]:
Tanimoto_Inv_0_df["String-exacts (%)"] = (Tanimoto_Inv_0_df["Avg_Tc"] == 100).mean() * 100
Tanimoto_Inv_NaN_df["String-exacts (%)"] = (Tanimoto_Inv_NaN_df["Avg_Tc"] == 100).mean() * 100

## 6. Guardar el dataframe dels resultats de l'anàlisi de la Tanimoto score

In [16]:
Tanimoto_Inv_0_df.to_csv(output_path+"Tanimoto_Inv_0.csv")
Tanimoto_Inv_0_df

Unnamed: 0,MACCS,Avalon,RDK4,RDK4-L,HashAP,TT,HashTT,ECFP0,ECFP2,ECFP4,FCFP2,FCFP4,AEs,ECFP2*,ECFP4*,Avg_Tc,Invalid (%),String-exacts (%)
ECFP4,98.45479,97.376657,97.722952,97.826797,94.99828,95.417063,95.419267,96.639309,96.283949,95.505415,96.449842,95.865243,96.279684,98.434589,97.024493,96.646555,0.55,0.0


In [17]:
Tanimoto_Inv_NaN_df.to_csv(output_path+"Tanimoto_Inv_NaN.csv")
Tanimoto_Inv_NaN_df

Unnamed: 0,MACCS,Avalon,RDK4,RDK4-L,HashAP,TT,HashTT,ECFP0,ECFP2,ECFP4,FCFP2,FCFP4,AEs,ECFP2*,ECFP4*,Avg_Tc,Invalid (%),String-exacts (%)
ECFP4,98.999286,97.915191,98.263401,98.36782,95.52366,95.944759,95.946975,97.173765,96.816439,96.0336,96.98325,96.395418,96.81215,98.978974,97.561079,97.181051,0.55,0.0
