In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen

# Read the BindingDB export of NRAS ligands and show which columns it provides.
ligands_csv = '../Data/NRAS_ligands.csv'
df = pd.read_csv(ligands_csv)
print(df.columns)


Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name',
       'Target Source Organism According to Curator or DataSource',
       'IC50 (nM)'],
      dtype='object')


In [9]:
# Compute RDKit descriptors for every ligand whose SMILES can be parsed and
# store them, together with the reported IC50, in a new table.
features = []
n_skipped = 0  # rows dropped because the SMILES was missing or could not be parsed

# Only two columns are needed, so iterate over them directly instead of
# building a Series per row with iterrows().
for smiles, ic50 in zip(df['Ligand SMILES'], df['IC50 (nM)']):
    # Empty CSV cells are read by pandas as NaN (a float) and MolFromSmiles
    # only accepts strings, so treat any non-string value as unparsable.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    if mol is None:
        # For invalid SMILES, RDKit has already logged the reason
        # (e.g. an explicit-valence error).
        n_skipped += 1
        continue

    # One record per parsable ligand; order must match the column list below.
    features.append([
        smiles,
        Chem.rdMolDescriptors.CalcMolFormula(mol),
        # NOTE(review): ExactMolWt is the exact (isotope-specific) mass, not the
        # average molecular weight, yet the column is labelled 'Peso Molecular'.
        Descriptors.ExactMolWt(mol),
        Crippen.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumRotatableBonds(mol),
        ic50,  # kept exactly as read from the CSV; not converted here
    ])

if n_skipped:
    print(f"Skipped {n_skipped} ligands with missing or invalid SMILES")

# Build a data frame from the extracted records.
df_features = pd.DataFrame(features, columns=['SMILES','Formula', 'Peso Molecular', 'LogP', "H-bond donor","H-bond acceptor","TPSA","Rotatable bonds", "IC50"])

# Save the data frame to a new CSV file.
df_features.to_csv('../Data/NRAS_ligands_features.csv', index=False)

[19:21:24] Explicit valence for atom # 27 N, 4, is greater than permitted
[19:21:24] Explicit valence for atom # 27 N, 4, is greater than permitted
[19:21:25] Explicit valence for atom # 27 N, 4, is greater than permitted
[19:21:26] Explicit valence for atom # 27 N, 4, is greater than permitted


In [10]:
# Report the number of rows, then preview the first five.
n_rows = len(df_features)
preview = df_features.head()
print(n_rows)
print(preview)

1831
                                              SMILES     Formula  \
0    COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cn[nH]c1   C20H21N7O   
1      COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cccnc1   C22H22N6O   
2      COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1ccncc1   C22H22N6O   
3  Cc1n[nH]c(C)c1-c1cn2c(cnc2cc1CO)-c1cccc(NC2CCN...   C22H25N7O   
4  COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cnn(CCN...  C26H32N8O2   

   Peso Molecular     LogP  H-bond donor  H-bond acceptor    TPSA  \
0      375.180758  2.56880             3                7   92.16   
1      386.185509  3.24070             2                7   76.37   
2      386.185509  3.24070             2                7   76.37   
3      403.212058  2.66934             4                7  103.16   
4      488.264822  2.37440             2               10   93.77   

   Rotatable bonds    IC50  
0                5   33170  
1                5    9920  
2                5    1830  
3                5   15720  
4                8   15720

In [15]:
# Keep a single row per distinct SMILES string (the first occurrence wins).
is_repeat = df_features.duplicated(subset=['SMILES'])
df_sin_duplicados_col1 = df_features[~is_repeat]
print(len(df_sin_duplicados_col1))

# Persist the de-duplicated table.
df_sin_duplicados_col1.to_csv('../Data/SMILES_NRAS_ligands_features.csv', index=False)

# Also save the first 50 unique ligands as a small set for clustering trials.
df_50 = df_sin_duplicados_col1.iloc[:50]
print(len(df_50))
df_50.to_csv('../Data/50_NRASligands.csv', index=False)


653
50


In [24]:
# How many molecules are in the descriptor table?
total_mol = df_features['Formula'].shape[0]
print(total_mol)

1831


In [25]:
# Descriptive statistics for the variables obtained with RDKit.
# NOTE: IC50 does not show up in this summary.
summary = df_features.describe()
print(summary)



       Peso Molecular         LogP  H-bond donor  H-bond acceptor  \
count     1831.000000  1831.000000   1831.000000      1831.000000   
mean       932.066646     5.862662      2.255052        11.759694   
std        101.814032     0.914444      0.504564         1.291051   
min        299.093773     0.510500      2.000000         5.000000   
25%        909.457116     5.450400      2.000000        11.000000   
50%        937.488416     5.920500      2.000000        12.000000   
75%        977.519717     6.389300      2.000000        13.000000   
max       1154.630408     8.069200      6.000000        16.000000   

              TPSA  Rotatable bonds  
count  1831.000000      1831.000000  
mean    168.357351         9.841617  
std      18.029557         1.385339  
min      54.250000         3.000000  
25%     160.460000         9.000000  
50%     171.540000        10.000000  
75%     177.950000        11.000000  
max     254.790000        14.000000  


In [26]:
nombre_columna = 'IC50'

# Probe whether every value in the column parses as a number: a strict
# conversion raises ValueError as soon as one value does not.
try:
    pd.to_numeric(df_features[nombre_columna])
except ValueError:
    es_numerica = False
else:
    es_numerica = True

# Report the outcome.
if not es_numerica:
    print(f"La columna '{nombre_columna}' no se considera numérica.")
else:
    print(f"La columna '{nombre_columna}' se considera numérica.")

La columna 'IC50' no se considera numérica.


In [28]:
# Convert the column to a numeric dtype. errors='coerce' replaces every value
# that cannot be parsed with NaN, so count those replacements and report them
# instead of losing the measurements silently.
valores_originales = df_features[nombre_columna]
valores_numericos = pd.to_numeric(valores_originales, errors='coerce')

# Values that were present before the conversion but are NaN afterwards.
n_coercionados = int((valores_numericos.isna() & valores_originales.notna()).sum())
df_features[nombre_columna] = valores_numericos

if n_coercionados:
    print(f"Aviso: {n_coercionados} valores de '{nombre_columna}' no eran numéricos y se han convertido a NaN.")

# Check whether the conversion succeeded.
if pd.api.types.is_numeric_dtype(df_features[nombre_columna]):
    print(f"La columna '{nombre_columna}' se ha convertido a tipo numérico.")
else:
    print(f"No se pudo convertir la columna '{nombre_columna}' a tipo numérico.")

# Save the updated DataFrame. NOTE: this is the same path the feature table was
# first written to, so that file is overwritten rather than a new one created.
df_features.to_csv('../Data/NRAS_ligands_features.csv', index=False)


La columna 'IC50' se ha convertido a tipo numérico.


In [29]:
# Recompute the summary to confirm that IC50 is now included.
summary = df_features.describe()
print(summary)

       Peso Molecular         LogP  H-bond donor  H-bond acceptor  \
count     1831.000000  1831.000000   1831.000000      1831.000000   
mean       932.066646     5.862662      2.255052        11.759694   
std        101.814032     0.914444      0.504564         1.291051   
min        299.093773     0.510500      2.000000         5.000000   
25%        909.457116     5.450400      2.000000        11.000000   
50%        937.488416     5.920500      2.000000        12.000000   
75%        977.519717     6.389300      2.000000        13.000000   
max       1154.630408     8.069200      6.000000        16.000000   

              TPSA  Rotatable bonds          IC50  
count  1831.000000      1831.000000   1443.000000  
mean    168.357351         9.841617   2138.860707  
std      18.029557         1.385339   3281.785689  
min      54.250000         3.000000      2.000000  
25%     160.460000         9.000000     55.000000  
50%     171.540000        10.000000    550.000000  
75%     177.95

In [79]:
# We should check whether the data follow a normal distribution.
# Normality test.

import pandas as pd
from scipy import stats

# NOTE(review): df_2 is not created in the cells shown above this one —
# confirm it is defined earlier in the notebook so Restart & Run All works.
MW = df_2['MW']

# Missing values would propagate into the test result and make the comparison
# below meaningless, so run the test on the observed values only.
mw_observed = MW.dropna()

# Shapiro-Wilk
statistic, p_value = stats.shapiro(mw_observed)
# Significance level
sig = 0.05
if p_value > sig:
    print("It's assumed that the data follows a normal distribution")
else:
    print("It's assumed that the data don't follows a normal distribution")

It's assumed that the data don't follows a normal distribution
