# Cargar los datos
<hr>

In [2]:
import pandas as pd

In [3]:
# Location of the cleaned ESR1 table, relative to this notebook.
ruta_datos = "../Limpieza_datos/Limpios/ESR1_limpio.csv"
df = pd.read_csv(ruta_datos)

# Visualizar datos
<hr>

In [5]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL1479,DANAZOL,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,IC50,'=',17790.0,nM
1,CHEMBL85650,,CC(C)C[C@H]1Sc2cc(O)ccc2O[C@H]1c1ccc(OCCN2CCCC...,IC50,'=',3.0,nM
2,CHEMBL101997,,CC1(c2ccc(OCCN3CCCC3)cc2)c2ccc(O)cc2CCN1c1ccc(...,IC50,'=',31.0,nM
3,CHEMBL100763,,CC1(c2ccc(OCCN3CCCC3)cc2)c2ccc(O)cc2CCN1c1ccc(...,IC50,'=',26.98,nM
4,CHEMBL394614,,O/N=C/c1ccc(-c2ccc(O)cc2)c2occc12,IC50,'=',1330.0,nM


In [6]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(2586, 7)

# Calcular los descriptores moleculares
<hr>

In [7]:
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors


def calcular_descriptor(smiles):
    """Compute the RDKit molecular descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict
        Mapping of descriptor name to descriptor value, as returned by
        ``rdkit.Chem.Descriptors.CalcMolDescriptors``. The call passes
        ``missingVal=None`` and ``silent=True``.

    Raises
    ------
    ValueError
        If ``smiles`` is not a string (e.g. a NaN from a missing cell) or
        if RDKit cannot parse it into a molecule.
    """
    # Guard against missing cells: pandas hands NaN (a float) to the function
    # when the SMILES column has gaps.
    if not isinstance(smiles, str):
        raise ValueError(f"SMILES must be a string, got {type(smiles).__name__}: {smiles!r}")

    # Import the submodule explicitly instead of relying on ``Chem.Descriptors``
    # having been loaded as a side effect of some other import.
    from rdkit.Chem import Descriptors

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than raising.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    return Descriptors.CalcMolDescriptors(mol, missingVal=None, silent=True)
    

In [8]:
# Compute one descriptor dictionary per molecule and keep it alongside the row.
df["Descriptors"] = df["Smiles"].map(calcular_descriptor)

In [9]:
# Take the descriptor names from the row labelled 0; they become the column names later.
first_descriptors = df.loc[0, "Descriptors"]
descriptors_names = list(first_descriptors)

In [10]:
# Inspect the descriptor names that were collected.
descriptors_names

['MaxAbsEStateIndex',
 'MaxEStateIndex',
 'MinAbsEStateIndex',
 'MinEStateIndex',
 'qed',
 'SPS',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'AvgIpc',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',


In [11]:
# Container for the per-molecule descriptor values; it is populated in the following cell.
values_list = []

In [12]:
# Rebuild the list from scratch instead of appending, so re-running this cell
# does not duplicate rows. Each entry holds one molecule's descriptor values.
values_list = [list(descriptors.values()) for descriptors in df["Descriptors"]]

In [13]:
import numpy as np

# Stack the per-molecule value lists into a single array (one row per molecule).
values_list_array = np.asarray(values_list)

In [14]:
# Check the array dimensions: one row per molecule, one column per descriptor.
values_list_array.shape

(2586, 210)

In [15]:
# Wrap the value array in a DataFrame, labelling each column with its descriptor name.
descriptors_df = pd.DataFrame(
    data=values_list_array,
    columns=descriptors_names,
)

In [16]:
# Display the descriptor table.
descriptors_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,11.075595,11.075595,0.113574,-0.908859,0.721043,47.760000,337.463,310.247,337.204179,132.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.838085,9.838085,0.002923,-0.002923,0.595975,21.833333,427.610,394.346,427.218115,162.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13.634593,13.634593,0.237762,-0.465085,0.555101,20.606061,446.566,415.318,446.236956,172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.091639,10.091639,0.317401,-0.385720,0.498357,20.606061,463.021,431.773,462.207406,172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.322322,9.322322,0.225438,0.225438,0.416230,11.368421,253.257,242.169,253.073893,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2581,15.697575,15.697575,0.018455,-1.592048,0.398779,18.685714,482.506,457.306,482.192961,182.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2582,16.086506,16.086506,0.024724,-1.615377,0.257141,17.846154,557.012,528.788,556.174055,204.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2583,13.802432,13.802432,0.163999,-4.033554,0.202413,20.048780,625.604,599.396,626.066616,204.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2584,14.713122,14.713122,0.165005,-3.456554,0.158321,14.452381,602.629,575.413,602.177488,220.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Crear un DataFrame con descriptores
<hr>

In [17]:
# Remove the helper column holding the raw descriptor dictionaries.
# Rebinding with errors="ignore" keeps the cell safe to re-run: a second
# execution no longer raises KeyError once the column is already gone.
df = df.drop(columns=["Descriptors"], errors="ignore")

In [18]:
# Display the activity table after the helper column has been removed.
df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL1479,DANAZOL,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,IC50,'=',17790.00,nM
1,CHEMBL85650,,CC(C)C[C@H]1Sc2cc(O)ccc2O[C@H]1c1ccc(OCCN2CCCC...,IC50,'=',3.00,nM
2,CHEMBL101997,,CC1(c2ccc(OCCN3CCCC3)cc2)c2ccc(O)cc2CCN1c1ccc(...,IC50,'=',31.00,nM
3,CHEMBL100763,,CC1(c2ccc(OCCN3CCCC3)cc2)c2ccc(O)cc2CCN1c1ccc(...,IC50,'=',26.98,nM
4,CHEMBL394614,,O/N=C/c1ccc(-c2ccc(O)cc2)c2occc12,IC50,'=',1330.00,nM
...,...,...,...,...,...,...,...
2581,CHEMBL5084593,,C[C@@H]1Cc2c([nH]c3ccccc23)[C@@H](c2c(F)cc(-n3...,IC50,'=',17.40,nM
2582,CHEMBL5077488,,COc1cc(C(=O)O)cc(Cl)c1-c1cc(F)c([C@@H]2c3[nH]c...,IC50,'=',32.90,nM
2583,CHEMBL5196589,,O=S(=O)(Oc1cccc2ccccc12)C1CC2OC1C(c1ccc([Se]c3...,IC50,'=',310.00,nM
2584,CHEMBL5198894,,COc1cc(Oc2c(-c3cc(F)c(F)cc3C(C)(F)F)sc3c2ccc2[...,IC50,'=',2.50,nM


In [19]:
# Place the descriptor columns next to the original activity columns.
# pd.concat aligns the two frames on their row index.
descriptors_cdf = pd.concat([df, descriptors_df], axis="columns")
descriptors_cdf

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL1479,DANAZOL,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,IC50,'=',17790.00,nM,11.075595,11.075595,0.113574,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CHEMBL85650,,CC(C)C[C@H]1Sc2cc(O)ccc2O[C@H]1c1ccc(OCCN2CCCC...,IC50,'=',3.00,nM,9.838085,9.838085,0.002923,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CHEMBL101997,,CC1(c2ccc(OCCN3CCCC3)cc2)c2ccc(O)cc2CCN1c1ccc(...,IC50,'=',31.00,nM,13.634593,13.634593,0.237762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CHEMBL100763,,CC1(c2ccc(OCCN3CCCC3)cc2)c2ccc(O)cc2CCN1c1ccc(...,IC50,'=',26.98,nM,10.091639,10.091639,0.317401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHEMBL394614,,O/N=C/c1ccc(-c2ccc(O)cc2)c2occc12,IC50,'=',1330.00,nM,9.322322,9.322322,0.225438,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2581,CHEMBL5084593,,C[C@@H]1Cc2c([nH]c3ccccc23)[C@@H](c2c(F)cc(-n3...,IC50,'=',17.40,nM,15.697575,15.697575,0.018455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2582,CHEMBL5077488,,COc1cc(C(=O)O)cc(Cl)c1-c1cc(F)c([C@@H]2c3[nH]c...,IC50,'=',32.90,nM,16.086506,16.086506,0.024724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2583,CHEMBL5196589,,O=S(=O)(Oc1cccc2ccccc12)C1CC2OC1C(c1ccc([Se]c3...,IC50,'=',310.00,nM,13.802432,13.802432,0.163999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2584,CHEMBL5198894,,COc1cc(Oc2c(-c3cc(F)c(F)cc3C(C)(F)F)sc3c2ccc2[...,IC50,'=',2.50,nM,14.713122,14.713122,0.165005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Guardar DataFrame
<hr>

In [20]:
from pathlib import Path

# to_csv does not create missing folders, so make sure the output directory
# exists before writing; exist_ok keeps the cell safe to re-run.
output_path = Path("Dataframes_descriptores/des_ESR1.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the combined table without the row index.
descriptors_cdf.to_csv(output_path, index=False)