# Cargar los datos
<hr>

In [1]:
from pathlib import Path

import pandas as pd

In [3]:
# Load the cleaned MAPK1 dataset produced by the data-cleaning step.
df = pd.read_csv(filepath_or_buffer="../Limpieza_datos/Limpios/MAPK1_limpio.csv")

# Visualizar datos
<hr>

In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL319244,CAFFEIC ACID PHENETHYL ESTER,O=C(/C=C/c1ccc(O)c(O)c1)OCCc1ccccc1,IC50,'=',686.0,nM
1,CHEMBL259551,,Nc1n[nH]c2nnc(-c3c(-c4ccccc4)nn4ccccc34)cc12,IC50,'=',1900.0,nM
2,CHEMBL3695623,,CCNC(=O)Nc1ccc2ncc(-c3ccc(OCCCN4CCOCC4)cc3)nc2n1,IC50,'=',1800.0,nM
3,CHEMBL3662970,,Cc1cc(-c2n[nH]c3cc(NC(=O)NCc4c(C)cc(Cl)cc4Cl)n...,IC50,'=',2.868,nM
4,CHEMBL3658652,,C[C@@H](NC(=O)Nc1cc2[nH]nc(-c3ccc(=O)[nH]c3)c2...,IC50,'=',0.3037,nM


In [5]:
# Check the table dimensions as (rows, columns).
df.shape

(2937, 7)

# Calcular los descriptores moleculares
<hr>

In [6]:
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors


def calcular_descriptor(smiles):
    """Compute the RDKit molecular descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict
        Mapping of descriptor name to value, as returned by
        ``Descriptors.CalcMolDescriptors``. ``missingVal=None`` is passed, so
        a descriptor that cannot be calculated is reported as ``None``.

    Raises
    ------
    ValueError
        If ``smiles`` is not a string (e.g. a NaN coming from an empty CSV
        cell) or RDKit cannot parse it into a molecule.
    """
    # Fail early with a readable message instead of an opaque RDKit error.
    if not isinstance(smiles, str):
        raise ValueError(f"SMILES must be a string, got {smiles!r}")

    # Import the submodule explicitly: `from rdkit import Chem` on its own does
    # not load `Chem.Descriptors`; it was only reachable because another import
    # happened to pull it in.
    from rdkit.Chem import Descriptors

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None when the SMILES cannot be parsed.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    return Descriptors.CalcMolDescriptors(mol, missingVal=None, silent=True)
    

In [7]:
# Compute the descriptor dictionary of every molecule from its SMILES string.
df["Descriptors"] = df["Smiles"].apply(func=calcular_descriptor)

In [9]:
# Descriptor names are the keys of the first molecule's descriptor dictionary.
descriptors_names = [name for name in df["Descriptors"][0]]

In [10]:
# Show the descriptor names taken from the first molecule.
descriptors_names

['MaxAbsEStateIndex',
 'MaxEStateIndex',
 'MinAbsEStateIndex',
 'MinEStateIndex',
 'qed',
 'SPS',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'AvgIpc',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',


In [11]:
# Container for the per-molecule rows of descriptor values.
values_list = []

In [12]:
# Build the rows in one assignment so that re-running this cell rebuilds the
# list instead of appending a duplicate copy of every row to it.
values_list = [list(descriptors.values()) for descriptors in df["Descriptors"]]

In [13]:
import numpy as np
# Stack the per-molecule rows of descriptor values into a single NumPy array.
values_list_array = np.array(object=values_list)

In [14]:
# Check the array dimensions (rows built from df["Descriptors"], values per row).
values_list_array.shape

(2937, 210)

In [15]:
# Wrap the descriptor array in a DataFrame, labelling columns by descriptor name.
descriptors_df = pd.DataFrame(data=values_list_array, columns=descriptors_names)

In [16]:
# Inspect the descriptor table.
descriptors_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,11.537535,11.537535,0.196678,-0.445837,0.503108,10.666667,284.311,268.183,284.104859,108.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.929329,5.929329,0.406375,0.406375,0.519331,11.360000,327.351,314.247,327.123243,120.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.726190,11.726190,0.301854,-0.301854,0.523647,14.281250,436.516,408.292,436.222289,168.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,12.311418,12.311418,0.269358,-0.391885,0.402390,10.933333,441.322,423.178,440.091915,152.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,13.023951,13.023951,0.195296,-0.436066,0.426239,11.931034,392.394,375.258,392.139702,146.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,12.002934,12.002934,0.088378,0.088378,0.179096,11.153846,520.641,488.385,520.269908,198.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
2933,13.391756,13.391756,0.102038,-0.445583,0.419490,14.057143,488.979,463.779,488.172752,178.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2934,12.506190,12.506190,0.083726,-0.083726,0.579073,13.481481,355.401,338.265,355.143310,132.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2935,13.190911,13.190911,0.196594,-0.476820,0.449420,13.939394,463.929,441.753,463.152351,168.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Crear un DataFrame con descriptores
<hr>

In [17]:
# Remove the helper column of descriptor dictionaries. Rebinding `df` (rather
# than mutating in place) together with errors="ignore" keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the missing column.
df = df.drop(columns=["Descriptors"], errors="ignore")

In [18]:
# Inspect the original table after dropping the "Descriptors" helper column.
df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL319244,CAFFEIC ACID PHENETHYL ESTER,O=C(/C=C/c1ccc(O)c(O)c1)OCCc1ccccc1,IC50,'=',686.0000,nM
1,CHEMBL259551,,Nc1n[nH]c2nnc(-c3c(-c4ccccc4)nn4ccccc34)cc12,IC50,'=',1900.0000,nM
2,CHEMBL3695623,,CCNC(=O)Nc1ccc2ncc(-c3ccc(OCCCN4CCOCC4)cc3)nc2n1,IC50,'=',1800.0000,nM
3,CHEMBL3662970,,Cc1cc(-c2n[nH]c3cc(NC(=O)NCc4c(C)cc(Cl)cc4Cl)n...,IC50,'=',2.8680,nM
4,CHEMBL3658652,,C[C@@H](NC(=O)Nc1cc2[nH]nc(-c3ccc(=O)[nH]c3)c2...,IC50,'=',0.3037,nM
...,...,...,...,...,...,...,...
2932,CHEMBL5174190,,C#CCCCCNC(=O)CCCCCn1nc(N)c2cc(-c3c(-c4ccccc4)n...,IC50,'=',1200.0000,nM
2933,CHEMBL5086221,,Cc1cnc(Nc2ccnn2C)nc1-c1ccc2c(c1)CCN([C@H](CO)c...,IC50,'=',9.2000,nM
2934,CHEMBL4650284,KO-947,O=C1Nc2cc3[nH]nc(-c4ccncc4)c3cc2CN1Cc1ccccc1,IC50,'=',10.0000,nM
2935,CHEMBL5070887,,Cc1cnc(Nc2ccnn2C)nc1-c1cc2n(c1)C(=O)N([C@H](CO...,IC50,'=',5.9000,nM


In [19]:
# Place the descriptor columns side by side with the original molecule data.
descriptors_cdf = pd.concat([df, descriptors_df], axis="columns")
descriptors_cdf

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL319244,CAFFEIC ACID PHENETHYL ESTER,O=C(/C=C/c1ccc(O)c(O)c1)OCCc1ccccc1,IC50,'=',686.0000,nM,11.537535,11.537535,0.196678,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CHEMBL259551,,Nc1n[nH]c2nnc(-c3c(-c4ccccc4)nn4ccccc34)cc12,IC50,'=',1900.0000,nM,5.929329,5.929329,0.406375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CHEMBL3695623,,CCNC(=O)Nc1ccc2ncc(-c3ccc(OCCCN4CCOCC4)cc3)nc2n1,IC50,'=',1800.0000,nM,11.726190,11.726190,0.301854,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,CHEMBL3662970,,Cc1cc(-c2n[nH]c3cc(NC(=O)NCc4c(C)cc(Cl)cc4Cl)n...,IC50,'=',2.8680,nM,12.311418,12.311418,0.269358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,CHEMBL3658652,,C[C@@H](NC(=O)Nc1cc2[nH]nc(-c3ccc(=O)[nH]c3)c2...,IC50,'=',0.3037,nM,13.023951,13.023951,0.195296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,CHEMBL5174190,,C#CCCCCNC(=O)CCCCCn1nc(N)c2cc(-c3c(-c4ccccc4)n...,IC50,'=',1200.0000,nM,12.002934,12.002934,0.088378,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
2933,CHEMBL5086221,,Cc1cnc(Nc2ccnn2C)nc1-c1ccc2c(c1)CCN([C@H](CO)c...,IC50,'=',9.2000,nM,13.391756,13.391756,0.102038,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2934,CHEMBL4650284,KO-947,O=C1Nc2cc3[nH]nc(-c4ccncc4)c3cc2CN1Cc1ccccc1,IC50,'=',10.0000,nM,12.506190,12.506190,0.083726,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2935,CHEMBL5070887,,Cc1cnc(Nc2ccnn2C)nc1-c1cc2n(c1)C(=O)N([C@H](CO...,IC50,'=',5.9000,nM,13.190911,13.190911,0.196594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Guardar DataFrame
<hr>

In [20]:
# Write the combined table to disk without the index column. The output folder
# is created first so that saving does not fail when the folder is missing
# (e.g. on a fresh checkout).
output_path = Path("Dataframes_descriptores/des_MAPK1.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
descriptors_cdf.to_csv(output_path, index=False)