# Cargar los datos
<hr>

In [1]:
from pathlib import Path

import pandas as pd
from rdkit.Chem import Descriptors

In [2]:
# Read the cleaned SRC table (CSV) into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../Limpieza_datos/Limpios/SRC_limpio.csv",
)

# Visualizar datos
<hr>

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL101917,,COc1cc2ncc(C#N)c(Nc3ccc(Cl)cc3Cl)c2cc1OC,IC50,'=',5200.0,nM
1,CHEMBL295902,,COC(=O)C(=O)C(=O)c1ccc(CC(NC(C)=O)C(=O)N[C@@H]...,IC50,'=',8600.0,nM
2,CHEMBL2111833,,CC(=O)N[C@@H](Cc1ccc(OP(=O)(O)O)cc1)C(=O)N[C@H...,IC50,'=',100.0,nM
3,CHEMBL77303,,Oc1ccc2c(ncn2-c2ccccc2)c1O,IC50,'=',460.0,nM
4,CHEMBL128705,,COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCOCC4)c(OC)cc23)...,IC50,'=',0.95,nM


In [4]:
# (rows, columns) of the loaded table.
df.shape

(2924, 7)

# Calcular los descriptores moleculares
<hr>

In [5]:
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors


def calcular_descriptor(smiles):
    """Compute the RDKit molecular descriptors for a single SMILES string.

    Args:
        smiles: SMILES representation of the molecule.

    Returns:
        The mapping of descriptor name to value produced by
        ``Descriptors.CalcMolDescriptors``. The call passes
        ``missingVal=None`` (the placeholder for descriptors that cannot be
        computed) and ``silent=True``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # NOTE(review): MolFromSmiles presumably yields None for an unparseable
    # SMILES, in which case every descriptor would fall back to missingVal --
    # confirm, and consider filtering such rows before modelling.

    # Use the explicitly imported Descriptors submodule. Accessing it as
    # ``Chem.Descriptors`` only worked because another import happened to load
    # ``rdkit.Chem.Descriptors`` as a side effect.
    return Descriptors.CalcMolDescriptors(mol, missingVal=None, silent=True)
    

In [6]:
# Attach, for every row, the descriptor dict computed from its SMILES string.
df["Descriptors"] = df["Smiles"].map(calcular_descriptor)

In [7]:
# Descriptor names are the keys of the first row's descriptor dict
# (iterating a dict yields its keys in insertion order).
descriptors_names = list(df.loc[0, "Descriptors"])

In [8]:
# Accumulator for one list of descriptor values per molecule.
values_list = list()

In [9]:
# Build the per-molecule value lists in one pass. Rebinding the name (instead
# of appending to a list created in another cell) keeps this cell idempotent:
# re-running it no longer duplicates every row.
values_list = [list(descriptors.values()) for descriptors in df["Descriptors"]]

In [10]:
import numpy as np
# Stack the per-molecule value lists into a single 2-D array.
values_list_array = np.asarray(values_list)

In [11]:
# Sanity check: dimensions of the descriptor value array.
values_list_array.shape

(2924, 210)

In [12]:
# Wrap the value array in a DataFrame with one named column per descriptor.
descriptors_df = pd.DataFrame(
    data=values_list_array,
    columns=descriptors_names,
)

In [13]:
# Display the descriptor table.
descriptors_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,9.456555,9.456555,0.380312,0.380312,0.682131,10.360000,374.227,361.123,373.038482,126.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.308309,13.308309,0.045593,-1.625766,0.036251,13.936170,663.681,622.353,663.275172,258.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13.521368,13.521368,0.018500,-4.697674,0.190481,18.214286,601.637,561.317,601.255302,228.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.692648,9.692648,0.156656,-0.183166,0.626393,10.823529,226.235,216.155,226.074228,84.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.746687,9.746687,0.388472,0.388472,0.348288,13.914286,561.864,535.656,560.082595,184.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,11.623111,11.623111,0.520255,-3.348954,0.376574,14.857143,508.673,480.449,508.171516,182.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2920,11.762194,11.762194,0.036254,-3.871531,0.378871,22.617647,483.550,458.350,483.157640,178.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2921,13.054358,13.054358,0.097754,-4.539847,0.371853,14.692308,533.558,507.350,533.215093,200.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2922,13.039363,13.039363,0.065499,-0.065499,0.592569,10.925926,356.429,336.269,356.163711,134.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Crear un DataFrame con descriptores
<hr>

In [14]:
# Remove the helper column holding the raw descriptor dicts. Reassigning with
# errors="ignore" (instead of inplace=True) keeps the cell safe to re-run: a
# second execution no longer raises KeyError for the already-removed column.
df = df.drop(columns=["Descriptors"], errors="ignore")

In [15]:
# Display the table after dropping the "Descriptors" column.
df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL101917,,COc1cc2ncc(C#N)c(Nc3ccc(Cl)cc3Cl)c2cc1OC,IC50,'=',5200.00,nM
1,CHEMBL295902,,COC(=O)C(=O)C(=O)c1ccc(CC(NC(C)=O)C(=O)N[C@@H]...,IC50,'=',8600.00,nM
2,CHEMBL2111833,,CC(=O)N[C@@H](Cc1ccc(OP(=O)(O)O)cc1)C(=O)N[C@H...,IC50,'=',100.00,nM
3,CHEMBL77303,,Oc1ccc2c(ncn2-c2ccccc2)c1O,IC50,'=',460.00,nM
4,CHEMBL128705,,COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCOCC4)c(OC)cc23)...,IC50,'=',0.95,nM
...,...,...,...,...,...,...,...
2919,CHEMBL4846921,,CCN1CCN(c2ccc(Nc3ncc4scc(-c5cccc(NS(C)(=O)=O)c...,IC50,'=',3.40,nM
2920,CHEMBL5280361,,C#CCO[C@@H]1[C@H](C)[C@@H](COS(=O)(=O)C=C)O[C@...,IC50,'=',6.80,nM
2921,CHEMBL4787515,,Cc1ccc(NC(=O)c2cccc(C(F)(F)F)c2)cc1C#Cc1nn(C2C...,IC50,'=',2.00,nM
2922,CHEMBL359898,,Cc1cccc(C)c1-c1cc2cnc(Nc3ccccc3)nc2n(C)c1=O,IC50,'=',14.00,nM


In [16]:
# Place the original columns and the descriptor columns side by side
# (rows are aligned on the index).
descriptors_cdf = pd.concat([df, descriptors_df], axis="columns")
descriptors_cdf

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL101917,,COc1cc2ncc(C#N)c(Nc3ccc(Cl)cc3Cl)c2cc1OC,IC50,'=',5200.00,nM,9.456555,9.456555,0.380312,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CHEMBL295902,,COC(=O)C(=O)C(=O)c1ccc(CC(NC(C)=O)C(=O)N[C@@H]...,IC50,'=',8600.00,nM,13.308309,13.308309,0.045593,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CHEMBL2111833,,CC(=O)N[C@@H](Cc1ccc(OP(=O)(O)O)cc1)C(=O)N[C@H...,IC50,'=',100.00,nM,13.521368,13.521368,0.018500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CHEMBL77303,,Oc1ccc2c(ncn2-c2ccccc2)c1O,IC50,'=',460.00,nM,9.692648,9.692648,0.156656,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHEMBL128705,,COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCOCC4)c(OC)cc23)...,IC50,'=',0.95,nM,9.746687,9.746687,0.388472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,CHEMBL4846921,,CCN1CCN(c2ccc(Nc3ncc4scc(-c5cccc(NS(C)(=O)=O)c...,IC50,'=',3.40,nM,11.623111,11.623111,0.520255,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2920,CHEMBL5280361,,C#CCO[C@@H]1[C@H](C)[C@@H](COS(=O)(=O)C=C)O[C@...,IC50,'=',6.80,nM,11.762194,11.762194,0.036254,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2921,CHEMBL4787515,,Cc1ccc(NC(=O)c2cccc(C(F)(F)F)c2)cc1C#Cc1nn(C2C...,IC50,'=',2.00,nM,13.054358,13.054358,0.097754,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2922,CHEMBL359898,,Cc1cccc(C)c1-c1cc2cnc(Nc3ccccc3)nc2n(C)c1=O,IC50,'=',14.00,nM,13.039363,13.039363,0.065499,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Guardar DataFrame
<hr>

In [17]:
# Write the combined table to CSV without the index column. The output folder
# is created first so a fresh checkout (where it may not exist yet) does not
# fail on the write.
output_path = Path("Dataframes_descriptores/des_SRC.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
descriptors_cdf.to_csv(output_path, index=False)