# Cargar los datos
<hr>

In [1]:
import pandas as pd

In [2]:
# Load the cleaned STAT3 table from its CSV export.
df = pd.read_csv(
    filepath_or_buffer="../Limpieza_datos/Limpios/STAT3_limpio.csv",
)

# Visualizar datos
<hr>

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL503946,,C[S+]([O-])CC[C@H](NC(=O)[C@@H]1Cc2cccc3c2N1C(...,IC50,'=',7600.0,nM
1,CHEMBL515762,,NC(=O)C1CCN(C(=O)[C@@H]2Cc3cccc4c3N2C(=O)[C@@H...,IC50,'=',595.0,nM
2,CHEMBL196561,,CC(=O)N[C@@H](Cc1ccc(OP(=O)(O)O)cc1)C(=O)N[C@@...,IC50,'=',1780.0,nM
3,CHEMBL4164056,,COC(=O)C1=CO[C@@H](O[C@@H]2O[C@H](C)[C@@H](O)[...,IC50,'=',79100.0,nM
4,CHEMBL575484,,CC(C)C[C@H](CCC(N)=O)NC(=O)[C@@H]1[C@@H]2C[C@@...,IC50,'=',1630.0,nM


In [4]:
# (rows, columns) of the loaded table.
df.shape

(830, 7)

# Calcular los descriptores moleculares
<hr>

In [5]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors


def calcular_descriptor(smiles: str) -> dict:
    """Compute the RDKit molecular descriptors for a single molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The dictionary returned by ``Descriptors.CalcMolDescriptors``; the
        rest of the notebook uses its keys as descriptor names and its values
        as descriptor values. The call passes ``missingVal=None`` and
        ``silent=True``.
    """
    # NOTE(review): RDKit documents MolFromSmiles as returning None for SMILES
    # it cannot parse; with silent=True that presumably produces a dictionary
    # of missing values rather than an error -- confirm before relying on it.
    mol = Chem.MolFromSmiles(smiles)

    # Call the explicitly imported module. The `Chem.Descriptors` attribute is
    # only present when some other import has already loaded that submodule.
    return Descriptors.CalcMolDescriptors(mol, missingVal=None, silent=True)
    

In [6]:
# One descriptor dictionary per molecule, stored next to its SMILES string.
df["Descriptors"] = df["Smiles"].map(calcular_descriptor)

In [7]:
# Column names for the descriptor table, taken from the first molecule's
# dictionary. `.iloc[0]` selects the first row by position, so this does not
# depend on the index containing the label 0.
descriptors_names = list(df["Descriptors"].iloc[0])

In [8]:
# Start with an empty list of descriptor value rows.
values_list = list()

In [9]:
# Build every row in a single assignment instead of appending to a list that
# was created in another cell, so re-running this cell cannot duplicate rows.
values_list = [list(descriptors.values()) for descriptors in df["Descriptors"]]

In [10]:
import numpy as np
# Stack the per-molecule value rows into a single NumPy array.
values_list_array = np.asarray(values_list)

In [11]:
# Check the array dimensions: one row per molecule, one column per descriptor.
values_list_array.shape

(830, 210)

In [12]:
# Reuse df's index so each descriptor row keeps the label of the molecule it
# was computed from; the later column-wise concat aligns rows on index labels.
# A row-count mismatch with df now fails here instead of misaligning silently.
descriptors_df = pd.DataFrame(
    values_list_array,
    columns=descriptors_names,
    index=df.index,
)

In [13]:
# Inspect the descriptor table (the notebook renders a truncated view).
descriptors_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.781086,13.781086,0.042028,-4.693935,0.132936,19.523810,618.605,587.357,618.154936,224.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.800832,13.800832,0.021560,-4.674062,0.280863,22.250000,568.523,539.291,568.172301,210.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13.610338,13.610338,0.021957,-4.732044,0.119218,16.536585,597.606,557.286,597.256364,228.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,12.886177,12.886177,0.031177,-1.730830,0.129166,47.127273,792.781,740.365,792.305194,312.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13.770126,13.770126,0.015577,-4.672301,0.148551,21.238095,606.657,563.313,606.281851,232.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
825,13.030548,13.030548,0.148041,-4.431872,0.274084,14.666667,590.528,566.336,590.186493,220.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
826,10.480593,10.480593,0.195833,-0.487870,0.561158,13.133333,416.514,384.258,416.219889,164.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
827,13.557568,13.557568,0.064919,-4.378615,0.173269,13.918367,669.664,639.424,669.231137,250.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
828,13.884730,13.884730,0.029002,-4.300272,0.036766,14.769231,1113.326,1044.782,1112.434726,418.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0


# Crear un DataFrame con descriptores
<hr>

In [14]:
# The descriptor values now live in descriptors_df, so the helper column of
# dictionaries is removed. Rebinding `df` (instead of inplace=True) together
# with errors="ignore" keeps this cell safe to re-run after the column is gone.
df = df.drop(columns=["Descriptors"], errors="ignore")

In [15]:
# Display the table after dropping the "Descriptors" column.
df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL503946,,C[S+]([O-])CC[C@H](NC(=O)[C@@H]1Cc2cccc3c2N1C(...,IC50,'=',7600.0,nM
1,CHEMBL515762,,NC(=O)C1CCN(C(=O)[C@@H]2Cc3cccc4c3N2C(=O)[C@@H...,IC50,'=',595.0,nM
2,CHEMBL196561,,CC(=O)N[C@@H](Cc1ccc(OP(=O)(O)O)cc1)C(=O)N[C@@...,IC50,'=',1780.0,nM
3,CHEMBL4164056,,COC(=O)C1=CO[C@@H](O[C@@H]2O[C@H](C)[C@@H](O)[...,IC50,'=',79100.0,nM
4,CHEMBL575484,,CC(C)C[C@H](CCC(N)=O)NC(=O)[C@@H]1[C@@H]2C[C@@...,IC50,'=',1630.0,nM
...,...,...,...,...,...,...,...
825,CHEMBL5200334,,O=C(c1ccc(-c2cn(-c3ccc(C(F)(F)F)cc3)nn2)nc1)N1...,IC50,'=',112.7,nM
826,CHEMBL5291402,,COc1ccc(/C=C/[C@@H](C[C@@H](O)CCc2ccc(OC)c(OC)...,IC50,'=',12900.0,nM
827,CHEMBL5169762,,Cn1c(C(=O)N2CCN(Cc3ccc(OCC(F)(F)F)cc3)CC2)cc2c...,IC50,'=',216.5,nM
828,CHEMBL5279856,,CCCC[C@H](NC(=O)COCCOCCOCCNC(=O)c1cccc(S(=O)(=...,IC50,'=',35.0,nM


In [16]:
# Put the descriptor columns to the right of the original columns; pandas
# matches the rows of both frames on their index labels.
descriptors_cdf = pd.concat([df, descriptors_df], axis="columns")
descriptors_cdf

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL503946,,C[S+]([O-])CC[C@H](NC(=O)[C@@H]1Cc2cccc3c2N1C(...,IC50,'=',7600.0,nM,13.781086,13.781086,0.042028,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CHEMBL515762,,NC(=O)C1CCN(C(=O)[C@@H]2Cc3cccc4c3N2C(=O)[C@@H...,IC50,'=',595.0,nM,13.800832,13.800832,0.021560,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CHEMBL196561,,CC(=O)N[C@@H](Cc1ccc(OP(=O)(O)O)cc1)C(=O)N[C@@...,IC50,'=',1780.0,nM,13.610338,13.610338,0.021957,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,CHEMBL4164056,,COC(=O)C1=CO[C@@H](O[C@@H]2O[C@H](C)[C@@H](O)[...,IC50,'=',79100.0,nM,12.886177,12.886177,0.031177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHEMBL575484,,CC(C)C[C@H](CCC(N)=O)NC(=O)[C@@H]1[C@@H]2C[C@@...,IC50,'=',1630.0,nM,13.770126,13.770126,0.015577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
825,CHEMBL5200334,,O=C(c1ccc(-c2cn(-c3ccc(C(F)(F)F)cc3)nn2)nc1)N1...,IC50,'=',112.7,nM,13.030548,13.030548,0.148041,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
826,CHEMBL5291402,,COc1ccc(/C=C/[C@@H](C[C@@H](O)CCc2ccc(OC)c(OC)...,IC50,'=',12900.0,nM,10.480593,10.480593,0.195833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
827,CHEMBL5169762,,Cn1c(C(=O)N2CCN(Cc3ccc(OCC(F)(F)F)cc3)CC2)cc2c...,IC50,'=',216.5,nM,13.557568,13.557568,0.064919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
828,CHEMBL5279856,,CCCC[C@H](NC(=O)COCCOCCOCCNC(=O)c1cccc(S(=O)(=...,IC50,'=',35.0,nM,13.884730,13.884730,0.029002,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0


# Guardar DataFrame
<hr>

In [17]:
# Persist the combined table; the index is not written to the file.
descriptors_cdf.to_csv(
    path_or_buf="Dataframes_descriptores/des_STAT3.csv",
    index=False,
)