In [None]:
from openpharmacophore.databases.pubchem import PubChem
import pandas as pd

## Split assay results
Split the assay results into active and inactive compounds, then retrieve the SMILES string for each compound.

In [8]:
# Create the PubChem client used by this cell and by the SMILES lookup further
# down. Only the `PubChem` class is imported above; without this line the name
# `pubchem` is undefined and a fresh kernel fails with NameError.
# NOTE(review): assumes PubChem() takes no constructor arguments — confirm.
pubchem = PubChem()

# Download the results of PubChem assay 713
results = pubchem.get_assay_results(assay_id=713, format="csv")
results

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Qualifier,IC50,Rsquare,...,12.5 uM POC,6.25 uM POC,3.125 uM POC,1.5625 uM POC,50 uM 620 nm FOC,25 uM 620 nm FOC,12.5 uM 620 nm FOC,6.25 uM 620 nm FOC,3.125 uM 620 nm FOC,1.5625 uM 620 nm FOC
0,RESULT_TYPE,,,,,,,STRING,FLOAT,FLOAT,...,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
1,RESULT_DESCR,,,,,,,"One of =, < or >, identifying if the IC50 was ...",Qualified IC50 in micromolar. The concentrati...,"This value indicates the goodness-of-fit, meas...",...,Normalized percent of control at 12.5 uM conce...,Normalized percent of control at 6.25 uM conce...,Normalized percent of control at 3.125 uM conc...,Normalized percent of control at 1.5625 uM con...,Fold of 620 nm signal over DMSO control at 50 ...,Fold of 620 nm signal over DMSO control at 25 ...,Fold of 620 nm signal over DMSO control at 12....,Fold of 620 nm signal over DMSO control at 6.2...,Fold of 620 nm signal over DMSO control at 3.1...,Fold of 620 nm signal over DMSO control at 1.5...
2,RESULT_UNIT,,,,,,,,MICROMOLAR,NONE,...,PERCENT,PERCENT,PERCENT,PERCENT,RATIO,RATIO,RATIO,RATIO,RATIO,RATIO
3,RESULT_IS_ACTIVE_CONCENTRATION,,,,,,,,TRUE,,...,,,,,,,,,,
4,RESULT_ATTR_CONC_MICROMOL,,,,,,,,,,...,12.5,6.25,3.125,1.5625,50,25,12.5,6.25,3.125,1.5625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,435,14746684.0,2530343.0,Active,100.0,,,<,1.5625,0.44,...,56.823,67.253,84.337,35.293,0.89,0.89,0.91,0.95,1.02,2.09
440,436,14746697.0,2566372.0,Active,93.0,,,=,5.19454,1,...,35.233,46.48,59.583,70.967,0.72,0.84,0.79,0.91,1.04,1.11
441,437,14746743.0,5126017.0,Active,93.0,,,=,4.98294,1,...,11.24,31.49,85.177,99.53,1.11,1.26,1.2,1.18,1.09,1.03
442,438,14746975.0,2477701.0,Active,85.0,,,=,8.73614,1,...,39.163,61.33,87.403,102.837,0.81,0.79,0.85,0.91,0.99,1.07


In [23]:
# Restrict the assay table to the compound id and its activity outcome, and
# discard every row where either of those two values is missing.
kept_columns = ["PUBCHEM_CID", "PUBCHEM_ACTIVITY_OUTCOME"]
df = results.loc[:, kept_columns].dropna()

# Sanity check: which outcome labels remain, and how many rows survived
print(df["PUBCHEM_ACTIVITY_OUTCOME"].unique())
print(df.shape)

['Inactive' 'Active']
(439, 2)


In [32]:
# Boolean masks selecting the rows of each assay outcome
is_active = df["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"
is_inactive = df["PUBCHEM_ACTIVITY_OUTCOME"] == "Inactive"

# For each group, drop the outcome label and cast the remaining values to
# int32 (the CIDs are displayed as floats in the downloaded table).
actives = (
    df.loc[is_active]
    .drop(columns="PUBCHEM_ACTIVITY_OUTCOME")
    .astype("int32")
)
inactives = (
    df.loc[is_inactive]
    .drop(columns="PUBCHEM_ACTIVITY_OUTCOME")
    .astype("int32")
)
inactives.head()

Unnamed: 0,PUBCHEM_CID
5,663426
6,9550244
8,644597
9,644708
10,644742


In [35]:
# Plain Python lists of compound ids, one per activity group
actives_list, inactives_list = (
    group["PUBCHEM_CID"].tolist() for group in (actives, inactives)
)

In [42]:
# Look up the SMILES of every compound, one request per CID, keeping the
# order of the CID lists (actives first, then inactives).
actives_smiles = [pubchem.get_compound_smiles(cid) for cid in actives_list]
inactives_smiles = [pubchem.get_compound_smiles(cid) for cid in inactives_list]

# Preview the first five entries of each group
print(actives_smiles[:5])
print(inactives_smiles[:5])

['CC1=CC=C(C=C1)C2=NN=C(O2)CSC3=NC4=C(C5=C(N4)C=CC(=C5)OC)N=N3', 'CC1=CC2=C(C(C(=C(O2)N)C#N)C3=CC4=C(C(=C3)OC)OCO4)C(=O)O1', 'CCC1=CC2=C(C=C1)NC(=O)C(=C2)CN(CC3=CC4=C(C=C3)OCO4)C(=O)C5=CC=CO5', 'COC1=CC2=C(C=C1)NC3=C2CCN4C3=NC5=CC=CC=C5C4=O', 'CC1=NN=C(S1)NC(=O)CC2=CC=CS2']
['CCOC1=CC=CC(=C1)C2=NN=C3N2N=C(S3)C4=CC(=C(C(=C4)OC)OC)OC', 'CC1=CC(=NN1CC2=C(C=CC(=C2)C=C3CCC4=C(C3=O)C=CC(=C4)OC)OC)C', 'C1CCN(CC1)S(=O)(=O)C2=CC=C(C=C2)C3=NC(=NN3)SCC(=O)NC4=NC=CS4', 'C1=CC=C2C(=C1)C=C(C=N2)NC(=O)C3=CC=CO3', 'COC1=CC=C(C=C1)C2=NC(=C(C(=C2)C3=CC=CS3)C#N)SCC(=O)NCC4=CC=CO4']


In [43]:
# Report how many SMILES were collected for each activity group
n_actives = len(actives_smiles)
n_inactives = len(inactives_smiles)
print("Number of actives: {}".format(n_actives))
print("Number of inactives: {}".format(n_inactives))

Number of actives: 221
Number of inactives: 218
