In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [52]:
# Execute the helper notebooks so the names they define become available here.
# Later cells call sample_frac_max, fetchISMILES, voxelize,
# expand_conformers_and_labels and serialize, none of which are defined in this
# notebook — presumably they come from the notebooks run below (TODO confirm
# which notebook provides which function).
%run pug.ipynb
%run sample.ipynb
# NOTE(review): 'serialze' looks like a misspelling of 'serialize' — confirm the
# file name on disk before changing it.
%run serialze.ipynb
%run embed_voxel_acellera.ipynb

In [3]:
# Read the AID_2551 data table CSV stored under ../data/pubchem.
# low_memory=False asks pandas to parse the file in a single pass instead of
# chunk by chunk, so column dtypes are inferred from the whole column.
rorg_results = pd.read_csv(
    '../data/pubchem/AID_2551_datatable_all.csv',
    low_memory=False,
)

In [4]:
# Clean the raw assay table in a single chain:
#   1. skip the first four rows (presumably non-data header/description rows of
#      the PubChem export — TODO confirm against the CSV),
#   2. drop records that lack a compound ID or a substance ID,
#   3. keep only the compound ID, activity outcome, phenotype and max response,
#   4. cast the compound ID to int.
# The cast is done with `.assign` on the chained result rather than by
# assigning to a column of a frame obtained through slicing/selection, which
# is the pattern that triggers pandas' SettingWithCopyWarning.
# NOTE: this cell rebinds `rorg_results`, so it is meant to run exactly once
# after the load cell; re-running it without reloading fails because
# 'PUBCHEM_SID' has already been dropped.
rorg_results = (
    rorg_results[4:]
    .dropna(subset=['PUBCHEM_CID', 'PUBCHEM_SID'])
    .loc[:, ['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME', 'Phenotype', 'Max_Response']]
    .assign(PUBCHEM_CID=lambda frame: frame['PUBCHEM_CID'].astype(int))
)

In [5]:
# Requested class fractions for the activity-outcome data set, keyed first by
# column name and then by class label.
samp_frac_outcome = {
    'PUBCHEM_ACTIVITY_OUTCOME': {
        'Active': 0.5,
        'Inactive': 0.5,
    },
}

# sample_frac_max is defined outside this notebook; presumably it subsamples
# the frame so the classes match the requested fractions — TODO confirm.
rorg_outcome = sample_frac_max(
    rorg_results[['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME']],
    samp_frac_outcome,
)

In [6]:
# Requested class fractions for the phenotype data set, keyed first by column
# name and then by class label.
# NOTE(review): the three fractions sum to 0.99, not 1 — confirm whether
# sample_frac_max expects them to sum to one.
samp_frac_phenotype = {
    'Phenotype': {
        'Activator': 0.33,
        'Inhibitor': 0.33,
        'Inactive': 0.33,
    },
}

# sample_frac_max is defined outside this notebook; presumably it subsamples
# the frame so the classes match the requested fractions — TODO confirm.
rorg_phenotype = sample_frac_max(
    rorg_results[['PUBCHEM_CID', 'Phenotype']],
    samp_frac_phenotype,
)

In [7]:
# Pool the compound IDs of both sampled sets (phenotype first, then outcome)
# and keep the first occurrence of each, so every compound is looked up once.
unq_cids = pd.concat(
    [rorg_phenotype['PUBCHEM_CID'], rorg_outcome['PUBCHEM_CID']]
).drop_duplicates()

# fetchISMILES is defined outside this notebook; the following cells use its
# result as a frame with PUBCHEM_CID and ISOMERIC_SMILES columns.
unq_smis = fetchISMILES(unq_cids)

In [8]:
# Make the join key an int on the SMILES table before merging it into both
# sampled sets.
unq_smis['PUBCHEM_CID'] = unq_smis['PUBCHEM_CID'].astype(int)

# Left joins: every sampled row is kept, with the columns of unq_smis attached
# where the compound ID matches.
rorg_outcome = rorg_outcome.merge(unq_smis, how='left', on='PUBCHEM_CID')
rorg_phenotype = rorg_phenotype.merge(unq_smis, how='left', on='PUBCHEM_CID')

In [60]:
# Integer-encode the class column of each data set into a new Y_LABEL column.
# The same encoder object is refit for each frame in turn, so after this cell
# `lab_encode` holds the mapping learned from the Phenotype column only.
lab_encode = LabelEncoder()
for _frame, _class_column in (
    (rorg_outcome, 'PUBCHEM_ACTIVITY_OUTCOME'),
    (rorg_phenotype, 'Phenotype'),
):
    _frame['Y_LABEL'] = lab_encode.fit_transform(_frame[_class_column])

In [65]:
# One-hot encode the class column of each data set into a new Y_ONEHOT column.
#
# OneHotEncoder expects input shaped (n_samples, n_features). The class column
# is a single feature, so it must be reshaped to (-1, 1); reshaping to (1, -1)
# would present the whole column as one sample with one feature per row and
# yield a single 1 x N row instead of one one-hot vector per compound.
#
# fit_transform returns a sparse matrix by default, so it is densified with
# .toarray() and split into a list of 1-D vectors: each row of the frame then
# holds its own one-hot vector.
#
# The same encoder object is refit for each frame, so after this cell
# `one_encode` holds the categories learned from the Phenotype column only.
one_encode = OneHotEncoder()
rorg_outcome['Y_ONEHOT'] = list(
    one_encode.fit_transform(
        rorg_outcome.PUBCHEM_ACTIVITY_OUTCOME.to_numpy().reshape(-1, 1)
    ).toarray()
)
rorg_phenotype['Y_ONEHOT'] = list(
    one_encode.fit_transform(
        rorg_phenotype.Phenotype.to_numpy().reshape(-1, 1)
    ).toarray()
)

In [None]:
# Columns carried along with every voxel grid as its label record.
label_indices = ['Y_LABEL', 'Y_ONEHOT', 'PUBCHEM_CID', 'ISOMERIC_SMILES']

# voxelize and expand_conformers_and_labels are defined outside this notebook.
# Presumably voxelize produces `nconformers` grids per SMILES and the expand
# step repeats each label record once per conformer — TODO confirm.

# Activity-outcome data set: 6 conformers per compound.
outcome_labels = rorg_outcome[label_indices].to_numpy()
outcome_grids = voxelize(rorg_outcome['ISOMERIC_SMILES'], nconformers=6)
outcome_x, outcome_y = expand_conformers_and_labels(
    outcome_grids,
    outcome_labels,
)

# Phenotype data set: 20 conformers per compound.
phenotype_labels = rorg_phenotype[label_indices].to_numpy()
phenotype_grids = voxelize(rorg_phenotype['ISOMERIC_SMILES'], nconformers=20)
phenotype_x, phenotype_y = expand_conformers_and_labels(
    phenotype_grids,
    phenotype_labels,
)

In [None]:
# Persist the four arrays through the externally defined `serialize` helper,
# passing the object, a name, the target directory and the format string.
# NOTE(review): the input CSV is read from '../data/...', while the output
# directory here is 'data/datasets' without the '../' prefix — confirm this is
# the intended location.
for _name, _payload in (
    ('outcome_x', outcome_x),
    ('outcome_y', outcome_y),
    ('phenotype_x', phenotype_x),
    ('phenotype_y', phenotype_y),
):
    serialize(_payload, _name, 'data/datasets', 'feather')