In [1]:
# Add the parent directory to the module search path so that packages located
# there can be imported by the cells below (presumably where `logics_pack`
# lives -- confirm against the repository layout).
import sys
sys.path.append('..')

In [2]:
from logics_pack import global_settings, dataset, smiles_vocab, chemistry
import pandas as pd
import numpy as np
import json

# Build the project's path lookup, passing '../' as the project directory.
# The cells below index the result by string keys such as "SMILES_TOKENS_PATH"
# and "CHEMBL_RAW_PATH" to locate every input and output file.
project_paths = global_settings.build_project_paths(project_dir='../')

In [3]:
# Initialise the vocabulary from the project's SMILES-token file, then wrap it
# in the tokenizer that the dataset-processing cells below receive as `smtk`.
smiles_tokens_path = project_paths["SMILES_TOKENS_PATH"]
vocab_obj = smiles_vocab.Vocabulary(init_from_file=smiles_tokens_path)
smtk = smiles_vocab.SmilesTokenizer(vocab_obj)

Process ChEMBL dataset

In [4]:
# Run the raw ChEMBL file through the project's processing routine (which is
# also handed the tokenizer), keeping the processed entries in `chembl_new`.
chembl_new = dataset.process_chembl(project_paths["CHEMBL_RAW_PATH"], smtk)

# Persist the processed entries as plain text, one entry per line.
with open(project_paths["CHEMBL_DATA_PATH"], 'w') as chembl_out:
    chembl_out.writelines(entry + '\n' for entry in chembl_new)

Process KOR dataset

In [4]:
# Load the raw KOR records; `labels` holds the pCHEMBL value for each SMILES.
smiles, labels = dataset.process_DiverseDRL_KOR(project_paths["KOR_RAW_PATH"])

# Canonicalize every SMILES string, preserving the original order.
canons = list(map(chemistry.convert_to_canon, smiles))

# A None entry is reported as an invalid SMILES; processing is not stopped here.
has_invalid_smiles = None in canons
if has_invalid_smiles:
    print("There are invalid SMILES in the raw KOR dataset!")

In [5]:
# Seed the processing object by hand: canonical SMILES plus their pCHEMBL labels
# become the raw table, and a copy of it becomes the working (filtered) table.
kor_assay_obj = dataset.PubChemProcessLOGICS()
kor_assay_obj.raw = pd.DataFrame(canons, columns=['smiles'])
kor_assay_obj.raw['pCHEMBL'] = labels
kor_assay_obj.filtered = kor_assay_obj.get_raw().copy()

# Apply the filters in order; the undefined-token filter prints the indices of
# the records it drops.
kor_assay_obj.filter_del_disconnected_smiles()
kor_assay_obj.filter_del_undefined_tokens(smtk)
kor_assay_obj.filter_append_median(median_of='pCHEMBL', groupby='smiles')

# Keep one row per SMILES with its median pCHEMBL, exposed as `affinity`.
kor_data = (
    kor_assay_obj.filtered[['smiles', 'med_pCHEMBL']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'med_pCHEMBL': 'affinity'})
)

# Write the processed KOR data (no index column).
kor_data.to_csv(project_paths["KOR_DATA_PATH"], index=False)

following indices of records are dropped due to undefined tokens:
[515, 3906, 4319]


In [6]:
# Split the KOR records into folds for predictor training and store the
# resulting fold assignment as JSON.
num_kor_records = len(kor_data)
kor_fold_dict = dataset.fold_splits(num_kor_records, global_settings.NUM_DATA_FOLDS)

with open(project_paths["KOR_FOLD_JSON"], 'w') as fold_file:
    json.dump(kor_fold_dict, fold_file)

In [8]:
# Compute Morgan fingerprint features for the processed KOR SMILES, convert
# them to a NumPy array, and save the array to disk.
kor_smiles = kor_data["smiles"].tolist()
rdk_fps = chemistry.get_fps_from_smilist(kor_smiles)
np_fps = chemistry.rdk2npfps(rdk_fps)
np.save(project_paths["KOR_DATA_FP"], np_fps)

Process PIK3CA dataset

In [9]:
# PIK3CA dataset processing: load the raw JSON response, keep the Ki and Kd
# activity records, convert their values to pKx, and write one median affinity
# per SMILES.
pik3_assay_obj = dataset.PubChemProcessLOGICS()
pik3_assay_obj.load_json_response(project_paths["PIK3CA_RAW_PATH"])

# Ki activity only
pik3_assay_obj.filter_set_default_columns()
pik3_assay_obj.filter_set_exactly('acname', 'Ki')
pik3_ki_subset = pik3_assay_obj.filtered.copy()

# Kd activity only (reset first so the Ki filter above does not carry over)
pik3_assay_obj.reset_filter()
pik3_assay_obj.filter_set_default_columns()
pik3_assay_obj.filter_set_exactly('acname', 'Kd')
pik3_kd_subset = pik3_assay_obj.filtered.copy()

# Kd + Ki -> Kx
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the two subsets row-wise and renumbers the index in the same way.
pik3_assay_obj.filtered = pd.concat([pik3_ki_subset, pik3_kd_subset], ignore_index=True)
pik3_assay_obj.filter_set_exactly('acqualifier', '=')
pik3_assay_obj.filter_del_variant()
pik3_assay_obj.filter_append_smiles_download()
pik3_assay_obj.filter_del_disconnected_smiles()
pik3_assay_obj.filter_del_undefined_tokens(smtk)  # prints the indices of dropped records

# transform the acvalue to get pKx = -log10(acvalue * 1e-6)
# NOTE(review): the 1e-6 factor presumably converts micromolar to molar -- confirm
# the unit of 'acvalue'. Non-positive values would yield inf/NaN here.
pik3_assay_obj.filtered['pKx'] = -np.log10(pik3_assay_obj.filtered['acvalue']*(10**-6)) # pKx

# Keep one row per SMILES with its median pKx, exposed as `affinity`.
pik3_assay_obj.filter_append_median(median_of='pKx', groupby='smiles')
pik3ca_data = pik3_assay_obj.filtered[['smiles', 'med_pKx']].drop_duplicates().reset_index(drop=True)
pik3ca_data.rename(columns={'med_pKx':'affinity'}, inplace=True)

# write the processed PIK3CA data
pik3ca_data.to_csv(project_paths["PIK3CA_DATA_PATH"], index=False)

following indices of records are dropped due to undefined tokens:
[]


In [10]:
# Split the PIK3CA records into folds for predictor training and store the
# resulting fold assignment as JSON.
num_pik3ca_records = len(pik3ca_data)
pik3ca_fold_dict = dataset.fold_splits(num_pik3ca_records, global_settings.NUM_DATA_FOLDS)

with open(project_paths["PIK3CA_FOLD_JSON"], 'w') as fold_file:
    json.dump(pik3ca_fold_dict, fold_file)

In [10]:
# Compute Morgan fingerprint features for the processed PIK3CA SMILES, convert
# them to a NumPy array, and save the array to disk.
pik3ca_smiles = pik3ca_data["smiles"].tolist()
rdk_fps = chemistry.get_fps_from_smilist(pik3ca_smiles)
np_fps = chemistry.rdk2npfps(rdk_fps)
np.save(project_paths["PIK3CA_DATA_FP"], np_fps)