In [1]:
import yaml
import pandas as pd
import sys
sys.path.append('..')

# Configuration flag: meant to choose between the two ChEMBL data folders
# ('../data/chemblapi/mics/' when True, '../data/chemblapi/full/' otherwise).
mics_only = True

In [2]:
# Select the input folder from the `mics_only` flag.
# The flag variable itself must be tested here: the quoted string "mics_only"
# is a non-empty literal, which is always truthy, so testing it would pick the
# 'mics' folder no matter what the flag is set to.
if mics_only:
    folder_path = '../data/chemblapi/mics/'
else:
    folder_path = '../data/chemblapi/full/'

In [4]:
# Load the id_name mapping from the YAML file in the selected data folder.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding.
with open(f'{folder_path}id_name.yaml', 'r', encoding='utf-8') as f:
    id_name = yaml.safe_load(f)

In [4]:
# Call the project helper with the selected folder. It returns two values:
# `datasets` (used below as a mapping of ChEMBL id -> DataFrame) and `id_name`.
# Note: this rebinds `id_name`, replacing the mapping read from id_name.yaml
# in the previous cell.
from utils import load_chembl_datasets

datasets, id_name = load_chembl_datasets(folder_path)

In [5]:
# Reorder the datasets so that the ones with the most rows come first.
by_row_count = sorted(datasets.items(), key=lambda pair: pair[1].shape[0], reverse=True)
datasets = dict(by_row_count)

# Print the row count of every non-empty dataset and tally the empty ones.
counter = 0
for chembl_id, dataframe in datasets.items():
    n_rows = dataframe.shape[0]
    if n_rows == 0:
        counter += 1
        continue
    print(f'Dataset: {n_rows} entries on dataset \t\t"{id_name[chembl_id]} ({chembl_id})"')

# Summary line: how many datasets had no rows at all.
print(f'\n\n{counter} datasets with 0 rows')

Dataset: 45668 entries on dataset 		"Mycobacterium tuberculosis (CHEMBL360)"
Dataset: 449 entries on dataset 		"Mycobacterium tuberculosis variant bovis (CHEMBL613086)"
Dataset: 266 entries on dataset 		"Mycobacterium tuberculosis variant bovis BCG (CHEMBL615052)"
Dataset: 122 entries on dataset 		"Enoyl-[acyl-carrier-protein] reductase (CHEMBL1849)"
Dataset: 59 entries on dataset 		"Mycobacterium tuberculosis H37Rv (CHEMBL2111188)"
Dataset: 21 entries on dataset 		"Mycobacterium tuberculosis variant microti (CHEMBL612960)"
Dataset: 4 entries on dataset 		"Thioredoxin reductase (CHEMBL2390811)"


86 datasets with 0 rows


In [6]:
# Collect the datasets that contain at least one row, keeping their order.
datasets_non_empty = {}
for chembl_id, dataframe in datasets.items():
    if dataframe.shape[0] == 0:
        # empty dataset: leave it out of datasets_non_empty
        continue

    # store the readable name from id_name on the dataframe itself
    dataframe.name = id_name[chembl_id]

    # expose the dataframe as a notebook-level variable named after its
    # ChEMBL id (later cells refer to these names directly)
    globals()[chembl_id] = dataframe

    datasets_non_empty[chembl_id] = dataframe


In [7]:
#number of datasets left in datasets_non_empty (i.e. datasets with at least one row)
len(datasets_non_empty)

7

In [8]:
# For each non-empty dataset, print a header line followed by the frequency of
# every bao_label value (or a notice when the column is missing). The counts
# are also kept in bao_labels, keyed by ChEMBL id.
bao_labels = {}
for chembl_id, dataframe in datasets_non_empty.items():
    print(f'\n\nDataset: {dataframe.shape[0]} entries on dataset \t\t"{id_name[chembl_id]} ({chembl_id})"')

    if 'bao_label' not in dataframe.columns:
        print('No bao_label column')
        continue

    bao_labels[chembl_id] = dataframe['bao_label'].value_counts()
    print(bao_labels[chembl_id])



Dataset: 45668 entries on dataset 		"Mycobacterium tuberculosis (CHEMBL360)"
bao_label
organism-based format    43293
assay format              2333
cell-based format           42
Name: count, dtype: int64


Dataset: 449 entries on dataset 		"Mycobacterium tuberculosis variant bovis (CHEMBL613086)"
bao_label
organism-based format    382
assay format              67
Name: count, dtype: int64


Dataset: 266 entries on dataset 		"Mycobacterium tuberculosis variant bovis BCG (CHEMBL615052)"
bao_label
organism-based format    266
Name: count, dtype: int64


Dataset: 122 entries on dataset 		"Enoyl-[acyl-carrier-protein] reductase (CHEMBL1849)"
bao_label
single protein format    122
Name: count, dtype: int64


Dataset: 59 entries on dataset 		"Mycobacterium tuberculosis H37Rv (CHEMBL2111188)"
bao_label
organism-based format    59
Name: count, dtype: int64


Dataset: 21 entries on dataset 		"Mycobacterium tuberculosis variant microti (CHEMBL612960)"
bao_label
organism-based format    21
Nam

In [None]:
# Stack the CHEMBL360 and CHEMBL2111188 dataframes into a single frame with a
# fresh 0..n-1 index. Both names are the notebook-level variables created from
# the dataset ids in an earlier cell.
mtb_frames = [CHEMBL360, CHEMBL2111188]
mtb_dataset = pd.concat(mtb_frames, ignore_index=True)

In [34]:
# display the combined mtb_dataset dataframe
mtb_dataset

Unnamed: 0.1,Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,0,,,34021,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.500
1,1,,,34115,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,1.000
2,2,,,36502,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.125
3,3,,,42561,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.125
4,4,,,44868,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,2.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45722,54,,,15777196,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,5.000
45723,55,,,15777197,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,1.300
45724,56,,,15777198,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,0.900
45725,57,,,15777199,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,1.300


In [33]:
# preview the first rows of mtb_dataset
mtb_dataset.head()

Unnamed: 0.1,Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,0,,,34021,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.5
1,1,,,34115,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,1.0
2,2,,,36502,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.125
3,3,,,42561,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.125
4,4,,,44868,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,2.0


In [35]:
#count how many rows of mtb_dataset fall under each assay_type value
mtb_dataset['assay_type'].value_counts()

assay_type
F    45516
B      211
Name: count, dtype: int64

In [36]:
# Keep only Functional assays: rows whose assay_type is 'F'. Every other
# assay_type value is dropped; the original row index labels are retained.
is_functional = mtb_dataset['assay_type'] == 'F'
mtb_dataset = mtb_dataset[is_functional]

In [37]:
# display mtb_dataset again (at this point in the notebook it holds only the assay_type 'F' rows)
mtb_dataset

Unnamed: 0.1,Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,0,,,34021,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.500
1,1,,,34115,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,1.000
2,2,,,36502,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.125
3,3,,,42561,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,0.125
4,4,,,44868,[],CHEMBL715914,In vitro antibacterial activity of compound ag...,F,,,...,Mycobacterium tuberculosis,Mycobacterium tuberculosis,1773,,,MIC,ug ml-1,UO_0000274,,2.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45722,54,,,15777196,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,5.000
45723,55,,,15777197,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,1.300
45724,56,,,15777198,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,0.900
45725,57,,,15777199,[],CHEMBL3637834,GSK_TB: Minimum inhibitory concentration again...,F,,,...,Mycobacterium tuberculosis H37Rv,Mycobacterium tuberculosis H37Rv,83332,,,MIC,uM,UO_0000065,,1.300


In [38]:
# Write mtb_dataset to CSV without the dataframe index.
# NOTE(review): the file is called "organism_based", but the filter applied
# above is on assay_type == 'F', not on bao_label == 'organism-based format'
# - confirm that this is the intended selection.
output_csv = '../data/chemblapi/organism_based.csv'
mtb_dataset.to_csv(output_csv, index=False)
