# Do initial filtering on raw ChEMBL datasets to generate the "raw" datasets to be provided to tutorial users

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from atomsci.ddm.utils.struct_utils import mol_wt_from_smiles, base_smiles_from_smiles
from atomsci.ddm.utils.data_curation_functions import compute_negative_log_responses, standardize_relations
from atomsci.ddm.utils.curate_data import remove_outlier_replicates, aggregate_assay_data
from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse


2024-02-27 22:49:10.986131: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-27 22:49:11.034070: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-27 22:49:11.034155: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-27 22:49:11.034211: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-27 22:49:11.044520: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-27 22:49:11.046315: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

## Read the data

In [2]:
# The raw ChEMBL downloads are semicolon-delimited, hence sep=';'.
# IC50 measurements, one file per target.
slc6a2_ic50_raw_df, slc6a3_ic50_raw_df, slc6a4_ic50_raw_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'dataset/SLC6A2_IC50_raw.csv',
        'dataset/SLC6A3_IC50_raw.csv',
        'dataset/SLC6A4_IC50_raw.csv',
    )
)
# Ki measurements, one file per target.
slc6a2_ki_raw_df, slc6a3_ki_raw_df, slc6a4_ki_raw_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'dataset/SLC6A2_Ki_raw.csv',
        'dataset/SLC6A3_Ki_raw.csv',
        'dataset/SLC6A4_Ki_raw.csv',
    )
)

In [3]:
# Two-level lookup of the raw tables: raw_data[endpoint][target] -> DataFrame.
raw_data = {
    'IC50': {
        'SLC6A2': slc6a2_ic50_raw_df,
        'SLC6A3': slc6a3_ic50_raw_df,
        'SLC6A4': slc6a4_ic50_raw_df,
    },
    'Ki': {
        'SLC6A2': slc6a2_ki_raw_df,
        'SLC6A3': slc6a3_ki_raw_df,
        'SLC6A4': slc6a4_ki_raw_df,
    },
}
# Iteration orders used by later cells; targets are taken from the IC50 entry.
endpoints = list(raw_data)
targets = list(raw_data['IC50'])

## Filter datasets to remove rows with null or non-positive values and null or unsupported units; select and rename columns

In [4]:
# Columns retained from the raw tables, and the snake_case names they are renamed to.
keep_cols = ['Molecule ChEMBL ID', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value','Standard Units']
new_cols = [name.lower().replace(' ', '_') for name in keep_cols]
colmap = {old_name: new_name for old_name, new_name in zip(keep_cols, new_cols)}

# filt_data[endpoint][target] -> filtered DataFrame, mirroring the layout of raw_data.
filt_data = {endpoint: {} for endpoint in endpoints}
for endpoint in endpoints:
    for target in targets:
        raw_df = raw_data[endpoint][target]
        print(f"\n{target} {endpoint}: {len(raw_df)} rows")

        # Keep only the columns of interest under their new names, then
        # discard rows that have no units recorded.
        filt_df = raw_df[keep_cols].rename(columns=colmap)
        filt_df = filt_df.loc[filt_df['standard_units'].notna()]
        print(f"Dropped {len(raw_df) - len(filt_df)} rows with null units")

        # Discard rows whose value is missing, zero or negative.
        nrows = len(filt_df)
        filt_df = filt_df.loc[filt_df['standard_value'].notna()]
        filt_df = filt_df.loc[filt_df['standard_value'] > 0.0]
        print(f"Dropped {nrows - len(filt_df)} rows with missing or 0/negative values")

        # Only rows reported in µM or nM are retained.
        nrows = len(filt_df)
        filt_df = filt_df.loc[filt_df['standard_units'].isin(['µM', 'nM'])]
        print(f"Dropped {nrows - len(filt_df)} rows with weird units")
        unit_counts = filt_df['standard_units'].value_counts()
        print(f"Row counts by units:\n{unit_counts}")

        # Turn literal 'nan' strings in the relation column into real NaN values.
        # .assign() returns a fresh frame, so nothing is written into a filtered slice.
        filt_df = filt_df.assign(
            standard_relation=filt_df['standard_relation'].replace('nan', np.nan)
        )
        relation_counts = filt_df['standard_relation'].value_counts()
        print(f"Row counts by relation:\n{relation_counts}")

        # Keep the result in memory and write it next to the raw files.
        filt_data[endpoint][target] = filt_df
        filt_file = f"dataset/{target}_{endpoint}.csv"
        filt_df.to_csv(filt_file, index=False)
        print(f"Wrote filtered raw data to {filt_file}")


SLC6A2 IC50: 3454 rows
Dropped 868 rows with null units
Dropped 6 rows with missing or 0/negative values
Dropped 1 rows with weird units
Row counts by units:
standard_units
nM    2564
µM      15
Name: count, dtype: int64
Row counts by relation:
standard_relation
'='    2395
'>'     182
'<'       2
Name: count, dtype: int64
Wrote filtered raw data to dataset/SLC6A2_IC50.csv

SLC6A3 IC50: 3412 rows
Dropped 906 rows with null units
Dropped 13 rows with missing or 0/negative values
Dropped 1 rows with weird units
Row counts by units:
standard_units
nM    2488
µM       4
Name: count, dtype: int64
Row counts by relation:
standard_relation
'='    2051
'>'     435
'<'       4
Name: count, dtype: int64
Wrote filtered raw data to dataset/SLC6A3_IC50.csv

SLC6A4 IC50: 4530 rows
Dropped 919 rows with null units
Dropped 19 rows with missing or 0/negative values
Dropped 1 rows with weird units
Row counts by units:
standard_units
nM    3590
µM       1
Name: count, dtype: int64
Row counts by relation