In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pickle import dump
import pickle as pkl
from statsmodels.distributions.empirical_distribution import ECDF
import copy

import os
import sys

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Locations of the FS-Mol code checkout and of the FS-Mol dataset on disk.
FS_MOL_CHECKOUT_PATH = '/system/user/publicwork/schimunek/FS-Mol'
FS_MOL_DATASET_PATH = '/system/user/publicdata/FS-Mol/datasets/'

# Work from inside the checkout and put it first on the import path
# (presumably so that the `fs_mol` import that follows resolves to this checkout).
os.chdir(FS_MOL_CHECKOUT_PATH)
sys.path.insert(0, FS_MOL_CHECKOUT_PATH)

from fs_mol.data import FSMolDataset, DataFold

# Train set

In [2]:
# Open the FS-Mol dataset stored under FS_MOL_DATASET_PATH.
dataset = FSMolDataset.from_directory(FS_MOL_DATASET_PATH)

# Iterable over the tasks of the TRAIN fold; the following cells loop over it
# more than once (task names first, then the molecules of every task).
task_iterable_train = dataset.get_task_reading_iterable(DataFold.TRAIN)

In [3]:
# Names of all training tasks, in the order the iterable yields them.
tasks_train = [train_task.name for train_task in task_iterable_train]

# Task name -> integer task id (the position of the name in `tasks_train`).
tasks_train_id_dict = {
    task_name: position for position, task_name in enumerate(tasks_train)
}

In [4]:
# Make mol_id by comparing canonical smiles
# Make triplet: Mol_id, Task_id, Labels
#
# Every distinct SMILES string gets the next free integer id the first time it
# is seen; each (molecule, task) measurement then adds one entry to the three
# parallel lists mol_ids / task_ids / labels.

mol_ids = list()
task_ids = list()
labels = list()

train_smiles_molId_dict = dict()
id_counter = 0

# Per-molecule features keyed by SMILES, stored on first occurrence only.
fingerprints = dict()
descriptors = dict()

for task in iter(task_iterable_train):
    task_id = tasks_train_id_dict[task.name]

    for sample in task.samples:
        smiles = sample.smiles

        # Test membership on the dict itself (hash lookup). Building
        # list(dict.keys()) for every sample made this loop quadratic in the
        # number of distinct molecules.
        if smiles not in train_smiles_molId_dict:
            train_smiles_molId_dict[smiles] = id_counter
            id_counter += 1

        mol_ids.append(train_smiles_molId_dict[smiles])
        task_ids.append(task_id)
        labels.append(sample.bool_label)

        if smiles not in fingerprints:
            fingerprints[smiles] = sample.fingerprint
            descriptors[smiles] = sample.descriptors
        
            
            

In [5]:
# Re-key the per-molecule features from SMILES to integer mol_id, then stack
# the values (in dict insertion order) into numpy arrays, one row per molecule.

fingerprints_temp = {
    train_smiles_molId_dict[smiles]: fingerprint
    for smiles, fingerprint in fingerprints.items()
}
descriptors_temp = {
    train_smiles_molId_dict[smiles]: descriptor
    for smiles, descriptor in descriptors.items()
}

# From here on `fingerprints` / `descriptors` are arrays, no longer dicts.
fingerprints = np.array([*fingerprints_temp.values()])
descriptors = np.array([*descriptors_temp.values()])

In [6]:
# Keep an untouched copy of the raw training descriptors and persist it; the
# ECDFs used for quantile mapping are built from this array.
descriptors_raw_forECDF = copy.deepcopy(descriptors)
np.save('/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/training/descriptors_raw_forECDF.npy', descriptors_raw_forECDF)

# Replace every descriptor value by its empirical quantile within its column.
descriptors_quantils = np.zeros_like(descriptors_raw_forECDF)

n_descriptor_columns = descriptors_raw_forECDF.shape[1]
for column in range(n_descriptor_columns):
    column_values = descriptors_raw_forECDF[:, column].ravel()
    column_ecdf = ECDF(column_values)
    descriptors_quantils[:, column] = column_ecdf(column_values)
    
    

In [7]:
# Concatenate fingerprints and descriptor quantiles column-wise into the
# per-molecule feature matrix.
mol_inputs = np.hstack((fingerprints, descriptors_quantils))

# Print the matrix shape and display its first five rows.
print(mol_inputs.shape)
mol_inputs[:5, :]

(216827, 2248)


array([[0.        , 2.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.99901307],
       [0.        , 0.        , 0.        , ..., 0.9312678 , 0.98425013,
        0.99901307],
       [0.        , 1.        , 0.        , ..., 0.9312678 , 0.9554345 ,
        0.99901307],
       [0.        , 0.        , 0.        , ..., 0.9312678 , 0.97377634,
        0.99901307],
       [0.        , 0.        , 0.        , ..., 0.9312678 , 0.97377634,
        0.99901307]])

In [8]:
# Normalize mol_inputs and save scaler
#
# Zero every non-finite entry (NaN, +inf and -inf) in place so that the scaler
# only sees finite values. The test is done numerically with np.isfinite
# instead of casting the whole matrix to strings, which was very expensive and
# did not match '-inf'.
mol_inputs[~np.isfinite(mol_inputs)] = 0

# Fit the standardization on the training features only.
scaler = StandardScaler()
scaler.fit(mol_inputs)

# Persist the fitted scaler; the `with` block closes the file handle.
with open('/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/scaler_trainFitted.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [9]:
# Standardize the training features with the fitted scaler.
# NOTE: the result overwrites `mol_inputs`, so executing this cell a second
# time would apply the transform to values that are already scaled.
mol_inputs = scaler.transform(mol_inputs)

In [10]:
# Per-task lists of active and inactive molecule ids (training fold).
triplett_ds = pd.DataFrame({'mol': mol_ids, 'task': task_ids, 'labels': labels})

task_actives = {}
task_inactives = {}

for task in np.unique(task_ids):
    task_rows = triplett_ds[triplett_ds['task'] == task]

    active_mask = task_rows['labels'] == True
    inactive_mask = task_rows['labels'] == False

    active_mols = list(task_rows.loc[active_mask, 'mol'])
    inactive_mols = list(task_rows.loc[inactive_mask, 'mol'])

    # Every task must contribute at least one molecule of each class.
    if not active_mols:
        raise ValueError('Active set: Empty list!')
    if not inactive_mols:
        raise ValueError('Inactive set: Empty list!')

    task_actives[task] = active_mols
    task_inactives[task] = inactive_mols
    

In [11]:
# Save files
# All training artifacts are written into the same output directory.
training_dir = '/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/training/'

# molecular features
np.save(training_dir + 'mol_inputs.npy', mol_inputs)

# Triplets, each reshaped to a single column before saving
mol_ids = np.array(mol_ids).reshape(-1,1)
task_ids = np.array(task_ids).reshape(-1,1)
labels = np.array(labels).reshape(-1,1)
np.save(training_dir + 'mol_ids.npy', mol_ids)
np.save(training_dir + 'task_ids.npy', task_ids)
np.save(training_dir + 'labels.npy', labels)

# Dicts: every pickle file is opened in a `with` block so its handle is
# closed (and the data flushed) as soon as the dump is done.
for pickled_object, file_name in (
    (tasks_train_id_dict, 'dict_task_names_id.pkl'),
    (train_smiles_molId_dict, 'dict_mol_smiles_id.pkl'),
    (task_actives, 'dict_task_id_activeMolecules.pkl'),
    (task_inactives, 'dict_task_id_inactiveMolecules.pkl'),
):
    with open(training_dir + file_name, 'wb') as pickle_file:
        dump(pickled_object, pickle_file)

# Validation set

In [12]:
# Re-open the dataset and get an iterable over the tasks of the VALIDATION fold.
dataset = FSMolDataset.from_directory(FS_MOL_DATASET_PATH)
task_iterable = dataset.get_task_reading_iterable(DataFold.VALIDATION)

In [13]:
# Names of all validation tasks, in iteration order.
tasks = [fold_task.name for fold_task in task_iterable]

# Task name -> integer task id (index of the name in `tasks`).
tasks_id_dict = dict(zip(tasks, range(len(tasks))))

In [14]:
# Make mol_id by comparing canonical smiles
# Make triplet: Mol_id, Task_id, Labels
#
# Same procedure as for the training fold: ids are handed out per distinct
# SMILES string in order of first appearance, starting again from 0.

mol_ids = list()
task_ids = list()
labels = list()

smiles_molId_dict = dict()
id_counter = 0

# Per-molecule features keyed by SMILES, stored on first occurrence only.
fingerprints = dict()
descriptors = dict()

for task in iter(task_iterable):
    task_id = tasks_id_dict[task.name]

    for sample in task.samples:
        smiles = sample.smiles

        # Hash lookup on the dict; list(dict.keys()) per sample was quadratic.
        if smiles not in smiles_molId_dict:
            smiles_molId_dict[smiles] = id_counter
            id_counter += 1

        mol_ids.append(smiles_molId_dict[smiles])
        task_ids.append(task_id)
        labels.append(sample.bool_label)

        if smiles not in fingerprints:
            fingerprints[smiles] = sample.fingerprint
            descriptors[smiles] = sample.descriptors

In [15]:
# Make numpy array: mol_inputs
# Re-key the features by mol_id, stack them, and map the descriptors to
# quantiles of the training descriptors held in `descriptors_raw_forECDF`.

fingerprints_temp = {
    smiles_molId_dict[smiles]: fingerprint
    for smiles, fingerprint in fingerprints.items()
}
descriptors_temp = {
    smiles_molId_dict[smiles]: descriptor
    for smiles, descriptor in descriptors.items()
}

fingerprints = np.array([*fingerprints_temp.values()])
descriptors = np.array([*descriptors_temp.values()])

# Compute quantiles for descriptors
descriptors_quantils = np.zeros_like(descriptors)

for column in range(descriptors_raw_forECDF.shape[1]):
    reference_column = descriptors_raw_forECDF[:, column].ravel()
    fold_column = descriptors[:, column].ravel()

    # The ECDF is built from the reference column and evaluated on this fold.
    descriptors_quantils[:, column] = ECDF(reference_column)(fold_column)

mol_inputs = np.hstack((fingerprints, descriptors_quantils))

# Print the matrix shape and display its first five rows.
print(mol_inputs.shape)
mol_inputs[:5, :]

(14735, 2248)


array([[0.        , 1.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ],
       [0.        , 1.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ],
       [0.        , 0.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ],
       [0.        , 0.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ],
       [0.        , 0.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ]])

In [16]:
# Normalize mol_inputs
#
# Zero every non-finite entry (NaN, +inf and -inf) in place. np.isfinite
# replaces the former string cast of the whole matrix, which was very
# expensive and did not match '-inf'.
mol_inputs[~np.isfinite(mol_inputs)] = 0

# Load the scaler that was pickled under scaler_trainFitted.pkl; the `with`
# block closes the file handle after loading.
with open('/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/scaler_trainFitted.pkl',
          'rb') as scaler_file:
    scaler = pkl.load(scaler_file)

mol_inputs = scaler.transform(mol_inputs)

In [17]:
# Per-task lists of active and inactive molecule ids (validation fold).
triplett_ds = pd.DataFrame({'mol': mol_ids, 'task': task_ids, 'labels': labels})

task_actives = {}
task_inactives = {}

for task in np.unique(task_ids):
    rows_for_task = triplett_ds[triplett_ds['task'] == task]

    mols_active = list(rows_for_task[rows_for_task['labels'] == True]['mol'])
    mols_inactive = list(rows_for_task[rows_for_task['labels'] == False]['mol'])

    # Abort when a task lacks one of the two classes.
    if not mols_active:
        raise ValueError('Active set: Empty list!')
    if not mols_inactive:
        raise ValueError('Inactive set: Empty list!')

    task_actives[task] = mols_active
    task_inactives[task] = mols_inactive

In [18]:
# Save files
# All validation artifacts are written into the same output directory.
validation_dir = '/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/validation/'

# molecular features
np.save(validation_dir + 'mol_inputs.npy', mol_inputs)

# Triplets, each reshaped to a single column before saving
mol_ids = np.array(mol_ids).reshape(-1,1)
task_ids = np.array(task_ids).reshape(-1,1)
labels = np.array(labels).reshape(-1,1)
np.save(validation_dir + 'mol_ids.npy', mol_ids)
np.save(validation_dir + 'task_ids.npy', task_ids)
np.save(validation_dir + 'labels.npy', labels)

# Dicts: every pickle file is opened in a `with` block so its handle is
# closed (and the data flushed) as soon as the dump is done.
for pickled_object, file_name in (
    (tasks_id_dict, 'dict_task_names_id.pkl'),
    (smiles_molId_dict, 'dict_mol_smiles_id.pkl'),
    (task_actives, 'dict_task_id_activeMolecules.pkl'),
    (task_inactives, 'dict_task_id_inactiveMolecules.pkl'),
):
    with open(validation_dir + file_name, 'wb') as pickle_file:
        dump(pickled_object, pickle_file)

# Test set

In [19]:
# Re-open the dataset and get an iterable over the tasks of the TEST fold.
# NOTE: this rebinds `task_iterable`, which previously referred to the validation fold.
dataset = FSMolDataset.from_directory(FS_MOL_DATASET_PATH)
task_iterable = dataset.get_task_reading_iterable(DataFold.TEST)

In [20]:
# Names of all test tasks, in iteration order.
tasks = list(map(lambda fold_task: fold_task.name, task_iterable))

# Task name -> integer task id (index of the name in `tasks`).
tasks_id_dict = {task_name: task_id for task_id, task_name in enumerate(tasks)}

In [21]:
# Make mol_id by comparing canonical smiles
# Make triplet: Mol_id, Task_id, Labels
#
# Same procedure as for the other folds: ids are handed out per distinct
# SMILES string in order of first appearance, starting again from 0.

mol_ids = list()
task_ids = list()
labels = list()

smiles_molId_dict = dict()
id_counter = 0

# Per-molecule features keyed by SMILES, stored on first occurrence only.
fingerprints = dict()
descriptors = dict()

for task in iter(task_iterable):
    task_id = tasks_id_dict[task.name]

    for sample in task.samples:
        smiles = sample.smiles

        # Hash lookup on the dict; list(dict.keys()) per sample was quadratic.
        if smiles not in smiles_molId_dict:
            smiles_molId_dict[smiles] = id_counter
            id_counter += 1

        mol_ids.append(smiles_molId_dict[smiles])
        task_ids.append(task_id)
        labels.append(sample.bool_label)

        if smiles not in fingerprints:
            fingerprints[smiles] = sample.fingerprint
            descriptors[smiles] = sample.descriptors

In [22]:
# Make numpy array: mol_inputs
# Features are re-keyed by mol_id and stacked; descriptors are then expressed
# as quantiles with respect to `descriptors_raw_forECDF`.

fingerprints_temp = dict(
    (smiles_molId_dict[smiles], fingerprint)
    for smiles, fingerprint in fingerprints.items()
)
descriptors_temp = dict(
    (smiles_molId_dict[smiles], descriptor)
    for smiles, descriptor in descriptors.items()
)

fingerprints = np.array(list(fingerprints_temp.values()))
descriptors = np.array(list(descriptors_temp.values()))

# Compute quantiles for descriptors
descriptors_quantils = np.zeros_like(descriptors)

n_reference_columns = descriptors_raw_forECDF.shape[1]
for column in range(n_reference_columns):
    reference_values = descriptors_raw_forECDF[:, column].ravel()
    fold_values = descriptors[:, column].ravel()

    reference_ecdf = ECDF(reference_values)
    descriptors_quantils[:, column] = reference_ecdf(fold_values)

mol_inputs = np.hstack((fingerprints, descriptors_quantils))

# Print the matrix shape and display its first five rows.
print(mol_inputs.shape)
mol_inputs[:5, :]

(27518, 2248)


array([[0.        , 0.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ],
       [0.        , 0.        , 0.        , ..., 0.9312678 , 0.97377634,
        0.9443289 ],
       [0.        , 1.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ],
       [0.        , 1.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ],
       [0.        , 1.        , 0.        , ..., 0.9312678 , 0.90536237,
        0.9443289 ]])

In [23]:
# Normalize mol_inputs
#
# Zero every non-finite entry (NaN, +inf and -inf) in place. np.isfinite
# replaces the former string cast of the whole matrix, which was very
# expensive and did not match '-inf'.
mol_inputs[~np.isfinite(mol_inputs)] = 0

# Load the scaler that was pickled under scaler_trainFitted.pkl; the `with`
# block closes the file handle after loading.
with open('/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/scaler_trainFitted.pkl', 'rb') as scaler_file:
    scaler = pkl.load(scaler_file)

mol_inputs = scaler.transform(mol_inputs)

In [24]:
# Per-task lists of active and inactive molecule ids (test fold).
triplett_ds = pd.DataFrame({'mol': mol_ids, 'task': task_ids, 'labels': labels})

task_actives, task_inactives = dict(), dict()

for task in np.unique(task_ids):
    in_task = triplett_ds[triplett_ds['task'] == task]
    label_column = in_task['labels']

    actives_of_task = list(in_task.loc[label_column == True, 'mol'])
    inactives_of_task = list(in_task.loc[label_column == False, 'mol'])

    # Both classes must be present for every task.
    if len(actives_of_task) < 1:
        raise ValueError('Active set: Empty list!')
    if len(inactives_of_task) < 1:
        raise ValueError('Inactive set: Empty list!')

    task_actives[task] = actives_of_task
    task_inactives[task] = inactives_of_task

In [25]:
# Save files
# All test artifacts are written into the same output directory.
test_dir = '/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/test/'

# molecular features
np.save(test_dir + 'mol_inputs.npy', mol_inputs)

# Triplets, each reshaped to a single column before saving
mol_ids = np.array(mol_ids).reshape(-1,1)
task_ids = np.array(task_ids).reshape(-1,1)
labels = np.array(labels).reshape(-1,1)
np.save(test_dir + 'mol_ids.npy', mol_ids)
np.save(test_dir + 'task_ids.npy', task_ids)
np.save(test_dir + 'labels.npy', labels)

# Dicts: every pickle file is opened in a `with` block so its handle is
# closed (and the data flushed) as soon as the dump is done.
for pickled_object, file_name in (
    (tasks_id_dict, 'dict_task_names_id.pkl'),
    (smiles_molId_dict, 'dict_mol_smiles_id.pkl'),
    (task_actives, 'dict_task_id_activeMolecules.pkl'),
    (task_inactives, 'dict_task_id_inactiveMolecules.pkl'),
):
    with open(test_dir + file_name, 'wb') as pickle_file:
        dump(pickled_object, pickle_file)

In [26]:
# Temp: ad-hoc inspection of the saved training files (scratch cells)

In [27]:
# Read back the task-id array from the training output folder.
# NOTE: this rebinds `task_ids`, which until here held the test-fold task ids.
task_ids = np.load('/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/training/task_ids.npy')

In [28]:
# Display the reloaded task-id array.
task_ids

array([[ 699],
       [ 699],
       [ 699],
       ...,
       [4036],
       [4036],
       [4036]])

In [29]:
# Reload the training task-name -> task-id dictionary for inspection.
# NOTE(review): `pickle` is already imported as `pkl` in the first cell, so
# this mid-notebook import is redundant.
import pickle

with open('/system/user/publicdata/FS-Mol/preprocessed_usingQuantils/training/'
                               'dict_task_names_id.pkl', 'rb') as fl:
    temp = pickle.load(fl)

In [30]:
# Value stored under the last key of the dict (dicts keep insertion order).
temp[list(temp.keys())[-1]]

4937

In [31]:
# Display the whole task-name -> task-id mapping.
# NOTE(review): this dumps every entry of the dict into the cell output;
# consider showing only a small slice instead.
temp

{'CHEMBL888967': 0,
 'CHEMBL760077': 1,
 'CHEMBL4121210': 2,
 'CHEMBL3257004': 3,
 'CHEMBL946689': 4,
 'CHEMBL912145': 5,
 'CHEMBL3762432': 6,
 'CHEMBL1061933': 7,
 'CHEMBL3762500': 8,
 'CHEMBL662467': 9,
 'CHEMBL658038': 10,
 'CHEMBL3887871': 11,
 'CHEMBL1908618': 12,
 'CHEMBL1034533': 13,
 'CHEMBL3369133': 14,
 'CHEMBL3240666': 15,
 'CHEMBL2049250': 16,
 'CHEMBL2218987': 17,
 'CHEMBL3591306': 18,
 'CHEMBL744060': 19,
 'CHEMBL3215068': 20,
 'CHEMBL814293': 21,
 'CHEMBL3887026': 22,
 'CHEMBL827995': 23,
 'CHEMBL4051829': 24,
 'CHEMBL1006441': 25,
 'CHEMBL3271072': 26,
 'CHEMBL3620961': 27,
 'CHEMBL651176': 28,
 'CHEMBL815560': 29,
 'CHEMBL814407': 30,
 'CHEMBL3863230': 31,
 'CHEMBL3706148': 32,
 'CHEMBL3880707': 33,
 'CHEMBL4036688': 34,
 'CHEMBL761483': 35,
 'CHEMBL4187730': 36,
 'CHEMBL3636799': 37,
 'CHEMBL3588435': 38,
 'CHEMBL4000087': 39,
 'CHEMBL4059284': 40,
 'CHEMBL865931': 41,
 'CHEMBL900769': 42,
 'CHEMBL857155': 43,
 'CHEMBL3706381': 44,
 'CHEMBL1119323': 45,
 'CHEMBL359156