# Code Parallelization

## Data Set

In [1]:


import pandas as pd

from qsprpred.data import MoleculeTable

# Load the ligand data and reshape it to one row per SMILES with one column
# per accession, each cell holding that pair's "pchembl_value_Mean".
df = (
    pd.read_csv('../../tutorial_data/AR_LIGANDS.tsv', sep='\t')
    .pivot(index="SMILES", columns="accession", values="pchembl_value_Mean")
)
# Drop the "accession" label that pivot() leaves on the column axis.
df.columns.name = None
# Turn the SMILES index back into an ordinary column.
df = df.reset_index()
mt = MoleculeTable(name="ParallelizationExample", df=df)
# Last expression of the cell: shows how many entries the table holds.
len(mt)

6797

## Setting `nJobs` and `chunkSize`

In [2]:
from qsprpred.data.descriptors.sets import DescriptorSet
from qsprpred.data.descriptors.fingerprints import MorganFP
from qsprpred.utils.stopwatch import StopWatch


def time_desc_calc(data: MoleculeTable, desc_set: DescriptorSet):
    """Run a descriptor calculation on ``data`` and time it.

    If ``data`` reports that it already holds ``desc_set``, those descriptors
    are dropped first so that the calculation is always actually performed.
    A ``StopWatch`` is started right before ``addDescriptors`` and stopped
    right after it (the elapsed time shown in the cell output is presumably
    printed by ``StopWatch.stop()`` -- confirm against its implementation).

    Args:
        data: the molecule table to calculate descriptors for.
        desc_set: the descriptor set to (re)calculate.
    """
    already_present = data.hasDescriptors([desc_set])[0]
    if already_present:
        print(f"Removing old descriptors: {desc_set}")
        data.dropDescriptors([desc_set])
    print(f"Running and timing descriptor calculation: {desc_set}")
    timer = StopWatch()
    data.addDescriptors([desc_set])
    timer.stop()


# Baseline timing, run before any parallel setting is changed in this notebook.
time_desc_calc(mt, MorganFP(3, 2048))

Running and timing descriptor calculation: MorganFP
Time it took: 4.071595493000132


In [3]:
# Request two jobs, then display the chunk size the table reports afterwards.
mt.nJobs = 2
mt.chunkSize

3398

In [4]:
# Same check with four jobs, to see how the reported chunk size responds.
mt.nJobs = 4
mt.chunkSize

1699

In [5]:
# Override the chunk size with an explicit value.
mt.chunkSize = 100

In [6]:
# Display the chunk size after the explicit assignment.
mt.chunkSize

100

In [7]:
# Display the job count after the chunk size was set by hand.
mt.nJobs

4

In [8]:
# Assign nJobs again and re-check chunkSize to see whether the manually set
# value of 100 survives the assignment.
mt.nJobs = 2
mt.chunkSize

3398

In [9]:
# Re-time the same descriptor calculation with nJobs set to 2.
time_desc_calc(mt, MorganFP(3, 2048))

Removing old descriptors: MorganFP
Running and timing descriptor calculation: MorganFP
Time it took: 3.478728633001083


In [10]:
# Raise the job count to 12 and time the calculation again.
mt.nJobs = 12
time_desc_calc(mt, MorganFP(3, 2048))

Removing old descriptors: MorganFP
Running and timing descriptor calculation: MorganFP
Time it took: 1.551871077001124


In [11]:
# Keep the job count but force a small chunk size of 50, then time again to
# see the effect of the chunk size on the run time.
mt.chunkSize = 50
time_desc_calc(mt, MorganFP(3, 2048))

Removing old descriptors: MorganFP
Running and timing descriptor calculation: MorganFP
Time it took: 16.835393630000908


## Custom Operations

In [12]:
def processing_function(props: dict, *args, **kwargs):
    """Print the extra arguments and the first value of every property.

    Args:
        props: mapping of property name to an indexable collection of values;
            only the element at position 0 of each collection is printed.
        *args: extra positional arguments, printed as a tuple.
        **kwargs: extra keyword arguments, printed as a dict.

    Returns:
        None; the function only prints.
    """
    print(args)
    print(kwargs)
    for name, values in props.items():
        print(name, values[0])


# Use two jobs for the custom-operation examples.
mt.nJobs = 2

# Last expression of the cell: displays what apply() returns without
# iterating over it.
mt.apply(processing_function, func_args=("A",), func_kwargs={"B": None})

<generator object PandasDataTable.apply at 0x7fbc523ad460>

In [13]:
# Exhaust the generator returned by apply() and keep every item it yields.
results = list(mt.apply(processing_function, func_args=("A",), func_kwargs={"B": None}))
results

('A',)
{'B': None}
SMILES ('A',)Brc1cc(Nc2nc3c(ncnc3N3CCCC3)s2)ccc1
{'B': None}P0DMS8
 SMILES5.89 
COc1cc(-n2c(=O)n(-c3c(OC)cccc3)c3c2nc(NC2CC2)nc3)ccc1P29274
 P0DMS86.61 
nanP29275
 P29274nan 
5.29P30542
 P29275nan 
nanQSPRID
 P30542ParallelizationExample_0000 
5.9QSPRID 
('A',)ParallelizationExample_3398

{'B': None}
SMILES c1nc2c(nc(Nc3ccc(N4CCOCC4)cc3)nc2NC2CCCCCCC2)[nH]1
P0DMS8 5.56
P29274 nan
P29275 nan
P30542 nan
QSPRID ParallelizationExample_6796


[None, None, None]

In [14]:
def processing_function_df(props: pd.DataFrame):
    """Return the ``(rows, columns)`` shape of the data frame passed in."""
    n_rows, n_cols = props.shape
    return n_rows, n_cols


# With as_df=True the function receives a data frame; collect the shape it
# returns for each call.
results = [shape for shape in mt.apply(processing_function_df, as_df=True)]
results

[(3398, 6), (3398, 6), (1, 6)]

### Molecule Processors

In [15]:
from qsprpred.data.processing.mol_processor import MolProcessor
from rdkit.Chem import Mol
from typing import Any


class MyProcessor(MolProcessor):
    """Example processor reporting what it receives on each call."""

    def __call__(self, mols: list[str | Mol], props: dict[str, list[Any]], *args,
                 **kwargs) -> Any:
        """Describe the first molecule and the property names of one call.

        Returns:
            A flat tuple: the first item of ``mols``, its type, followed by
            every key of ``props``.
        """
        first = mols[0]
        return (first, type(first)) + tuple(props.keys())

    @property
    def supportsParallel(self) -> bool:
        """Always ``True``: this processor declares parallel support."""
        return True


# Run the processor over the table and keep everything it yields.
results = []
results.extend(mt.processMols(MyProcessor()))
results

[('Brc1cc(Nc2nc3c(ncnc3N3CCCC3)s2)ccc1',
  str,
  'SMILES',
  'P0DMS8',
  'P29274',
  'P29275',
  'P30542',
  'QSPRID'),
 ('COc1cc(-n2c(=O)n(-c3c(OC)cccc3)c3c2nc(NC2CC2)nc3)ccc1',
  str,
  'SMILES',
  'P0DMS8',
  'P29274',
  'P29275',
  'P30542',
  'QSPRID'),
 ('c1nc2c(nc(Nc3ccc(N4CCOCC4)cc3)nc2NC2CCCCCCC2)[nH]1',
  str,
  'SMILES',
  'P0DMS8',
  'P29274',
  'P29275',
  'P30542',
  'QSPRID')]

In [16]:
# Same processor, this time requesting RDKit molecules via as_rdkit=True.
results = list(mt.processMols(MyProcessor(), as_rdkit=True))
results

[(<rdkit.Chem.rdchem.Mol at 0x7fbc520fac50>,
  rdkit.Chem.rdchem.Mol,
  'SMILES',
  'P0DMS8',
  'P29274',
  'P29275',
  'P30542',
  'QSPRID'),
 (<rdkit.Chem.rdchem.Mol at 0x7fbc522549a0>,
  rdkit.Chem.rdchem.Mol,
  'SMILES',
  'P0DMS8',
  'P29274',
  'P29275',
  'P30542',
  'QSPRID'),
 (<rdkit.Chem.rdchem.Mol at 0x7fbc52255300>,
  rdkit.Chem.rdchem.Mol,
  'SMILES',
  'P0DMS8',
  'P29274',
  'P29275',
  'P30542',
  'QSPRID')]

In [17]:
from qsprpred.data.processing.mol_processor import MolProcessorWithID


class MyProcessorWithID(MolProcessorWithID):
    """Example processor that also looks up the identifier of a molecule."""

    def __call__(self, mols: list[str | Mol], props: dict[str, list[Any]], *args,
                 **kwargs) -> Any:
        """Describe the first molecule of one call together with its ID.

        Returns:
            A 3-tuple: the first item of ``mols``, its type, and the first
            value stored in ``props`` under the key ``self.idProp``.
        """
        first = mols[0]
        first_id = props[self.idProp][0]
        return first, type(first), first_id

    @property
    def supportsParallel(self) -> bool:
        """Always ``True``: this processor declares parallel support."""
        return True


# Run the ID-aware processor on RDKit molecules and keep everything it yields.
results = [*mt.processMols(MyProcessorWithID(), as_rdkit=True)]
results

[(<rdkit.Chem.rdchem.Mol at 0x7fbc52256250>,
  rdkit.Chem.rdchem.Mol,
  'ParallelizationExample_3398'),
 (<rdkit.Chem.rdchem.Mol at 0x7fbc522576f0>,
  rdkit.Chem.rdchem.Mol,
  'ParallelizationExample_0000'),
 (<rdkit.Chem.rdchem.Mol at 0x7fbc52255ad0>,
  rdkit.Chem.rdchem.Mol,
  'ParallelizationExample_6796')]