# Adding New Descriptors

You can customize existing descriptors or add new ones. You just need to adhere to the `DescriptorSet` interface:

In [1]:
from typing import Any

import numpy as np
from rdkit.Chem import Mol

from qsprpred.data.descriptors.sets import DescriptorSet


class MyDescriptor(DescriptorSet):
    """Custom descriptor set that counts how often the given atom types occur.

    Every descriptor is an element symbol (e.g. ``"C"`` or ``"Cl"``); its value
    for a molecule is the number of atoms in that molecule with that symbol.
    """

    def __init__(
            self,
            atom_types: list[str] = ('C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I')
    ):
        """Initialize the descriptor set.

        Args:
            atom_types: element symbols to count, one descriptor column per symbol.
        """
        super().__init__()
        # Store a list copy so the attribute matches its annotation even when
        # the (immutable) tuple default or another sequence type is passed in.
        self.atom_types = list(atom_types)

    @property
    def descriptors(self):
        """Returns the descriptors provided by this descriptor set (by the `self.getDescriptors` method).
        Make sure that the order of this list always corresponds to the order of the columns of the
        numpy array returned by `self.getDescriptors`.
        """
        return self.atom_types

    @descriptors.setter
    def descriptors(self, types: list[str]):
        """Sets what descriptors to calculate. It is also used during
        feature selection to instruct the descriptor set about the selected features.
        """
        self.atom_types = list(types)

    def __str__(self):
        """This is how our descriptor set will be called, try to choose a unique name."""
        return "AtomTypeCounter"

    def getDescriptors(self, mols: list[Mol], props: dict[str, list[Any]], *args,
                       **kwargs) -> np.ndarray:
        """Counts the atoms of each type listed in `self.atom_types` for every molecule.

        Args:
            mols: molecules to calculate the descriptors for.
            props: not used by this descriptor set.
            *args: not used by this descriptor set.
            **kwargs: not used by this descriptor set.

        Returns:
            Array of shape `(len(mols), len(self.atom_types))`. Row `i` holds the
            counts for `mols[i]`; columns follow the order of `self.atom_types`.
        """
        # Build the symbol -> column lookup once instead of scanning the list
        # for every atom. `setdefault` keeps the first column when a symbol is
        # listed more than once, which is what `list.index` would return.
        column_of = {}
        for column, symbol in enumerate(self.atom_types):
            column_of.setdefault(symbol, column)

        ret = np.zeros((len(mols), len(self.atom_types)))
        # `enumerate` gives every molecule its own row. Looking the row up with
        # `mols.index(mol)` is a linear scan per atom and returns the first
        # match, so a molecule object present twice would be counted into a
        # single row while the other row stays zero.
        for row, mol in enumerate(mols):
            for atom in mol.GetAtoms():
                column = column_of.get(atom.GetSymbol())
                if column is not None:
                    # Increment the counter for the atom type
                    ret[row, column] += 1
        return ret

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


Now we can use this descriptor set like any other:

In [2]:
import os
from qsprpred.data import MoleculeTable
import pandas as pd

# Read the tab-separated ligand table used throughout this tutorial
df = pd.read_csv('../../tutorial_data/A2A_LIGANDS.tsv', sep='\t')

# Wrap the table in a MoleculeTable; the job count is set to the number of
# available CPUs so that calculations can be distributed over them
dataset = MoleculeTable(
    name="QuickStartDataset",
    df=df,
    store_dir="tutorial_output/data",
    n_jobs=os.cpu_count(),
    random_state=42,
)

# Attach the custom descriptor set, then display the calculated descriptors
atom_type_counter = MyDescriptor()
dataset.addDescriptors([atom_type_counter])
dataset.getDescriptors()

Unnamed: 0_level_0,C,N,O,F,P,S,Cl,Br,I
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
QuickStartDataset_0000,19.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
QuickStartDataset_0001,18.0,4.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0
QuickStartDataset_0002,21.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
QuickStartDataset_0003,28.0,6.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0
QuickStartDataset_0004,24.0,5.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
QuickStartDataset_4077,19.0,6.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
QuickStartDataset_4078,18.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
QuickStartDataset_4079,12.0,8.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
QuickStartDataset_4080,20.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
