In [1]:
# Install RDKit, MRL and CatBoost, the third-party libraries used by this notebook.
!pip install rdkit
!pip install mrl-pypi
!pip install catboost

# NOTE(review): installing mrl-pypi also pulls in the separate `rdkit-pypi` wheel (see the
# install log below); presumably rdkit is removed and reinstalled here so that the newer
# `rdkit` build is the one that gets imported — confirm.
!pip uninstall -y rdkit
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5
Collecting mrl-pypi
  Downloading mrl_pypi-0.1.5-py3-none-any.whl (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.9/109.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting selfies>=2.0.0 (from mrl-pypi)
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Collecting rdkit-pypi (from mrl-pypi)
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.8.0->mrl-pypi)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manyl

In [None]:
from rdkit.Chem import Descriptors
from catboost import CatBoostRegressor
from rdkit.Chem.rdFingerprintGenerator import GetRDKitFPGenerator
from rdkit.DataStructs import TanimotoSimilarity
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Contrib.SA_Score import sascorer

from mrl.combichem import *
from mrl.imports import *
from mrl.core import *
from mrl.chem import *
from mrl.templates.all import *
from mrl.torch_imports import *
from mrl.torch_core import *
from mrl.layers import *
from mrl.dataloaders import *
from mrl.g_models.all import *
from mrl.vocab import *
from mrl.policy_gradient import *
from mrl.train.all import *
from mrl.model_zoo import *

import pandas as pd
import numpy as np

## Фильтры

In [3]:
class NumAtomFilter():
    """Filter molecules by elemental composition and heteroatom count.

    A molecule passes when it
      * contains only carbon, hydrogen and the elements listed in ``atoms``,
      * contains at least two different elements, and
      * has at most ``n`` atoms, in total, of the elements listed in ``atoms``.

    Args:
        atoms: element symbols of the permitted heteroatoms.
        n: maximum total number of those heteroatoms in one molecule.
    """

    def __init__(self, atoms=['O', 'N', 'P', 'S'], n=12):
        # Copy the argument so the shared default list is never aliased by an instance.
        self.atoms = list(atoms)
        self.n = n
        self.name = "NumAtomFilter"

    def __call__(self, mols, with_score=False):
        """Apply :meth:`check` to ``mols`` after converting them with ``to_mols``.

        ``with_score`` is accepted but not used.
        """
        return maybe_parallel(self.check, to_mols(mols))

    def check(self, mol):
        """Return ``True`` if ``mol`` satisfies the composition rules, else ``False``.

        ``None`` (a molecule that could not be built) is rejected instead of
        raising an ``AttributeError``.
        """
        if mol is None:
            return False

        symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
        hetero = set(self.atoms)
        allowed = hetero | {'C', 'H'}

        # Any element outside C, H and the configured heteroatoms rejects the molecule.
        if any(symbol not in allowed for symbol in symbols):
            return False

        # Require at least two different elements among the atoms present.
        if len(set(symbols)) < 2:
            return False

        # Cap the total number of heteroatoms at the configured limit.
        hetero_count = sum(1 for symbol in symbols if symbol in hetero)
        if hetero_count > self.n:
            return False

        return True


class HeavyAtomFilter():
    """Pass molecules whose heavy-atom molecular weight does not exceed ``n``.

    Args:
        n: upper bound compared against ``Descriptors.HeavyAtomMolWt``.
    """

    def __init__(self, n=500):
        self.name = 'wt'
        self.n = n

    def __call__(self, mols, with_score=False):
        """Convert ``mols`` with ``to_mols`` and run :meth:`check` on the result.

        ``with_score`` is accepted but not used.
        """
        converted = to_mols(mols)
        return maybe_parallel(self.check, converted)

    def check(self, mol):
        """Return ``False`` when the heavy-atom weight of ``mol`` is above ``self.n``."""
        weight = Descriptors.HeavyAtomMolWt(mol)
        too_heavy = weight > self.n
        return not too_heavy


class SorcerFilter():
    """Pass molecules whose ``sascorer.calculateScore`` value is strictly below ``n``.

    Args:
        n: threshold; a score greater than or equal to it rejects the molecule.
    """

    def __init__(self, n=5):
        self.name = 'sas'
        self.n = n

    def __call__(self, mols, with_score=False):
        """Convert ``mols`` with ``to_mols`` and run :meth:`check` on the result.

        ``with_score`` is accepted but not used.
        """
        converted = to_mols(mols)
        return maybe_parallel(self.check, converted)

    def check(self, mol):
        """Return ``False`` when the score of ``mol`` reaches or exceeds ``self.n``."""
        score = sascorer.calculateScore(mol)
        rejected = score >= self.n
        return not rejected

class CatBoostFilter():
    """Score molecules with a saved CatBoost regressor over RDKit descriptor values.

    Args:
        model_path: path passed to ``CatBoostRegressor.load_model``.
    """

    def __init__(self, model_path):
        regressor = CatBoostRegressor()
        regressor.load_model(model_path)
        self.model = regressor
        self.name = 'cb'
        # Keep only the callable (second element) of every entry in Descriptors._descList.
        self.descriptions = [entry[1] for entry in Descriptors._descList]
        self.mols = []

    def __call__(self, mols, with_score=False):
        """Return the model predictions for ``mols`` converted with ``.tolist()``.

        ``with_score`` is accepted but not used.
        """
        features = self.prepare(mols)
        predictions = self.model.predict(features)
        return predictions.tolist()

    def f(self, x):
        """Evaluate the descriptor function ``x`` on the molecules stored in ``self.mols``."""
        return maybe_parallel(x, self.mols)

    def prepare(self, mols):
        """Build the feature array for ``mols``.

        The molecules are stored on ``self.mols`` so that :meth:`f` can reach them.
        Every descriptor is evaluated on them, the per-descriptor results are stacked
        and the stack is transposed (for a list of molecules this gives one row per
        molecule and one column per descriptor).
        """
        self.mols = to_mols(mols)
        per_descriptor = maybe_parallel(self.f, self.descriptions)
        stacked = np.stack(per_descriptor)
        return stacked.T

class SumTanimotoSimilarityPenalty():
    """Diversity penalty based on Tanimoto similarity within a batch.

    For every molecule of a batch the score is the negated mean Tanimoto
    similarity between its RDKit fingerprint and the fingerprints of all
    molecules in the same batch (itself included). Molecules that could not be
    built (``None``) receive ``-10`` and contribute ``0`` to the means of the
    other molecules.
    """

    def __init__(self):
        self.fpgen = GetRDKitFPGenerator()
        self.mols = []
        self.name = "SumTanimotoSimilarityPenalty"

    def __call__(self, mols, with_score=False):
        """Score a batch of molecules.

        Args:
            mols: molecules in any form accepted by ``to_mols``.
            with_score: accepted but not used.

        Returns:
            ``0`` when ``to_mols`` does not return a list (a single molecule has
            no batch to compare against), otherwise a list with one score per
            input molecule.
        """
        mols = to_mols(mols)

        # Decide on the scalar case before touching the fingerprint generator,
        # so a single input neither overwrites self.mols nor gets fingerprinted.
        if not isinstance(mols, list):
            return 0

        # Fingerprint only valid molecules; None is kept as a placeholder so that
        # `check` can penalise it instead of the generator raising on it.
        self.mols = maybe_parallel(self._fingerprint, mols)
        return maybe_parallel(self.check, self.mols)

    def _fingerprint(self, mol):
        """Return the fingerprint of ``mol``, or ``None`` when ``mol`` is ``None``."""
        if mol is None:
            return None
        return self.fpgen.GetFingerprint(mol)

    def check(self, mol):
        """Return the penalty for one fingerprint against the stored batch.

        Args:
            mol: a fingerprint from ``self.mols`` or ``None``.

        Returns:
            ``-10`` for ``None``; otherwise minus the mean similarity to every
            entry of ``self.mols`` (missing entries count as ``0``).
        """
        if mol is None:
            return -10

        similarities = []
        for other in self.mols:
            # None marks a missing fingerprint; 0 is kept as a second "missing"
            # sentinel for compatibility with the previous implementation.
            if other is None or other == 0:
                similarities.append(0)
            else:
                similarities.append(TanimotoSimilarity(mol, other))
        return -np.mean(similarities)

## Алгоритм

Основная идея: взять каркасы молекул с самым хорошим lgK и менять их ответвления местами, получая новые молекулы.

In [6]:
# Load the starting molecules; the 'smiles' column of this table is the input of the algorithm below.
df = pd.read_csv("start.csv")

In [16]:
class PermutationsAlgorithm:
    """Generate new molecules by swapping side chains between starting molecules.

    Every starting molecule is split into its Murcko scaffold and the side
    chains attached to it. The side chains of all molecules form a common pool
    ("radicals"); new molecules are produced by replacing a side chain of a
    starting molecule with a randomly chosen member of that pool.

    Attributes:
        bases: starting molecules for which side chains could be extracted.
        bases_change: for each entry of ``bases``, the SMILES of its side chains
            with the attachment-point dummy label removed.
        radicals: pool of unique, parsable side chains as molecules.
    """

    def __init__(self, start_mols):
        """Extract scaffolds and side chains from ``start_mols``.

        Args:
            start_mols: iterable of molecules in any form accepted by ``to_mols``.
                Entries that ``to_mols`` turns into ``None`` are skipped.
        """
        start_mols = to_mols(start_mols)
        radicals = []
        bases_change = []
        bases = []

        for mol in start_mols:
            # Skip inputs that could not be converted instead of crashing in RDKit.
            if mol is None:
                continue

            core = MurckoScaffold.GetScaffoldForMol(mol)  # scaffold (core) of the molecule
            # Side chains attached to the core; ReplaceCore is reached through
            # `Chem` directly, like ReplaceSubstructs in `mutate`, so the code
            # does not depend on rdkit.Chem.AllChem having been imported.
            side_chains = Chem.ReplaceCore(mol, core)
            if side_chains is None:
                continue

            rad = maybe_parallel(
                self._strip_dummy_label,
                to_smiles(Chem.GetMolFrags(side_chains, asMols=True)),
            )
            bases_change.append(rad)
            bases.append(mol)
            radicals += rad

        # Sort the de-duplicated SMILES so the pool order (and therefore what
        # random.choice picks for a given random state) is stable between runs.
        radicals = to_mols(sorted(set(radicals)))

        self.bases_change = bases_change
        self.bases = bases
        self.radicals = [radical for radical in radicals if radical is not None]

    @staticmethod
    def _strip_dummy_label(smiles):
        """Remove attachment-point dummy atoms such as ``[1*]`` from a SMILES string.

        Only the dummy-atom tokens are removed; other bracket atoms
        (e.g. ``[O-]``) are left untouched.
        """
        import re  # local import: keeps this block independent of the notebook's import cell
        return re.sub(r'\[\d*\*\]', '', smiles)

    def mutate(self, index, k):
        """Create variants of ``self.bases[index]`` by replacing its side chains.

        Args:
            index: position of the base molecule in ``self.bases``.
            k: number of random replacements generated per side chain.

        Returns:
            List of the molecules returned by ``Chem.ReplaceSubstructs``
            (not sanitized). Empty when the radical pool is empty.
        """
        result = []
        # random.choice raises IndexError on an empty pool; nothing to substitute then.
        if not self.radicals:
            return result

        base = self.bases[index]
        for ch in self.bases_change[index]:
            chm = to_mols(ch)
            if chm is None:
                continue
            for _ in range(k):
                replacement = random.choice(self.radicals)
                result.append(
                    Chem.ReplaceSubstructs(base, chm, replacement, replaceAll=True)[0]
                )
        return result

    def process(self, k):
        """Run :meth:`mutate` for every base molecule and keep the valid results.

        Args:
            k: number of random replacements per side chain (see :meth:`mutate`).

        Returns:
            List of generated molecules that pass ``Chem.SanitizeMol``.
        """
        out = []
        for i in range(len(self.bases)):
            out += self.mutate(i, k)

        filtered = []
        for candidate in out:
            if candidate is None:
                continue
            try:
                Chem.SanitizeMol(candidate)
            except Exception:
                # Chemically invalid product: drop it. Unlike a bare `except`,
                # KeyboardInterrupt/SystemExit are not swallowed here.
                continue
            filtered.append(candidate)

        return filtered

In [17]:
# Build the algorithm state (base molecules and side-chain pool) from the starting SMILES.
alg = PermutationsAlgorithm(df['smiles'].tolist())

In [18]:
# Generate candidates: 10 random substitutions per side chain of every base molecule.
# NOTE(review): the substitutions are drawn with `random.choice` and no seed is set in this
# notebook, so the result presumably differs between runs — confirm.
out = alg.process(10)

Данный блокнот не закончен, а алгоритм не полностью проверен на ошибки (на самом деле его, скорее всего, нужно полностью переписать). Выше представлена первая реализация, которая непосредственно использовалась во время соревнования.

Выход алгоритма также обрабатывался с помощью функций, представленных в блокноте Isomers.ipynb, а затем ранжировался по lgK