## Data parsing/preprocessing

In [1]:
important_fields = [
 'activity_id',
 'assay_chembl_id',
 'bao_endpoint',
 'bao_format',
 'bao_label',
 'canonical_smiles',
 'molecule_chembl_id',
 'parent_molecule_chembl_id',
 'pchembl_value',
 'potential_duplicate',
 'relation',
 'standard_flag',
 'standard_relation',
 'standard_text_value',
 'standard_type',
 'standard_units',
 'standard_upper_value',
 'standard_value',
 'target_chembl_id',
 'target_organism',
 'target_pref_name',
 'target_tax_id',
]

In [2]:
!mkdir /content/logs

In [3]:
import logging
logging.basicConfig(
    filename="/content/logs/activities_chembl_parse.log",
    level=logging.INFO,
    force = True,
    format="%(asctime)s [%(levelname)s] %(message)s"
)

In [4]:
import requests
import time
from pprint import pprint
from random import uniform


headers = {'Accept': 'application/json'}
target_id = "CHEMBL1741208"
limit = 100



def extract_features(activities):
  extract = lambda x: {key:x[key] for key in important_fields}
  objs = []
  for act in activities:
    objs.append(extract(act))
  return objs


def parse_activities_data(target_id, limit, headers):
  rows = []
  offset = 0
  logging.info('...Start...')
  while True:
    try:
      url = f'https://www.ebi.ac.uk/chembl/api/data/activity?target_chembl_id={target_id}&limit={limit}&offset={offset}'
      req = requests.get(url, headers=headers)
      if not req.json()['activities']:
        break
      rows.extend(extract_features(req.json()['activities']))
      logging.info(f'parsed {len(rows)} objects')
      offset += limit
    except Exception as ex:
      logging.info(f'parsing error: request code - {req.status_code}, exception: {ex.message}')

  return rows

rows = parse_activities_data(target_id, limit, headers)

In [5]:
len(rows)

1397

In [10]:
!mkdir /content/drive/MyDrive/datacon2025/main/nlrp

In [7]:
df_dict = {key: [] for key in rows[0].keys()}
for obj in rows:
  for key in df_dict.keys():
    df_dict[key].append(obj[key])

In [8]:
import pandas as pd
df = pd.DataFrame(df_dict)

In [9]:
df.head()

Unnamed: 0,activity_id,assay_chembl_id,bao_endpoint,bao_format,bao_label,canonical_smiles,molecule_chembl_id,parent_molecule_chembl_id,pchembl_value,potential_duplicate,...,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id
0,5539969,CHEMBL1737963,BAO_0000190,BAO_0000019,assay format,COc1ccc(NC(=O)C#Cc2ccccc2)cc1,CHEMBL1336269,CHEMBL1336269,5.1,0,...,=,,IC50,nM,,8010.0,CHEMBL1741208,Homo sapiens,"NACHT, LRR and PYD domains-containing protein 3",9606
1,5539970,CHEMBL1737963,BAO_0000190,BAO_0000019,assay format,COc1ccc(-c2cc(=O)c3ccccc3o2)cc1,CHEMBL16312,CHEMBL16312,4.51,0,...,=,,IC50,nM,,31000.0,CHEMBL1741208,Homo sapiens,"NACHT, LRR and PYD domains-containing protein 3",9606
2,5539971,CHEMBL1737963,BAO_0000190,BAO_0000019,assay format,Br.CCCCCCCCCn1c2c(c(=N)c3c1CCC3)CCC2,CHEMBL1596681,CHEMBL1625618,5.04,0,...,=,,IC50,nM,,9020.0,CHEMBL1741208,Homo sapiens,"NACHT, LRR and PYD domains-containing protein 3",9606
3,5539972,CHEMBL1737963,BAO_0000190,BAO_0000019,assay format,CCCC1CCC(c2ccc(OCC)cc2)=CC1=O,CHEMBL1447078,CHEMBL1447078,4.27,0,...,=,,IC50,nM,,53900.0,CHEMBL1741208,Homo sapiens,"NACHT, LRR and PYD domains-containing protein 3",9606
4,5539973,CHEMBL1737963,BAO_0000190,BAO_0000019,assay format,Cc1nc2ccc(Nc3nc4cc(S(=O)(=O)N5CCOCC5)ccc4o3)cc2s1,CHEMBL1531200,CHEMBL1531200,,0,...,>,,IC50,nM,,100000.0,CHEMBL1741208,Homo sapiens,"NACHT, LRR and PYD domains-containing protein 3",9606


In [11]:
df.to_csv('/content/drive/MyDrive/datacon2025/main/nlrp/nlrp_raw_df.csvb')

In [12]:
df['standard_type'].value_counts()

Unnamed: 0_level_0,count
standard_type,Unnamed: 1_level_1
IC50,843
Inhibition,393
EC50,62
Activity,31
AC50,20
Kd,18
Ratio IC50,10
IC90,9
IC70,7
K,4


In [13]:
ic50_df = df[df['standard_type'] == 'IC50']
ic50_df.shape

(843, 22)

In [14]:
ic50_df['target_organism'].value_counts()

Unnamed: 0_level_0,count
target_organism,Unnamed: 1_level_1
Homo sapiens,843


In [15]:
ic50_df = ic50_df.drop(['target_organism'], axis = 1)


In [16]:
ic50_df['target_chembl_id'].value_counts()

Unnamed: 0_level_0,count
target_chembl_id,Unnamed: 1_level_1
CHEMBL1741208,843


In [17]:
ic50_df = ic50_df.drop(['target_chembl_id', 'target_pref_name', 'target_tax_id'], axis = 1)

In [18]:
ic50_df['standard_relation'].value_counts()

Unnamed: 0_level_0,count
standard_relation,Unnamed: 1_level_1
=,613
>,182
<=,25
<,21


In [19]:
ic50_df = ic50_df[ic50_df['standard_relation'] == '=']


In [20]:
ic50_df = ic50_df.drop(['relation', 'standard_relation'], axis = 1)


In [21]:
ic50_df['bao_label'].value_counts()

Unnamed: 0_level_0,count
bao_label,Unnamed: 1_level_1
cell-based format,364
assay format,161
tissue-based format,71
single protein format,17


In [None]:
#ic50_df = ic50_df[ic50_df['bao_label'] == 'protein complex format']
#ic50_df.shape

(1156, 16)

In [22]:
ic50_df = ic50_df.drop(['bao_label', 'bao_format', 'bao_endpoint'], axis = 1)
ic50_df.shape

(613, 13)

In [23]:
ic50_df['standard_units'].value_counts()

Unnamed: 0_level_0,count
standard_units,Unnamed: 1_level_1
nM,613


In [24]:
ic50_df = ic50_df.drop(['standard_upper_value', 'standard_text_value', 'standard_flag'], axis = 1)
ic50_df.columns

Index(['activity_id', 'assay_chembl_id', 'canonical_smiles',
       'molecule_chembl_id', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'standard_type', 'standard_units',
       'standard_value'],
      dtype='object')

In [25]:
ic50_df['molecule_chembl_id'].value_counts()

Unnamed: 0_level_0,count
molecule_chembl_id,Unnamed: 1_level_1
CHEMBL3183703,14
CHEMBL5219789,11
CHEMBL4780624,6
CHEMBL4752523,4
CHEMBL4750616,3
...,...
CHEMBL5180066,1
CHEMBL3956814,1
CHEMBL5187589,1
CHEMBL5207889,1


In [26]:
ic50_df['standard_value'] = ic50_df['standard_value'].astype(float)
ic50_df['pchembl_value'] = ic50_df['pchembl_value'].astype(float)

In [27]:
final_df = ic50_df.groupby('molecule_chembl_id').agg({
    'canonical_smiles': 'first',
    'standard_value': 'median',
    'pchembl_value': 'median'
})

final_df

Unnamed: 0_level_0,canonical_smiles,standard_value,pchembl_value
molecule_chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CHEMBL1076347,O=C(Nc1ccc(Cl)cc1)Nc1ccc(Cl)c(Cl)c1,1147.5,5.960
CHEMBL1256359,CC(N)C(=O)OC(C)(C)Cc1ccc(Cl)cc1.Cl,6160.0,5.235
CHEMBL129795,O=C(/C=C/c1ccc(O)cc1)c1ccc(O)cc1O,5820.0,5.405
CHEMBL1313152,c1ccc(OCc2nnnn2-c2ccccc2)cc1,6000.0,5.220
CHEMBL1329507,O=c1c2ccc(Cl)cc2nc(-c2cccc(C(F)(F)F)c2)n1O,6835.0,5.265
...,...,...,...
CHEMBL5440398,CC(C)c1nn(Cc2nnc(C3CCOC3)o2)c(=O)c2cc3sc(Cl)cc...,370.0,6.430
CHEMBL5440660,Cc1ccc(C(F)(F)F)cc1Nc1nc(C(=O)NS(=O)(=O)N2CCOC...,4019.0,6.260
CHEMBL5440833,Cc1ccc(C2CCC2)cc1Nc1nc(C(=O)NS(=O)(=O)N(C)C)co1,310.0,6.510
CHEMBL601543,CCN1/C(=C/C=C/c2oc3ccc(-c4ccccc4)cc3[n+]2CC)Oc...,9000.0,5.045


In [28]:
import numpy as np

def pchembl_from_nM(nM_value):
    molar = nM_value * 1e-9
    return -np.log10(molar)

In [29]:
final_df['log_ic50(pchembl)'] = pchembl_from_nM(final_df['standard_value'])
final_df

Unnamed: 0_level_0,canonical_smiles,standard_value,pchembl_value,log_ic50(pchembl)
molecule_chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CHEMBL1076347,O=C(Nc1ccc(Cl)cc1)Nc1ccc(Cl)c(Cl)c1,1147.5,5.960,5.940247
CHEMBL1256359,CC(N)C(=O)OC(C)(C)Cc1ccc(Cl)cc1.Cl,6160.0,5.235,5.210419
CHEMBL129795,O=C(/C=C/c1ccc(O)cc1)c1ccc(O)cc1O,5820.0,5.405,5.235077
CHEMBL1313152,c1ccc(OCc2nnnn2-c2ccccc2)cc1,6000.0,5.220,5.221849
CHEMBL1329507,O=c1c2ccc(Cl)cc2nc(-c2cccc(C(F)(F)F)c2)n1O,6835.0,5.265,5.165261
...,...,...,...,...
CHEMBL5440398,CC(C)c1nn(Cc2nnc(C3CCOC3)o2)c(=O)c2cc3sc(Cl)cc...,370.0,6.430,6.431798
CHEMBL5440660,Cc1ccc(C(F)(F)F)cc1Nc1nc(C(=O)NS(=O)(=O)N2CCOC...,4019.0,6.260,5.395882
CHEMBL5440833,Cc1ccc(C2CCC2)cc1Nc1nc(C(=O)NS(=O)(=O)N(C)C)co1,310.0,6.510,6.508638
CHEMBL601543,CCN1/C(=C/C=C/c2oc3ccc(-c4ccccc4)cc3[n+]2CC)Oc...,9000.0,5.045,5.045757


In [30]:
final_df = final_df.drop(['standard_value', 'pchembl_value'], axis = 1)
final_df

Unnamed: 0_level_0,canonical_smiles,log_ic50(pchembl)
molecule_chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
CHEMBL1076347,O=C(Nc1ccc(Cl)cc1)Nc1ccc(Cl)c(Cl)c1,5.940247
CHEMBL1256359,CC(N)C(=O)OC(C)(C)Cc1ccc(Cl)cc1.Cl,5.210419
CHEMBL129795,O=C(/C=C/c1ccc(O)cc1)c1ccc(O)cc1O,5.235077
CHEMBL1313152,c1ccc(OCc2nnnn2-c2ccccc2)cc1,5.221849
CHEMBL1329507,O=c1c2ccc(Cl)cc2nc(-c2cccc(C(F)(F)F)c2)n1O,5.165261
...,...,...
CHEMBL5440398,CC(C)c1nn(Cc2nnc(C3CCOC3)o2)c(=O)c2cc3sc(Cl)cc...,6.431798
CHEMBL5440660,Cc1ccc(C(F)(F)F)cc1Nc1nc(C(=O)NS(=O)(=O)N2CCOC...,5.395882
CHEMBL5440833,Cc1ccc(C2CCC2)cc1Nc1nc(C(=O)NS(=O)(=O)N(C)C)co1,6.508638
CHEMBL601543,CCN1/C(=C/C=C/c2oc3ccc(-c4ccccc4)cc3[n+]2CC)Oc...,5.045757


In [31]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [32]:
from rdkit import Chem

def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None

final_df["canonical_smiles"].apply(is_valid_smiles).sum() == final_df.shape[0]

np.True_

In [33]:
final_df

Unnamed: 0_level_0,canonical_smiles,log_ic50(pchembl)
molecule_chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
CHEMBL1076347,O=C(Nc1ccc(Cl)cc1)Nc1ccc(Cl)c(Cl)c1,5.940247
CHEMBL1256359,CC(N)C(=O)OC(C)(C)Cc1ccc(Cl)cc1.Cl,5.210419
CHEMBL129795,O=C(/C=C/c1ccc(O)cc1)c1ccc(O)cc1O,5.235077
CHEMBL1313152,c1ccc(OCc2nnnn2-c2ccccc2)cc1,5.221849
CHEMBL1329507,O=c1c2ccc(Cl)cc2nc(-c2cccc(C(F)(F)F)c2)n1O,5.165261
...,...,...
CHEMBL5440398,CC(C)c1nn(Cc2nnc(C3CCOC3)o2)c(=O)c2cc3sc(Cl)cc...,6.431798
CHEMBL5440660,Cc1ccc(C(F)(F)F)cc1Nc1nc(C(=O)NS(=O)(=O)N2CCOC...,5.395882
CHEMBL5440833,Cc1ccc(C2CCC2)cc1Nc1nc(C(=O)NS(=O)(=O)N(C)C)co1,6.508638
CHEMBL601543,CCN1/C(=C/C=C/c2oc3ccc(-c4ccccc4)cc3[n+]2CC)Oc...,5.045757


In [34]:
final_df.to_csv('/content/drive/MyDrive/datacon2025/main/nlrp/processed_data.csv')

## Feature exctraction

#### Rdkit descriptors

In [None]:
!pip install rdkit



In [35]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm.notebook import tqdm

In [36]:
descriptor_list = Descriptors.descList

descriptor_names = [name for name, func in descriptor_list]

def calc_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    values = []
    for name, func in descriptor_list:
        try:
            val = func(mol)
        except:
            val = None
        values.append(val)
    return values

desc_values = final_df["canonical_smiles"].apply(calc_descriptors)
rdkit_df = pd.DataFrame(desc_values.tolist(), columns=descriptor_names)
rdkit_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,11.731737,11.731737,0.374341,-0.374341,0.776346,10.052632,315.587,306.515,313.978046,98,...,0,0,0,0,0,0,0,0,0,1
1,11.434439,11.434439,0.000000,-0.597692,0.867971,12.500000,292.206,273.054,291.079284,102,...,0,0,0,0,0,0,0,0,0,0
2,11.839243,11.839243,0.100404,-0.369838,0.582503,10.736842,256.257,244.161,256.073559,96,...,0,0,0,0,0,0,0,0,0,0
3,5.649199,5.649199,0.325305,0.325305,0.714224,10.315789,252.277,240.181,252.101111,94,...,0,0,0,0,1,0,0,0,0,0
4,12.800455,12.800455,0.040462,-4.544894,0.681841,11.826087,340.688,332.624,340.022640,118,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,13.064597,13.064597,0.108805,-0.201044,0.503293,17.500000,419.894,401.750,419.081888,146,...,0,0,0,0,0,0,0,1,0,0
441,12.870904,12.870904,0.078307,-4.528154,0.740452,15.862069,434.396,417.260,434.087175,158,...,0,0,0,0,0,0,0,0,0,0
442,12.020118,12.020118,0.119699,-3.881866,0.800313,14.923077,378.454,356.278,378.136176,140,...,0,0,0,0,0,0,0,0,0,0
443,9.453472,9.453472,0.091435,-4.424190,0.103111,13.636364,610.732,576.460,610.213758,226,...,0,0,0,0,0,0,0,0,0,0


In [37]:
nans = rdkit_df.isna().sum()
nans[nans > 0]

Unnamed: 0,0
MaxPartialCharge,3
MinPartialCharge,3
MaxAbsPartialCharge,3
MinAbsPartialCharge,3
BCUT2D_MWHI,7
BCUT2D_MWLOW,7
BCUT2D_CHGHI,7
BCUT2D_CHGLO,7
BCUT2D_LOGPHI,7
BCUT2D_LOGPLOW,7


In [None]:
nans = rdkit_df.isna().sum().to_dict()
nan_features = []
for key in nans:
  if nans[key] >0:
    nan_features.append(key)

rdkit_df = rdkit_df.fillna(rdkit_df.median())
(rdkit_df.isna().sum() > 0).sum()

np.int64(0)

In [None]:
nans = rdkit_df.isna().sum()
nans[nans > 0]

Unnamed: 0,0


In [38]:
zero_var_cols = [c for c in rdkit_df.columns if rdkit_df[c].nunique() <= 1]
zero_var_cols

['NumRadicalElectrons',
 'SMR_VSA8',
 'SlogP_VSA9',
 'fr_SH',
 'fr_azide',
 'fr_barbitur',
 'fr_benzodiazepine',
 'fr_diazo',
 'fr_dihydropyridine',
 'fr_isocyan',
 'fr_isothiocyan',
 'fr_lactam',
 'fr_nitroso',
 'fr_phos_acid',
 'fr_phos_ester',
 'fr_prisulfonamd',
 'fr_quatN',
 'fr_term_acetylene',
 'fr_thiocyan']

In [39]:
rdkit_df = rdkit_df.drop(zero_var_cols, axis = 1)
len(rdkit_df.columns)

198

In [40]:
import numpy as np

corr_matrix = rdkit_df.corr().abs()
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)

to_drop = [
    column for column in upper.columns if any(upper[column] > 0.95)
]

to_drop

['MaxEStateIndex',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'FpDensityMorgan3',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'LabuteASA',
 'HeavyAtomCount',
 'NumHDonors',
 'Phi',
 'MolMR',
 'fr_COO2',
 'fr_C_O_noCOO',
 'fr_Nhpyrrole',
 'fr_amide',
 'fr_benzene',
 'fr_nitrile',
 'fr_nitro_arom',
 'fr_phenol_noOrthoHbond']

In [41]:
rdkit_df = rdkit_df.drop(to_drop, axis = 1)
len(rdkit_df.columns)

168

In [42]:
rdkit_df.to_csv('/content/drive/MyDrive/datacon2025/main/nlrp/rdkit_descriptors.csv')

#### MACCS fingerprints

In [43]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import numpy as np
from tqdm import tqdm

def compute_maccs_fingerprints(smiles_list):
    fps_array = []

    for smi in tqdm(smiles_list, desc="MACCS fingerprints"):
        mol = Chem.MolFromSmiles(smi)
        fp = MACCSkeys.GenMACCSKeys(mol)
        arr = np.zeros((fp.GetNumBits(),), dtype=int)
        Chem.DataStructs.ConvertToNumpyArray(fp, arr)
        fps_array.append(arr)

    fps_df = pd.DataFrame(fps_array, columns=[f"maccs_{i}" for i in range(167)])

    return fps_df


maccs_fps = compute_maccs_fingerprints(final_df["canonical_smiles"])

MACCS fingerprints: 100%|██████████| 445/445 [00:00<00:00, 612.57it/s]


In [44]:
maccs_fps.shape

(445, 167)

In [45]:
maccs_fps.head()

Unnamed: 0,maccs_0,maccs_1,maccs_2,maccs_3,maccs_4,maccs_5,maccs_6,maccs_7,maccs_8,maccs_9,...,maccs_157,maccs_158,maccs_159,maccs_160,maccs_161,maccs_162,maccs_163,maccs_164,maccs_165,maccs_166
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,1,1,1,0


In [46]:
maccs_fps.to_csv('/content/drive/MyDrive/datacon2025/main/nlrp/maccs_fingerprints.csv')

## Catboost

In [47]:
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [48]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [49]:
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [50]:
all_features = pd.concat([rdkit_df, maccs_fps], axis = 1)

In [51]:
X = all_features
y = final_df['log_ic50(pchembl)']
X.shape, y.shape

((445, 335), (445,))

In [52]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [53]:
import optuna
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import numpy as np



def objective(trial):

    params = {
        "iterations": 500,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
        "rsm": trial.suggest_float("rsm", 0.5, 1.0),
        "loss_function": "RMSE",
        "early_stopping_rounds": 30,
        "verbose": False,
        "task_type": "CPU",
        "random_seed": 42
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    r2_scores = []

    for train_idx, valid_idx in kf.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train, y_train)
        valid_pool = Pool(X_valid, y_valid)

        model = CatBoostRegressor(**params)
        model.fit(train_pool, eval_set=valid_pool)

        y_pred = model.predict(X_valid)
        r2 = r2_score(y_valid, y_pred)
        r2_scores.append(r2)

    return -np.mean(r2_scores)


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

print("Лучшие параметры:")
print(study.best_params)

print("Лучший средний R2:")
print(-study.best_value)


[I 2025-07-15 14:39:21,388] A new study created in memory with name: no-name-a11db918-23df-4310-9a8a-5012058992d5
[I 2025-07-15 14:42:07,231] Trial 0 finished with value: -0.6321601610064282 and parameters: {'learning_rate': 0.11535210216629364, 'depth': 9, 'l2_leaf_reg': 2.9076617490908396, 'bagging_temperature': 0.7719291664298465, 'border_count': 175, 'random_strength': 6.980694157147125, 'rsm': 0.8818432799873319}. Best is trial 0 with value: -0.6321601610064282.
[I 2025-07-15 14:42:48,143] Trial 1 finished with value: -0.6455329981253131 and parameters: {'learning_rate': 0.10442472615020289, 'depth': 7, 'l2_leaf_reg': 2.7982122904227045, 'bagging_temperature': 0.9525076264907741, 'border_count': 202, 'random_strength': 7.101029275821747, 'rsm': 0.7536513774794801}. Best is trial 1 with value: -0.6455329981253131.
[I 2025-07-15 14:43:16,179] Trial 2 finished with value: -0.6506724885852121 and parameters: {'learning_rate': 0.07337613639241375, 'depth': 6, 'l2_leaf_reg': 7.834450319

Лучшие параметры:
{'learning_rate': 0.042245782398298806, 'depth': 5, 'l2_leaf_reg': 2.020753233867438, 'bagging_temperature': 0.016208698976586902, 'border_count': 102, 'random_strength': 5.422447598618284, 'rsm': 0.8234386067355639}
Лучший средний R2:
0.6687404931710559


In [54]:
import json
json.dump(
    study.best_params,
    open('/content/drive/MyDrive/datacon2025/main/nlrp/catboost_best_params.json', 'w')
)

In [55]:
model = CatBoostRegressor(**study.best_params)
model.fit(X, y)

0:	learn: 1.2183380	total: 8.02ms	remaining: 8.01s
1:	learn: 1.2048604	total: 16.6ms	remaining: 8.29s
2:	learn: 1.1953294	total: 24.4ms	remaining: 8.12s
3:	learn: 1.1849027	total: 31.7ms	remaining: 7.9s
4:	learn: 1.1771594	total: 39.2ms	remaining: 7.8s
5:	learn: 1.1630296	total: 47ms	remaining: 7.78s
6:	learn: 1.1484094	total: 54.7ms	remaining: 7.76s
7:	learn: 1.1414112	total: 62.5ms	remaining: 7.75s
8:	learn: 1.1397353	total: 64.6ms	remaining: 7.12s
9:	learn: 1.1293952	total: 72.4ms	remaining: 7.17s
10:	learn: 1.1213099	total: 80.7ms	remaining: 7.26s
11:	learn: 1.1091994	total: 88.3ms	remaining: 7.27s
12:	learn: 1.0988858	total: 96.6ms	remaining: 7.33s
13:	learn: 1.0879408	total: 104ms	remaining: 7.34s
14:	learn: 1.0798265	total: 112ms	remaining: 7.32s
15:	learn: 1.0705355	total: 119ms	remaining: 7.3s
16:	learn: 1.0629284	total: 126ms	remaining: 7.28s
17:	learn: 1.0512539	total: 136ms	remaining: 7.44s
18:	learn: 1.0425827	total: 144ms	remaining: 7.43s
19:	learn: 1.0335702	total: 151ms

<catboost.core.CatBoostRegressor at 0x7ddfb93b8f10>

In [56]:
model.save_model("/content/drive/MyDrive/datacon2025/main/nlrp/catboost_nlrp_final.json", format="json")

## Checking candidates

In [107]:
from rdkit import Chem
from rdkit.Contrib.SA_Score import sascorer
from rdkit.Chem import FilterCatalog
from rdkit.Chem.FilterCatalog import FilterCatalogParams
from rdkit.Chem import Descriptors
from IPython.display import display
from rdkit.Chem import Draw

params = FilterCatalogParams()
params.AddCatalog(FilterCatalogParams.FilterCatalogs.BRENK)
catalog = FilterCatalog.FilterCatalog(params)
#df = concat_mols()


# Функция проверки по 4 критериям
def checker(df):
  dfs=df.copy()
  indexes=[]
  for index, row in dfs.iterrows():

    mol = Chem.MolFromSmiles(dfs['smiles'][index])
    MW = Descriptors.MolWt(mol)
    HBA = Descriptors.NOCount(mol)
    HBD = Descriptors.NHOHCount(mol)
    LogP = Descriptors.MolLogP(mol)
    print(Chem.QED.qed(mol))
    if (dfs['pValue'][index] < 3) or (Chem.QED.qed(mol) < 0.5) or ((sascorer.calculateScore(mol) < 6) and (sascorer.calculateScore(mol) > 2)) or (catalog.HasMatch(mol) == True) or ((MW <= 500 and LogP <= 5 and HBD <= 5 and HBA <= 10) == False):
      indexes.append(index)
  df = dfs.drop(indexes)
  #print(df)

  # for index, row in df.iterrows():
  #   mol = Chem.MolFromSmiles(dfs['Smiles'][index])
  #   img = Draw.MolToImage(mol, size=(300,300))
  #   display(img)

  return df
  # df.to_csv('/content/selected_hits.csv',
  #             #sep = ';',
  #             index = False)

In [58]:
nlrp = pd.read_csv('/content/drive/MyDrive/datacon2025/main/potential_candidates/hash_ligand_mapping_NLRP3.csv')
nlrp

Unnamed: 0,hash,smiles
0,aada5de3df097a15b722abad1edd92608344b2f2,Cc1ccc(cc1)-n1c(C)c(C2=NNC(=O)Cc3ccccc23)c2ccc...
1,b5c4d7a71ce3e5123b629cab31fb2e9e4d07ac9b,COc1ccc(cc1OC)S(=O)(=O)NC(=O)Nc1c2CCCc2cc2CCCc12
2,fcd0946433c09ce6604e8a6a2fe518fe695e2fb2,Oc1ccc2oc(=O)c3ccccc3c2c1
3,219f31f17bf870cf9c58d401351d4b5533426df6,CN(C(=O)Oc1ccc(Cl)cc1)c1ccc(cc1)S(C)(=O)=O
4,07f51c23b292bb32de34c5ee134f35bb60abced0,CC(=O)c1ccc(NC(=O)C(CC(=O)c2ccc(F)cc2)NC(=O)c2...
...,...,...
509,219df51a2ff3bf75c9bebdd3de462dbe45642914,CN(C)c1ccc(cc1)C(=O)NNC(=O)c1ccc(Cl)cc1
510,53dd7b50f1c014c6055eef205ba842cea610729c,COc1cccc(NC(=O)Nc2ccc(cc2)-c2csc(n2)N2CCCCC2)c1
511,56a54c95f9021bb17b804c9142ff32f1ebb7402e,COc1ccc(cc1)C(=O)NNC(=O)c1ccccc1Br
512,8eedd7bc848d425e46eb4ede26ca6dc2d9ffbc61,COc1ccccc1\C=C1/SC(=S)N(CCC(=O)NNC(=O)c2ccccc2...


In [65]:
nlrp = nlrp[nlrp["smiles"].apply(is_valid_smiles)]

[18:04:12] Can't kekulize mol.  Unkekulized atoms: 6 7 10 11 16 18 19 20 21
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 8 9 10
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 18 19 20 22 23 24 25 26 27
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 10 12 13 14 15 16 17
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 18
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 10 11 12 13 14
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 7 8 19 20 22 24 25
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 10 11 12 14 16 20 21
[18:04:12] Can't kekulize mol.  Unkekulized atoms: 1 2 4 5 8 23 24


In [66]:
values = nlrp["smiles"].apply(calc_descriptors)
candidates_rdkit = pd.DataFrame(values.tolist(), columns=descriptor_names)
candidates_rdkit

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.206374,12.206374,0.090393,-0.090393,0.540394,13.586207,379.463,358.295,379.168462,142,...,0,0,0,0,0,0,0,0,0,0
1,12.699768,12.699768,0.075167,-4.062925,0.780773,14.827586,416.499,392.307,416.140593,154,...,0,1,0,0,0,0,0,0,0,1
2,11.648450,11.648450,0.158798,-0.352972,0.460042,11.000000,212.204,204.140,212.047344,78,...,0,0,0,0,0,0,0,0,0,0
3,12.032106,12.032106,0.190480,-3.265358,0.859558,11.045455,339.800,325.688,339.033207,116,...,0,0,1,0,0,0,0,0,0,0
4,13.496456,13.496456,0.020516,-1.300181,0.503930,11.363636,450.441,430.281,450.139114,168,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,11.955161,11.955161,0.381834,-0.407474,0.854985,9.954545,317.776,301.648,317.093104,114,...,0,0,0,0,0,0,0,0,0,0
500,12.235558,12.235558,0.299031,-0.299031,0.588819,13.758621,408.527,384.335,408.161997,150,...,0,0,0,0,0,1,0,0,0,1
501,11.925877,11.925877,0.398762,-0.404577,0.837221,9.809524,349.184,336.080,348.010954,108,...,0,0,0,0,0,0,0,0,0,0
502,12.700294,12.700294,0.028905,-0.655738,0.347411,14.612903,457.533,438.381,457.076613,160,...,1,0,0,0,0,0,0,0,0,0


In [68]:
candidates_rdkit = candidates_rdkit[rdkit_df.columns]

In [69]:
maccs_fps = compute_maccs_fingerprints(nlrp["smiles"])

MACCS fingerprints: 100%|██████████| 504/504 [00:00<00:00, 784.78it/s]


In [70]:
candidates_features = pd.concat([candidates_rdkit, maccs_fps], axis = 1)

In [76]:
pvalues = model.predict(candidates_features)
pvalues

array([5.5756465 , 6.73918373, 5.1475951 , 5.08003345, 6.00845628,
       5.70392368, 5.34477464, 6.68088811, 5.95267599, 5.86245371,
       5.70868383, 5.70931478, 5.95714993, 4.9622699 , 7.5749747 ,
       5.78947518, 5.48286515, 5.38659826, 6.16573167, 5.65509036,
       5.21114809, 5.28354377, 5.95728072, 6.74907214, 5.04086101,
       5.45597302, 5.00576302, 5.09091532, 5.3821243 , 5.90790476,
       4.89269278, 5.95301166, 5.65773754, 5.69753051, 5.30604651,
       5.68508361, 6.11568562, 6.19085497, 5.97501843, 6.48014473,
       5.20326002, 5.65064593, 5.44939819, 6.99559995, 5.18505052,
       6.08448512, 5.3706647 , 4.92470868, 5.91529807, 5.98723208,
       7.57516717, 6.21896407, 6.21628579, 6.01217316, 5.51968939,
       5.48294865, 5.58804919, 5.48670825, 5.02648309, 5.75363752,
       5.20841679, 5.99504245, 5.51044128, 7.94011101, 6.34149521,
       6.05258653, 5.40198636, 5.5021768 , 5.57628798, 5.57338518,
       5.65257768, 5.36128347, 5.58225496, 5.57460422, 4.72976

In [74]:
nlrp['pValue'] = pvalues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nlrp['pValue'] = pvalues


In [75]:
nlrp

Unnamed: 0,hash,smiles,pValue
0,aada5de3df097a15b722abad1edd92608344b2f2,Cc1ccc(cc1)-n1c(C)c(C2=NNC(=O)Cc3ccccc23)c2ccc...,5.575647
1,b5c4d7a71ce3e5123b629cab31fb2e9e4d07ac9b,COc1ccc(cc1OC)S(=O)(=O)NC(=O)Nc1c2CCCc2cc2CCCc12,6.739184
2,fcd0946433c09ce6604e8a6a2fe518fe695e2fb2,Oc1ccc2oc(=O)c3ccccc3c2c1,5.147595
3,219f31f17bf870cf9c58d401351d4b5533426df6,CN(C(=O)Oc1ccc(Cl)cc1)c1ccc(cc1)S(C)(=O)=O,5.080033
4,07f51c23b292bb32de34c5ee134f35bb60abced0,CC(=O)c1ccc(NC(=O)C(CC(=O)c2ccc(F)cc2)NC(=O)c2...,6.008456
...,...,...,...
509,219df51a2ff3bf75c9bebdd3de462dbe45642914,CN(C)c1ccc(cc1)C(=O)NNC(=O)c1ccc(Cl)cc1,5.054702
510,53dd7b50f1c014c6055eef205ba842cea610729c,COc1cccc(NC(=O)Nc2ccc(cc2)-c2csc(n2)N2CCCCC2)c1,5.810937
511,56a54c95f9021bb17b804c9142ff32f1ebb7402e,COc1ccc(cc1)C(=O)NNC(=O)c1ccccc1Br,5.405280
512,8eedd7bc848d425e46eb4ede26ca6dc2d9ffbc61,COc1ccccc1\C=C1/SC(=S)N(CCC(=O)NNC(=O)c2ccccc2...,5.970700


#### Checking on blood brain barrier permeability

In [77]:
!git clone https://github.com/12rajnish/DeePred-BBB

Cloning into 'DeePred-BBB'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 61 (delta 5), reused 2 (delta 2), pack-reused 49 (from 1)[K
Receiving objects: 100% (61/61), 67.82 MiB | 32.54 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [92]:
nlrp[['smiles','hash']].to_csv(
    'smiles.smi',
    sep = ' ',
    header=False,
    index = False
    )

In [93]:
!python /content/DeePred-BBB/DeePred-BBB_Script.py /content/DeePred-BBB/

2025-07-15 18:36:35.102805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752604595.157271   60404 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752604595.179301   60404 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-15 18:36:35.242750: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Processing b5c4d7a71ce3e5123b629cab31fb2e9e4d07ac9b in smiles.smi (2/504). 
Processing aada5de3df097a15b722abad1edd92608344b2

In [95]:
preds = pd.read_csv('/content/DeePred-BBB_predictions.csv')
preds

Unnamed: 0,Name,Predicted_class
0,b5c4d7a71ce3e5123b629cab31fb2e9e4d07ac9b,0
1,aada5de3df097a15b722abad1edd92608344b2f2,0
2,fcd0946433c09ce6604e8a6a2fe518fe695e2fb2,-9223372036854775808
3,219f31f17bf870cf9c58d401351d4b5533426df6,0
4,efaee03358cadb8d17e5450001851674ed681d9b,0
...,...,...
499,53dd7b50f1c014c6055eef205ba842cea610729c,1
500,56a54c95f9021bb17b804c9142ff32f1ebb7402e,0
501,895ecf3091897f21f001d5dadf3d8db1fa130832,1
502,608b83f03e78031e21878c1c990bbcf7b1d35d90,0


In [98]:
preds = preds[(preds['Predicted_class'] == 1)]

In [100]:
preds['Name']

Unnamed: 0,Name
6,bcf34e2677a2eaca1c50798d452c5403c242a981
9,90db8181067a8f581ef562000ddb150f123100e1
13,eb429f3cee439e995e9daf544da7e6ae4f81e166
14,ab90f1ce78c6a137fe64e1fa5aedcb8988325c8c
17,d0b28b5e7d6da3d72c1311b25fe6f256944fb472
...,...
491,f59e9642598f203a74b96640859a695673b110d8
492,70eb72c9893da434633abcd00208330c89a72371
494,e65ba27d3c8fc00507d0e7dfe7cdbff7aa9fb94f
499,53dd7b50f1c014c6055eef205ba842cea610729c


In [102]:
filtered_df = nlrp[nlrp["hash"].isin(preds['Name'])]

In [103]:
filtered_df

Unnamed: 0,hash,smiles,pValue
6,bcf34e2677a2eaca1c50798d452c5403c242a981,Cc1cccc(C)c1NC(=O)NNC(=O)c1cccc(Cl)c1,5.344775
8,90db8181067a8f581ef562000ddb150f123100e1,O=C(NC(Cc1ccccc1)c1nc2ccccc2s1)C1COc2ccccc2O1,5.952676
13,eb429f3cee439e995e9daf544da7e6ae4f81e166,CN(CCc1ccccc1)C(=O)c1ccc(\C=C\c2ccccc2)o1,4.962270
14,ab90f1ce78c6a137fe64e1fa5aedcb8988325c8c,Cc1ccc(cc1)S(=O)(=O)NC(=O)Nc1c2CCCc2cc2CCCc12,7.574975
17,d0b28b5e7d6da3d72c1311b25fe6f256944fb472,OC(=O)c1ccc(NC(=O)c2ccc3OCOc3c2)cc1,5.386598
...,...,...,...
501,f59e9642598f203a74b96640859a695673b110d8,Cc1cccc(c1)C(=O)Nc1c(Cl)cccc1Cl,5.118660
502,70eb72c9893da434633abcd00208330c89a72371,Clc1cccc(Cl)c1NC(=O)Nc1nc2ccc(Br)cc2s1,6.039330
504,e65ba27d3c8fc00507d0e7dfe7cdbff7aa9fb94f,COc1ccc(cc1)-c1nc(c([nH]1)-c1ccc(C)cc1)-c1ccc(...,5.610624
507,895ecf3091897f21f001d5dadf3d8db1fa130832,O=C(Nc1ccc(Oc2ccc(cc2)N2CCOCC2)cc1)N1CCN(CC1)c...,5.665507


In [109]:
res = checker(filtered_df)

0.7426832387440333
0.51905502298702
0.6520375976313363
0.8684482635419041
0.903913657347212
0.8951224698338736
0.5668318057360998
0.5677492098562402
0.6489939423470926
0.5437092925450275
0.7663176092796506
0.6222274217492244
0.8701137716432222
0.7956937739001606
0.445111875866635
0.4682366818704557
0.8434380855219724
0.7948821610995931
0.8122121308633956
0.7128566200917136
0.8623694086468746
0.7258711149849274
0.42122013853269646
0.707551279197823
0.729812149591905
0.7602076792062705
0.7466917550613765
0.7587196597370931
0.6390272416552967
0.5664097129226553
0.607721897899732
0.5832454059096172
0.5118689400435953
0.8444550148002752
0.29015285452995804
0.7694905576250685
0.830748254157345
0.8369624833510731
0.9096656555388372
0.8565498441534956
0.8234060903854569
0.59382164372544
0.9316725833323815
0.5691731681338436
0.9153810252447103
0.8774371156634749
0.564276547640864
0.5203845748170737
0.9105445598021218
0.7751881970349876
0.6523778104271619
0.6208675384371186
0.8313482355423247
0.

In [112]:
res.shape

(51, 3)

In [116]:
res.to_csv('/content/drive/MyDrive/datacon2025/main/nlrp/approved_molecules.csv')

In [122]:
res['pValue'].mean(), res['pValue'].max(), res['pValue'].min()

(np.float64(5.425367709989562), 5.987431852610538, 4.824079842854688)