# Download data

In [1]:
! gdown --id 18WwPBK-TOpeYAUpUG3Ov4lfbo0nnhK1o

Downloading...
From: https://drive.google.com/uc?id=18WwPBK-TOpeYAUpUG3Ov4lfbo0nnhK1o
To: /content/Acute Toxicity_mouse_intraperitoneal_LD50.csv
100% 6.86M/6.86M [00:00<00:00, 40.3MB/s]


In [4]:
import pandas as pd

In [5]:
# Read the downloaded acute-toxicity table into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/content/Acute Toxicity_mouse_intraperitoneal_LD50.csv",
)

# Install the RDKit and Mordred libraries

In [4]:
!pip install rdkit
!pip install mordred

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1
Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.* (from mordred)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloadin

# Import libraries

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors
import mordred

# Data verification

In [6]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical SMILES,InChIKey,mouse_intraperitoneal_LD50
0,TOX-145,785,"benzene-1,4-diol",Oc1ccc(O)cc1,Oc1ccc(O)cc1,QIGBRXMKCJKVMJ-UHFFFAOYSA-N,3.041835
1,TOX-245,5453,tris(aziridin-1-yl)-sulfanylidene-lambda5-phos...,S=P(N1CC1)(N1CC1)N1CC1,S=P(N1CC1)(N1CC1)N1CC1,FOCVUCIESVLUNU-UHFFFAOYSA-N,4.235584
2,TOX-1273,727,"1,2,3,4,5,6-hexachlorocyclohexane",ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,JLYXXMFPNIAWKQ-UHFFFAOYSA-N,3.366732
3,TOX-1279,4091,"3-(diaminomethylidene)-1,1-dimethylguanidine",CN(C)C(=N)N=C(N)N,CN(C)C(=N)N=C(N)N,XZWYZXLIPXDOLR-UHFFFAOYSA-N,2.641604
4,TOX-1282,10364,2-methyl-5-propan-2-ylphenol,Cc1ccc(C(C)C)cc1O,Cc1ccc(C(C)C)cc1O,RECUKUPTGUEGMW-UHFFFAOYSA-N,3.311627


In [7]:
# Number of missing values in each column.
data.isnull().sum()

Unnamed: 0,0
TAID,0
Pubchem CID,0
IUPAC Name,1919
SMILES,0
Canonical SMILES,0
InChIKey,38
mouse_intraperitoneal_LD50,0


In [8]:
# (number of rows, number of columns) of the loaded table.
data.shape

(35299, 7)

In [9]:
# Count distinct SMILES strings (missing values are not counted) to check for duplicates.
data['SMILES'].nunique(dropna=True)

35299

In [10]:
# Same duplicate check on the canonical SMILES column.
data['Canonical SMILES'].nunique(dropna=True)

35299

# Converting molecules from SMILES strings to RDKit objects

In [11]:
# Parse every SMILES string into an RDKit Mol object (one Mol per row, same index as `data`).
mol_dataset = data['SMILES'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None instead of raising when a SMILES string cannot be parsed.
# Fail fast here rather than discovering bad molecules during the long descriptor calculation.
n_unparsed = int(mol_dataset.isna().sum())
if n_unparsed:
    raise ValueError(f"{n_unparsed} SMILES strings could not be parsed by RDKit")

# Single-column DataFrame view of the parsed molecules.
mol = pd.DataFrame(mol_dataset)

# Calculating descriptors

In [12]:
# Build a mordred calculator from the `mordred.descriptors` collection with ignore_3D=True
# (presumably this skips descriptors that need 3D coordinates; confirm in the mordred docs).
calc = Calculator(descriptors, ignore_3D=True)
# Compute the descriptors for every molecule and collect them in a DataFrame.
# NOTE(review): this rebinds `data`, replacing the table loaded from the CSV with the
# descriptor table; the original columns (e.g. the LD50 target) are no longer reachable
# through `data` after this cell runs.
# The recorded run of this cell took about 1 h 38 min for 35,299 molecules.
data = calc.pandas(mol_dataset)

  2%|▏         | 694/35299 [02:36<2:16:10,  4.24it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▏         | 732/35299 [02:47<3:29:34,  2.75it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 35299/35299 [1:38:22<00:00,  5.98it/s]


In [13]:
# Drop the descriptor columns that mordred could not compute for every molecule.
# Such columns hold mordred "Missing" error objects and therefore have object dtype;
# excluding the object dtype removes them and keeps the numeric/boolean descriptors.
data = data.select_dtypes(exclude=["object"])
data.head(n=5)

Unnamed: 0,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,0,0,9.924777,2.170086,4.340173,9.924777,1.240597,2.97973,2.74204,0.342755,...,0.0,8.463159,35.730685,110.036779,7.85977,62,7,36.0,38.0,1.833333
1,0,0,15.203358,2.601679,4.863481,15.203358,1.382123,3.544956,3.159945,0.287268,...,8.503703,9.891769,66.234321,189.048955,8.21952,139,18,68.0,88.0,2.25
2,0,0,14.601126,2.414214,4.828427,14.601126,1.216761,3.391683,3.200413,0.266701,...,0.0,9.542876,43.309911,287.860066,15.992226,174,21,60.0,72.0,2.666667
3,0,5,9.335326,2.116883,4.233766,9.335326,1.037258,3.028326,2.774515,0.308279,...,0.0,8.267962,36.359125,129.101445,6.455072,96,8,36.0,36.0,2.111111
4,0,0,13.152542,2.292456,4.584911,13.152542,1.195686,3.294652,3.103869,0.28217,...,0.0,9.071423,40.991311,150.104465,6.004179,152,14,52.0,58.0,2.472222


In [14]:
# Save the filtered descriptor table to disk, keeping the row index as the first column.
data.to_csv('/content/mordred_descriptors.csv', index=True)