In [18]:
import pandas as pd
import numpy as np

In [19]:
# Load the raw semicolon-separated table with the tolerant pure-Python parser.
# Malformed rows are still dropped, but each one is now reported as a warning
# (previously "skip" discarded them silently), so data loss at this step is
# visible in the cell output.
df = pd.read_csv(
    "/content/data.csv",
    sep=";",
    engine="python",
    on_bad_lines="warn",
)

print(df.shape)
df.head(6)

(29550, 48)


Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value
0,CHEMBL1573562,,,420.88,0.0,2.4,SID24405141,CC(C)c1nn2c(=O)cc(COC(=O)CNC(=O)c3ccccc3Cl)nc2s1,Potency,'=',...,CHEMBL1201862,7,PubChem BioAssays,,,,,,,0.631
1,CHEMBL1523269,,,303.79,0.0,2.11,SID24779084,CC(C)(C)c1oc(C(=O)O)cc1CN1CCOCC1.Cl,Potency,'=',...,CHEMBL1201862,7,PubChem BioAssays,,,,,,,89.1251
2,CHEMBL1506534,,,359.41,0.0,1.76,SID7966081,CN(CC(=O)NCc1ccco1)S(=O)(=O)c1cccc2cccnc12,Potency,'=',...,CHEMBL1201862,7,PubChem BioAssays,,,,,,,89.1251
3,CHEMBL1467135,,,228.3,0.0,2.25,SID858663,Cc1cc2c(cc1C)-c1nnnn1C(C)(C)C2,Potency,'=',...,CHEMBL1201862,7,PubChem BioAssays,,,,,,,25.1189
4,CHEMBL1523271,,,273.32,0.0,2.36,SID24782946,COc1ccc2nc3c(O)n4c(nc-3c2c1)SCC4,Potency,'=',...,CHEMBL1201862,7,PubChem BioAssays,,,,,,,0.631
5,CHEMBL1405511,,,333.39,0.0,2.82,SID4248711,COCCNC(=O)c1cc2c(-c3ccccc3F)nn(C)c2s1,Potency,'=',...,CHEMBL1201862,7,PubChem BioAssays,,,,,,,2.8184


Фильтрация по мишени AmpC

In [20]:
# Keep only rows whose target ID is CHEMBL2026 and whose target name mentions
# "Beta-lactamase"; rows with a missing target name are rejected (na=False).
is_target_id = df["Target ChEMBL ID"].eq("CHEMBL2026")
is_beta_lactamase = df["Target Name"].str.contains("Beta-lactamase", na=False)
df = df.loc[is_target_id & is_beta_lactamase].copy()

print("После фильтра по мишени:", df.shape)

После фильтра по мишени: (19316, 48)


Отбор записей с осмысленными типами активности

In [21]:
# Activity types accepted as measurements. "pChEMBL Value" is deliberately not
# listed: it is the name of a column (used in the second condition below), and
# presumably never occurs as a "Standard Type" value -- verify against the data.
valid_types = ["IC50", "Ki", "Kd", "Potency"]

# A row is kept if its type is accepted OR it carries a pChEMBL value.
has_valid_type = df["Standard Type"].isin(valid_types)
has_pchembl = df["pChEMBL Value"].notna()
df = df[has_valid_type | has_pchembl].copy()

print("После фильтра активности:", df.shape)

После фильтра активности: (19250, 48)


Очистка SMILES и парсинг RDKit

In [22]:
!pip install rdkit



In [23]:
from rdkit import Chem
from rdkit.Chem import Descriptors

In [24]:
def smiles_to_mol(smiles):
    """Parse a SMILES string into an RDKit molecule.

    Parameters
    ----------
    smiles : Any
        Candidate SMILES. Anything that is not a non-blank string (for example
        ``None`` or the float NaN found in missing cells) is rejected up front
        without calling RDKit.

    Returns
    -------
    Mol or None
        Whatever ``Chem.MolFromSmiles`` returns for a string input, or
        ``None`` when the input is not a usable string.

    Notes
    -----
    No ``try``/``except`` is used on purpose: a blanket handler would also hide
    a ``NameError`` (RDKit import cell not run) or a ``KeyboardInterrupt`` and
    turn them into "every molecule failed to parse".
    """
    # Guard against non-string / blank cells instead of relying on an exception.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    return Chem.MolFromSmiles(smiles)

# Parse every SMILES into a molecule object stored in the "mol" column.
df["mol"] = df["Smiles"].apply(smiles_to_mol)

# Rows whose SMILES could not be parsed have a missing "mol"; drop them.
parsed_ok = df["mol"].notna()
df = df.loc[parsed_ok].copy()
print("После удаления битых SMILES:", df.shape)

После удаления битых SMILES: (19245, 49)


Удаление нереалистичных соединений (фильтр по молекулярной массе)

In [25]:
# Molecular weight computed by RDKit from the parsed molecule.
df["MW"] = df["mol"].apply(Descriptors.MolWt)

# Strict window: 100 < MW < 800 (both bounds excluded).
mw_above_min = df["MW"].gt(100)
mw_below_max = df["MW"].lt(800)
df = df.loc[mw_above_min & mw_below_max].copy()

print("После фильтра MW:", df.shape)

После фильтра MW: (19226, 50)


Удаление нестабильных и реактивных структур

In [26]:
# SMARTS patterns for groups treated as unstable or reactive.
_UNSTABLE_SMARTS = (
    "[N+](=O)[O-]",        # nitro compounds
    "[SH]",                # free thiols
    "[S-]",                # thiolates
    "C=C=C",               # cumulated double-bond systems
    "[O-][N+](=O)",        # labelled "nitrates"; same three atoms as the nitro pattern above
    "[Cl,Br,I][Cl,Br,I]",  # halogen-halogen bonds
)

# Compiled queries keyed by SMARTS text. Filled lazily on first use so each
# pattern is parsed once instead of once per molecule.
_unstable_query_cache = {}


def unstable_structure(mol):
    """Return True if ``mol`` contains any unstable/reactive substructure.

    Parameters
    ----------
    mol : Mol
        Parsed molecule; must provide ``HasSubstructMatch``.

    Returns
    -------
    bool
        True as soon as one pattern from ``_UNSTABLE_SMARTS`` matches,
        False if none does.

    Raises
    ------
    ValueError
        If a pattern cannot be compiled into a SMARTS query.
    """
    for p in _UNSTABLE_SMARTS:
        query = _unstable_query_cache.get(p)
        if query is None:
            query = Chem.MolFromSmarts(p)
            if query is None:
                # Fail with a readable message instead of passing None on.
                raise ValueError(f"Invalid SMARTS pattern: {p!r}")
            _unstable_query_cache[p] = query
        if mol.HasSubstructMatch(query):
            return True
    return False

# Flag every molecule containing an unstable/reactive group, keep the rest.
is_unstable = df["mol"].apply(unstable_structure)
df = df.loc[~is_unstable].copy()
print("После удаления нестабильных структур:", df.shape)

После удаления нестабильных структур: (18018, 50)


Удаление токсофоров

In [27]:
# SMARTS patterns for groups treated as toxicophores.
# NOTE(review): in Daylight SMARTS an uppercase element symbol denotes an
# aliphatic atom; confirm whether aromatic selenium/arsenic should be excluded
# as well (an atomic-number primitive such as [#34] would cover both).
toxic_smarts = [
    "c1ccc(cc1)[N+](=O)[O-]",   # aromatic nitro groups
    "N=N",                      # azo compounds
    "[Hg,Pb,Cd]",               # heavy metals
    "[Se]",                     # selenium
    "[As]"                      # arsenic
]

# Compiled queries keyed by SMARTS text. Filled lazily on first use so each
# pattern is parsed once instead of once per molecule; keying by text keeps the
# cache valid if ``toxic_smarts`` is edited later.
_toxic_query_cache = {}


def has_toxicophore(mol):
    """Return True if ``mol`` matches any pattern in ``toxic_smarts``.

    Parameters
    ----------
    mol : Mol
        Parsed molecule; must provide ``HasSubstructMatch``.

    Returns
    -------
    bool
        True as soon as one pattern matches, False if none does.

    Raises
    ------
    ValueError
        If a pattern cannot be compiled into a SMARTS query.
    """
    for s in toxic_smarts:
        query = _toxic_query_cache.get(s)
        if query is None:
            query = Chem.MolFromSmarts(s)
            if query is None:
                # Fail with a readable message instead of passing None on.
                raise ValueError(f"Invalid SMARTS pattern: {s!r}")
            _toxic_query_cache[s] = query
        if mol.HasSubstructMatch(query):
            return True
    return False

# Flag every molecule containing a toxicophore, keep the rest.
is_toxic = df["mol"].apply(has_toxicophore)
df = df.loc[~is_toxic].copy()
print("После удаления токсофоров:", df.shape)

После удаления токсофоров: (17969, 50)


Фильтрация по drug-likeness

In [28]:
# RDKit descriptor functions, keyed by the column each result is stored in.
# Insertion order fixes the order in which the columns are added.
descriptor_funcs = {
    "logP": Descriptors.MolLogP,
    "HBD": Descriptors.NumHDonors,
    "HBA": Descriptors.NumHAcceptors,
    "RB": Descriptors.NumRotatableBonds,
}
for column, func in descriptor_funcs.items():
    df[column] = df["mol"].apply(func)

# A row passes only if all four thresholds hold at once.
passes_filter = (
    df["logP"].lt(6)
    & df["HBD"].le(5)
    & df["HBA"].le(10)
    & df["RB"].le(10)
)
df = df.loc[passes_filter].copy()

print("После drug-likeness фильтра:", df.shape)

После drug-likeness фильтра: (17495, 54)


In [30]:
# Identifier and structure, activity fields, then the computed descriptors.
final_cols = (
    ["Molecule ChEMBL ID", "Smiles"]
    + ["Standard Type", "Standard Value", "Standard Units", "pChEMBL Value"]
    + ["MW", "logP", "HBD", "HBA", "RB"]
)

# Column selection yields a new frame; renumber its rows from 0.
df_final = df[final_cols].reset_index(drop=True)

print("Финальный датасет:", df_final.shape)
df_final.head()

Финальный датасет: (17495, 11)


Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Value,Standard Units,pChEMBL Value,MW,logP,HBD,HBA,RB
0,CHEMBL1573562,CC(C)c1nn2c(=O)cc(COC(=O)CNC(=O)c3ccccc3Cl)nc2s1,Potency,631.0,nM,6.2,420.878,2.401,1,8,6
1,CHEMBL1523269,CC(C)(C)c1oc(C(=O)O)cc1CN1CCOCC1.Cl,Potency,89125.1,nM,4.05,303.786,2.5293,1,4,3
2,CHEMBL1506534,CN(CC(=O)NCc1ccco1)S(=O)(=O)c1cccc2cccnc12,Potency,89125.1,nM,4.05,359.407,1.7647,1,5,6
3,CHEMBL1467135,Cc1cc2c(cc1C)-c1nnnn1C(C)(C)C2,Potency,25118.9,nM,4.6,228.299,2.24814,0,4,0
4,CHEMBL1523271,COc1ccc2nc3c(O)n4c(nc-3c2c1)SCC4,Potency,631.0,nM,6.2,273.317,2.356,1,6,1


In [36]:
# Columns written to the participants file: identifier, structure and
# activity fields only (the computed descriptor columns are not included).
cols_participants = [
    "Molecule ChEMBL ID", "Smiles",
    "Standard Type", "Standard Value", "Standard Units",
    "pChEMBL Value",
]

# Select the columns into a new frame and renumber its rows from 0.
df_participants = df.loc[:, cols_participants].reset_index(drop=True)

print("Датасет для участников:", df_participants.shape)

# Written without the index column.
participants_path = "/content/dataset_participants.csv"
df_participants.to_csv(participants_path, index=False)

Датасет для участников: (17495, 6)


In [37]:
# Columns written to the internal file: identifier, structure and activity
# fields plus the computed descriptor columns.
cols_internal = [
    "Molecule ChEMBL ID", "Smiles",
    "Standard Type", "Standard Value", "Standard Units",
    "pChEMBL Value",
    "MW", "logP", "HBD", "HBA", "RB",
]

# Select the columns into a new frame and renumber its rows from 0.
df_internal = df.loc[:, cols_internal].reset_index(drop=True)

print("Внутренний датасет:", df_internal.shape)

# Written without the index column.
internal_path = "/content/ampc_dataset_internal.csv"
df_internal.to_csv(internal_path, index=False)

Внутренний датасет: (17495, 11)
