# **Dataset Creation**
### Notebook for the testing and development of a methodology for creating a fully labelled dataset of molecules and their functional groups.

In [None]:
import rdkit.Chem as Chem
from joblib import Parallel, delayed
import pandas as pd
import pickle
import os
import math

from fg_funcs import save_chunk_results, safe_mol_from_smiles, compute_scaffold_safe, compute_efgs_safe

## Dataset Creation
#### Starting with the small Pec50 data

## Chembl Dataset

In [None]:
# Load the chembl data
chembl_data = pd.read_csv('data/chembl_35_cleaned.csv',header=None, names=['smiles'])

In [None]:
# 1. Convert SMILES to RDKit Mol
chembl_data['rdkit_mol'] = chembl_data['smiles'].apply(safe_mol_from_smiles)
print('Converted SMILES to RDKit Mol objects for ChEMBL dataset.')


# 2. Generate InChIKeys
chembl_data['inchikey'] = chembl_data['rdkit_mol'].apply(
    lambda mol: Chem.MolToInchiKey(mol) if mol is not None else None
)
print('Generated InChIKeys for ChEMBL dataset.')

[10:55:13] Explicit valence for atom # 7 P, 7, is greater than permitted
[10:57:07] Explicit valence for atom # 10 Si, 6, is greater than permitted
[10:57:36] Explicit valence for atom # 1 P, 7, is greater than permitted
[10:58:45] Explicit valence for atom # 1 As, 7, is greater than permitted


Converted SMILES to RDKit Mol objects for ChEMBL dataset.
Generated InChIKeys for ChEMBL dataset.


In [5]:
# Drop rows with None values in 'rdkit_mol' or 'inchikey'
chembl_data = chembl_data.dropna(subset=['rdkit_mol', 'inchikey'])

In [7]:
# Configuration
chunk_size = 100000
output_dir = "chembl_chunks"
os.makedirs(output_dir, exist_ok=True)

In [None]:
total_mols = len(chembl_data)
num_chunks = math.ceil(total_mols / chunk_size)

In [10]:
# Process in chunks
for chunk_idx in range(num_chunks):
    start = chunk_idx * chunk_size
    end = min((chunk_idx + 1) * chunk_size, total_mols)
    
    print(f"Processing chunk {chunk_idx + 1}/{num_chunks} ({start}:{end})...")
    mols_chunk = chembl_data['rdkit_mol'].iloc[start:end]

    # Only compute if the file doesn't already exist (for resumability)
    chunk_file = os.path.join(output_dir, f"chunk_{chunk_idx}.pkl")
    if not os.path.exists(chunk_file):
        chunk_results = Parallel(n_jobs=-1)(
            delayed(compute_efgs_safe)(mol) for mol in mols_chunk
        )
        save_chunk_results(output_dir, chunk_idx, chunk_results)
    else:
        print(f"Chunk {chunk_idx + 1} already computed, skipping.")

# Recombine results
all_results = []
for chunk_idx in range(num_chunks):
    with open(os.path.join(output_dir, f"chunk_{chunk_idx}.pkl"), "rb") as f:
        chunk_results = pickle.load(f)
        all_results.extend(chunk_results)

# Store or attach results
chembl_data['psmis'] = all_results
print('All PSMIs added to ChEMBL dataset.')

Processing chunk 1/24 (0:100000)...
Chunk 1 already computed, skipping.
Processing chunk 2/24 (100000:200000)...
Chunk 2 already computed, skipping.
Processing chunk 3/24 (200000:300000)...
Chunk 3 already computed, skipping.
Processing chunk 4/24 (300000:400000)...
Chunk 4 already computed, skipping.
Processing chunk 5/24 (400000:500000)...
Chunk 5 already computed, skipping.
Processing chunk 6/24 (500000:600000)...
Chunk 6 already computed, skipping.
Processing chunk 7/24 (600000:700000)...
Chunk 7 already computed, skipping.
Processing chunk 8/24 (700000:800000)...
Chunk 8 already computed, skipping.
Processing chunk 9/24 (800000:900000)...
Chunk 9 already computed, skipping.
Processing chunk 10/24 (900000:1000000)...
Chunk 10 already computed, skipping.
Processing chunk 11/24 (1000000:1100000)...
Chunk 11 already computed, skipping.
Processing chunk 12/24 (1100000:1200000)...
Chunk 12 already computed, skipping.
Processing chunk 13/24 (1200000:1300000)...
Chunk 13 already computed,

In [11]:
print(all_results[0])  # Print the first result to verify

['O=C1C(=N[N]([R])[R])C=N[N]1[R]', '[OH][Car]', '[Nar]', '[Nar]', '[Nar]', '[Nar]']


In [12]:
# Config
chunk_size = 100000
scaffold_output_dir = "chembl_scaffold_chunks"
os.makedirs(scaffold_output_dir, exist_ok=True)

total_mols = len(chembl_data)
num_chunks = math.ceil(total_mols / chunk_size)

In [13]:
# Chunked scaffold computation
for chunk_idx in range(num_chunks):
    start = chunk_idx * chunk_size
    end = min((chunk_idx + 1) * chunk_size, total_mols)

    print(f"Processing scaffold chunk {chunk_idx + 1}/{num_chunks} ({start}:{end})...")
    mols_chunk = chembl_data['rdkit_mol'].iloc[start:end]

    chunk_file = os.path.join(scaffold_output_dir, f"chunk_{chunk_idx}.pkl")
    if not os.path.exists(chunk_file):
        # Compute scaffold mols
        scaffold_mols = Parallel(n_jobs=-1)(
            delayed(compute_scaffold_safe)(mol) for mol in mols_chunk
        )
        # Convert to SMILES
        scaffold_smiles = [
            Chem.MolToSmiles(mol) if mol else None for mol in scaffold_mols
        ]
        save_chunk_results(scaffold_output_dir, chunk_idx, scaffold_smiles)
    else:
        print(f"Scaffold chunk {chunk_idx + 1} already computed, skipping.")

# Recombine scaffold SMILES
all_scaffold_smiles = []
for chunk_idx in range(num_chunks):
    with open(os.path.join(scaffold_output_dir, f"chunk_{chunk_idx}.pkl"), "rb") as f:
        scaffold_smiles = pickle.load(f)
        all_scaffold_smiles.extend(scaffold_smiles)

# Store in DataFrame
chembl_data['scaffolds'] = all_scaffold_smiles
print('All scaffolds (as SMILES) added to ChEMBL dataset.')


Processing scaffold chunk 1/24 (0:100000)...
Scaffold chunk 1 already computed, skipping.
Processing scaffold chunk 2/24 (100000:200000)...
Scaffold chunk 2 already computed, skipping.
Processing scaffold chunk 3/24 (200000:300000)...
Scaffold chunk 3 already computed, skipping.
Processing scaffold chunk 4/24 (300000:400000)...
Scaffold chunk 4 already computed, skipping.
Processing scaffold chunk 5/24 (400000:500000)...
Scaffold chunk 5 already computed, skipping.
Processing scaffold chunk 6/24 (500000:600000)...
Scaffold chunk 6 already computed, skipping.
Processing scaffold chunk 7/24 (600000:700000)...
Scaffold chunk 7 already computed, skipping.
Processing scaffold chunk 8/24 (700000:800000)...
Scaffold chunk 8 already computed, skipping.
Processing scaffold chunk 9/24 (800000:900000)...
Scaffold chunk 9 already computed, skipping.
Processing scaffold chunk 10/24 (900000:1000000)...
Scaffold chunk 10 already computed, skipping.
Processing scaffold chunk 11/24 (1000000:1100000)...

In [None]:
chembl_data_final = chembl_data[['smiles', 'inchikey', 'psmis', 'scaffolds']]
chembl_data_final.rename(columns={'psmis': 'fgs'}, inplace=True)
chembl_data_final.to_csv('data/chembl_35_fg_scaf.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_data_final.rename(columns={'psmis': 'fgs'}, inplace=True)
