# **Dataset Creation**
### Notebook for the testing and development of a methodology for creating a fully labelled dataset of molecules and their functional groups.

In [None]:
import rdkit.Chem as Chem
from joblib import Parallel, delayed
import pandas as pd
import pickle
import os
import math

from fg_funcs import save_chunk_results, safe_mol_from_smiles, compute_scaffold_safe, compute_efgs_safe

## Dataset Creation
#### Starting with the small pEC50 data

## ChEMBL Dataset

In [None]:
# Read the cleaned ChEMBL 35 SMILES file. It is parsed as headerless
# (header=None), so the column name 'smiles' is supplied explicitly.
chembl_data = pd.read_csv(
    'data/chembl_35_cleaned.csv',
    names=['smiles'],
    header=None,
)

In [None]:
def _inchikey_or_none(mol):
    """Return the InChIKey of *mol*, passing None through unchanged."""
    if mol is None:
        return None
    return Chem.MolToInchiKey(mol)


# Step 1: parse every SMILES string with the project helper.
# NOTE(review): safe_mol_from_smiles presumably returns None for SMILES it
# cannot parse (the None guard on the InChIKey step suggests so) -- confirm
# in fg_funcs.
smiles_column = chembl_data['smiles']
chembl_data['rdkit_mol'] = smiles_column.apply(safe_mol_from_smiles)
print('Converted SMILES to RDKit Mol objects for ChEMBL dataset.')

# Step 2: derive an InChIKey for every molecule; rows whose molecule is None
# get None instead of raising.
chembl_data['inchikey'] = chembl_data['rdkit_mol'].apply(_inchikey_or_none)
print('Generated InChIKeys for ChEMBL dataset.')

In [None]:
# Keep only rows that have both a molecule object and an InChIKey; any row
# missing either one is discarded. The original index labels are kept.
required_columns = ['rdkit_mol', 'inchikey']
chembl_data = chembl_data.dropna(subset=required_columns)

In [None]:
# Settings for the chunked functional-group computation.
chunk_size = 100_000          # molecules handled per chunk / per pickle file
output_dir = "chembl_chunks"  # directory that receives the per-chunk files

# Create the output directory up front; an existing directory is not an error.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Row count of the filtered DataFrame, and how many chunks of `chunk_size`
# are needed to cover it (the last chunk may be shorter than the rest).
total_mols = chembl_data.shape[0]
num_chunks = math.ceil(total_mols / chunk_size)

In [None]:
# Run compute_efgs_safe over every molecule, one chunk at a time.
# Each chunk's results are written to their own pickle file, so an
# interrupted run can be restarted and will only compute the missing chunks.
# NOTE(review): save_chunk_results is assumed to write
# <output_dir>/chunk_<chunk_idx>.pkl, which is the path read back below --
# confirm in fg_funcs.
for chunk_idx in range(num_chunks):
    start = chunk_idx * chunk_size
    end = min(start + chunk_size, total_mols)

    print(f"Processing chunk {chunk_idx + 1}/{num_chunks} ({start}:{end})...")

    # Only compute if the file doesn't already exist (for resumability)
    chunk_file = os.path.join(output_dir, f"chunk_{chunk_idx}.pkl")
    if os.path.exists(chunk_file):
        print(f"Chunk {chunk_idx + 1} already computed, skipping.")
        continue

    # Slice only when work is actually needed; cached chunks never touch
    # the molecule column.
    mols_chunk = chembl_data['rdkit_mol'].iloc[start:end]
    chunk_results = Parallel(n_jobs=-1)(
        delayed(compute_efgs_safe)(mol) for mol in mols_chunk
    )
    save_chunk_results(output_dir, chunk_idx, chunk_results)

# Recombine results in chunk order so that position i of `all_results`
# corresponds to row i of `chembl_data`.
all_results = []
for chunk_idx in range(num_chunks):
    start = chunk_idx * chunk_size
    end = min(start + chunk_size, total_mols)
    chunk_file = os.path.join(output_dir, f"chunk_{chunk_idx}.pkl")

    with open(chunk_file, "rb") as f:
        chunk_results = pickle.load(f)

    # Chunk files are keyed by index only. A file left over from a run with
    # different input data or a different chunk_size would otherwise be
    # reused silently and shift every later result onto the wrong molecule.
    if len(chunk_results) != end - start:
        raise ValueError(
            f"{chunk_file} holds {len(chunk_results)} results but rows "
            f"{start}:{end} need {end - start}; the file is probably stale. "
            "Delete it and re-run this cell."
        )
    all_results.extend(chunk_results)

# Store or attach results (assigned positionally, one entry per row)
chembl_data['psmis'] = all_results
print('All PSMIs added to ChEMBL dataset.')

In [None]:
# Sanity check: show the result computed for the first molecule.
first_result = all_results[0]
print(first_result)

In [None]:
# Settings for the chunked scaffold computation. Scaffold chunks are written
# to their own directory, separate from "chembl_chunks".
chunk_size = 100_000
scaffold_output_dir = "chembl_scaffold_chunks"

# Create the directory if needed; an existing directory is not an error.
os.makedirs(scaffold_output_dir, exist_ok=True)

# Chunk bookkeeping, recomputed from the current DataFrame.
total_mols = chembl_data.shape[0]
num_chunks = math.ceil(total_mols / chunk_size)

In [None]:
# Run compute_scaffold_safe over every molecule, one chunk at a time, and
# persist each chunk's scaffold SMILES to its own pickle file so that an
# interrupted run can be resumed without redoing finished chunks.
# NOTE(review): save_chunk_results is assumed to write
# <scaffold_output_dir>/chunk_<chunk_idx>.pkl, which is the path read back
# below -- confirm in fg_funcs.
for chunk_idx in range(num_chunks):
    start = chunk_idx * chunk_size
    end = min(start + chunk_size, total_mols)

    print(f"Processing scaffold chunk {chunk_idx + 1}/{num_chunks} ({start}:{end})...")

    chunk_file = os.path.join(scaffold_output_dir, f"chunk_{chunk_idx}.pkl")
    if os.path.exists(chunk_file):
        print(f"Scaffold chunk {chunk_idx + 1} already computed, skipping.")
        continue

    # Slice only when work is actually needed; cached chunks never touch
    # the molecule column.
    mols_chunk = chembl_data['rdkit_mol'].iloc[start:end]

    # Compute scaffold mols
    scaffold_mols = Parallel(n_jobs=-1)(
        delayed(compute_scaffold_safe)(mol) for mol in mols_chunk
    )
    # Convert to SMILES; a falsy scaffold (e.g. None) is stored as None so
    # the list keeps one entry per input molecule.
    scaffold_smiles = [
        Chem.MolToSmiles(mol) if mol else None for mol in scaffold_mols
    ]
    save_chunk_results(scaffold_output_dir, chunk_idx, scaffold_smiles)

# Recombine scaffold SMILES in chunk order so that position i of
# `all_scaffold_smiles` corresponds to row i of `chembl_data`.
all_scaffold_smiles = []
for chunk_idx in range(num_chunks):
    start = chunk_idx * chunk_size
    end = min(start + chunk_size, total_mols)
    chunk_file = os.path.join(scaffold_output_dir, f"chunk_{chunk_idx}.pkl")

    with open(chunk_file, "rb") as f:
        scaffold_smiles = pickle.load(f)

    # Chunk files are keyed by index only. A file left over from a run with
    # different input data or a different chunk_size would otherwise be
    # reused silently and shift every later scaffold onto the wrong molecule.
    if len(scaffold_smiles) != end - start:
        raise ValueError(
            f"{chunk_file} holds {len(scaffold_smiles)} scaffolds but rows "
            f"{start}:{end} need {end - start}; the file is probably stale. "
            "Delete it and re-run this cell."
        )
    all_scaffold_smiles.extend(scaffold_smiles)

# Store in DataFrame (assigned positionally, one entry per row)
chembl_data['scaffolds'] = all_scaffold_smiles
print('All scaffolds (as SMILES) added to ChEMBL dataset.')


In [None]:
# Build the export table: keep only the serialisable columns (the RDKit Mol
# objects are left out) and expose the 'psmis' column under the name 'fgs'.
# rename() is chained onto the selection and returns a new DataFrame; an
# in-place rename on a frame obtained by column selection triggers pandas'
# SettingWithCopyWarning.
export_columns = ['smiles', 'inchikey', 'psmis', 'scaffolds']
chembl_data_final = chembl_data[export_columns].rename(columns={'psmis': 'fgs'})

# Write without the index column so the CSV holds only the four data columns.
chembl_data_final.to_csv('data/chembl_35_fg_scaf.csv', index=False)