In [1]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Directory Setup

In [2]:
# Root of the dataset tree. Defaults to the original local path; set the
# DISSERTATION_DATA_DIR environment variable to run this notebook on another
# machine without editing the cell.
base_dir = os.environ.get('DISSERTATION_DATA_DIR', '/Users/Avi/Dissertation/Data')

# Input directory: per-target fingerprinted CSVs read by process_target.
fingerprinted_data_directory = os.path.join(base_dir, 'Curated/Fingerprinted')
# Output directories: one per split written by process_target.
scaffold_test_data_directory = os.path.join(base_dir, 'Curated/Test_Data/Scaffold_Split')
random_test_data_directory = os.path.join(base_dir, 'Curated/Test_Data/Random_Split')
train_data_directory = os.path.join(base_dir, 'Curated/Train_Data')

# Only the output directories are created here; the input directory is not.
# exist_ok=True makes re-running this cell harmless.
for directory in [scaffold_test_data_directory, random_test_data_directory, train_data_directory]:
    os.makedirs(directory, exist_ok=True)

# Assigning Targets

In [3]:
# Target identifiers to process, in the order the main loop iterates over them.
targets = [
    'CHEMBL4078',
    'CHEMBL279',
    'CHEMBL5763',
    'CHEMBL240',
    'CHEMBL4005',
]

# Defining Functions

In [4]:
def get_scaffold(smiles):
    """Return the SMILES string of the Murcko scaffold of a molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    str
        SMILES of the scaffold produced by ``MurckoScaffold.GetScaffoldForMol``.

    Raises
    ------
    TypeError
        If ``smiles`` is not a string (e.g. a NaN/None from a missing CSV cell).
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # A missing value read by pandas arrives as float NaN; fail with a clear
    # message instead of an opaque RDKit argument error.
    if not isinstance(smiles, str):
        raise TypeError(
            f'SMILES must be a string, got {type(smiles).__name__}: {smiles!r}'
        )

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than
    # raising, so it has to be checked before the scaffold call.
    if mol is None:
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    return Chem.MolToSmiles(scaffold)

def allocate_indices(indices, set_size):
    """Cut ``indices`` at position ``set_size``.

    Returns a ``(head, tail)`` pair where ``head`` is ``indices[:set_size]``
    and ``tail`` is ``indices[set_size:]``.
    """
    head = indices[:set_size]
    tail = indices[set_size:]
    return head, tail

def process_target(target, random_state=42, scaffold_test_fraction=0.2, random_test_fraction=0.25):
    """Split one target's fingerprinted dataset into three CSV files.

    The function reads ``Target_{target}_Curated_Fingerprinted.csv`` from
    ``fingerprinted_data_directory`` and then:

    1. Groups rows by the scaffold of their ``canonical_smiles`` value.
    2. Shuffles the scaffold groups and greedily assigns whole groups to the
       scaffold test set while they fit within the size budget. A group that
       does not fit in the remaining budget goes to the training pool, so no
       scaffold is ever shared between the scaffold test set and the pool.
    3. Randomly splits the remaining pool into the final training set and a
       random test set.

    Files written:
    ``{target}_Test_Scaffold.csv``, ``{target}_Test_Random.csv`` and
    ``{target}_Train.csv`` in their respective output directories.

    Parameters
    ----------
    target : str
        Target identifier used to build the input and output file names.
    random_state : int, default 42
        Seed for both the scaffold-group shuffle and the random split.
    scaffold_test_fraction : float, default 0.2
        Upper bound on the scaffold test set size, as a fraction of all rows.
    random_test_fraction : float, default 0.25
        Passed as ``test_size`` to ``train_test_split``; it is relative to the
        pool left after the scaffold split, not to the full dataset.

    Returns
    -------
    tuple of int
        ``(train size, scaffold test size, random test size)``.

    Raises
    ------
    ValueError
        If ``scaffold_test_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= scaffold_test_fraction <= 1:
        raise ValueError(
            f'scaffold_test_fraction must be between 0 and 1, got {scaffold_test_fraction!r}'
        )

    df = pd.read_csv(os.path.join(fingerprinted_data_directory, f'Target_{target}_Curated_Fingerprinted.csv'))

    # Map each scaffold SMILES to the positional row indices that share it.
    scaffold_to_molecules = defaultdict(list)
    for idx, smiles in enumerate(df['canonical_smiles']):
        scaffold = get_scaffold(smiles)
        scaffold_to_molecules[scaffold].append(idx)

    # Shuffle the groups (not individual molecules) with a seeded generator so
    # the assignment is reproducible.
    scaffold_bins = list(scaffold_to_molecules.items())
    np.random.RandomState(random_state).shuffle(scaffold_bins)

    total_size = len(df)
    test_size = int(scaffold_test_fraction * total_size)

    test_indices = []
    train_indices = []

    # Greedy fill: a group joins the test set only if it fits entirely within
    # the remaining budget; otherwise the whole group goes to the pool.
    for _, indices in scaffold_bins:
        if len(test_indices) + len(indices) <= test_size:
            test_indices.extend(indices)
        else:
            train_indices.extend(indices)

    # Indices are positions from enumerate(), hence iloc rather than loc.
    train_df = df.iloc[train_indices]
    scaffold_test_df = df.iloc[test_indices]

    scaffold_test_df.to_csv(os.path.join(scaffold_test_data_directory, f'{target}_Test_Scaffold.csv'), index=False)

    # Second, purely random split of what remains after the scaffold hold-out.
    random_train_df, random_test_df = train_test_split(
        train_df, test_size=random_test_fraction, random_state=random_state
    )

    random_test_df.to_csv(os.path.join(random_test_data_directory, f'{target}_Test_Random.csv'), index=False)
    random_train_df.to_csv(os.path.join(train_data_directory, f'{target}_Train.csv'), index=False)

    return len(random_train_df), len(scaffold_test_df), len(random_test_df)

# Main Execution

In [5]:
# Run the split for every configured target and report the resulting set sizes.
for target in targets:
    split_sizes = process_target(target)
    train_size, scaffold_test_size, random_test_size = split_sizes
    summary = (
        f"Data split for {target} completed. Training set size: {train_size}, "
        f"Scaffold-Split Test set size: {scaffold_test_size}, "
        f"Random-Split Test set size: {random_test_size}"
    )
    print(summary)

Data split for CHEMBL4078 completed. Training set size: 2230, Scaffold-Split Test set size: 743, Random-Split Test set size: 744
Data split for CHEMBL279 completed. Training set size: 1961, Scaffold-Split Test set size: 653, Random-Split Test set size: 654
Data split for CHEMBL5763 completed. Training set size: 1605, Scaffold-Split Test set size: 534, Random-Split Test set size: 535
Data split for CHEMBL240 completed. Training set size: 1629, Scaffold-Split Test set size: 543, Random-Split Test set size: 543
Data split for CHEMBL4005 completed. Training set size: 1641, Scaffold-Split Test set size: 546, Random-Split Test set size: 547
