In [27]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [28]:
# Root folder of the curated data. It can be overridden with the
# CURATED_DATA_DIR environment variable so the notebook also runs on machines
# where the default location does not exist; the default is the original path.
curated_data_directory = os.environ.get('CURATED_DATA_DIR', '/Users/Avi/Dissertation/Data/Curated')

# Input: one fingerprinted CSV per target.
fingerprinted_data_directory = os.path.join(curated_data_directory, 'Fingerprinted')

# Output folders for the two splitting strategies (created if missing).
scaffold_split_directory = os.path.join(curated_data_directory, 'Split', 'Scaffold')
os.makedirs(scaffold_split_directory, exist_ok=True)

random_split_directory = os.path.join(curated_data_directory, 'Split', 'Random')
os.makedirs(random_split_directory, exist_ok=True)

In [29]:
# Read the curated, fingerprinted CSV of every target into its own DataFrame.
# The file names differ only in the ChEMBL target id, so they are built from it.
(
    target_CHEMBL4078,
    target_CHEMBL279,
    target_CHEMBL5763,
    target_CHEMBL240,
    target_CHEMBL4005,
) = (
    pd.read_csv(os.path.join(fingerprinted_data_directory, f'Target_{chembl_id}_Curated_Fingerprinted.csv'))
    for chembl_id in ('CHEMBL4078', 'CHEMBL279', 'CHEMBL5763', 'CHEMBL240', 'CHEMBL4005')
)

### Scaffold Split

In [30]:
def get_scaffold(smiles):
    """Return the Murcko scaffold of a molecule as a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    str
        SMILES of the scaffold returned by ``MurckoScaffold.GetScaffoldForMol``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None instead of
        # raising; fail here with a message that names the offending input.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    return Chem.MolToSmiles(scaffold)

def allocate_indices(indices, set_size):
    """Split ``indices`` at position ``set_size``.

    Returns a pair ``(head, tail)``: ``head`` holds the first ``set_size``
    items and ``tail`` holds everything after them. Both are slices of
    ``indices``, so a ``set_size`` larger than the input simply yields an
    empty ``tail``.
    """
    head, tail = indices[:set_size], indices[set_size:]
    return head, tail

In [31]:
# ChEMBL target ids whose datasets are split below.
targets = [
    'CHEMBL4078',
    'CHEMBL279',
    'CHEMBL5763',
    'CHEMBL240',
    'CHEMBL4005',
]

In [32]:
# Seeded generator so the scaffold split is reproducible after a kernel
# restart (the random split below is seeded as well, via random_state=42).
scaffold_rng = np.random.default_rng(42)

for target in targets:
    # Load the fingerprinted data
    df = pd.read_csv(os.path.join(fingerprinted_data_directory, f'Target_{target}_Curated_Fingerprinted.csv'))

    # Group row positions by scaffold. Positions (not index labels) are stored
    # because the frame is sliced with .iloc further down.
    scaffold_to_molecules = defaultdict(list)
    for position, smiles in enumerate(df['canonical_smiles']):
        scaffold_to_molecules[get_scaffold(smiles)].append(position)

    # Put the scaffold groups in a random (but seeded) order.
    scaffold_bins = list(scaffold_to_molecules.items())
    shuffled_order = scaffold_rng.permutation(len(scaffold_bins))
    scaffold_bins = [scaffold_bins[i] for i in shuffled_order]

    total_size = len(df)
    test_size = int(0.2 * total_size)  # upper bound for the test set (20 %)

    train_indices = []
    test_indices = []

    for scaffold, indices in scaffold_bins:
        if len(indices) > test_size // 2:
            # Very large scaffold groups always go to training so that a
            # single scaffold cannot dominate the test set.
            train_indices.extend(indices)
        elif len(test_indices) + len(indices) <= test_size:
            # The whole group fits into the test set. Groups are never cut in
            # two, so no scaffold ends up in both train and test.
            test_indices.extend(indices)
        else:
            train_indices.extend(indices)

    # Every molecule must be assigned to exactly one of the two sets.
    assert len(train_indices) + len(test_indices) == total_size

    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]

    # Save the split data
    train_df.to_csv(os.path.join(scaffold_split_directory, f'{target}_Train_Scaffold.csv'), index=False)
    test_df.to_csv(os.path.join(scaffold_split_directory, f'{target}_Test_Scaffold.csv'), index=False)

    print(f"Scaffold split for {target} completed. Training set size: {len(train_df)}, Test set size: {len(test_df)}")

Scaffold split for CHEMBL4078 completed. Training set size: 3004, Test set size: 751
Scaffold split for CHEMBL279 completed. Training set size: 2676, Test set size: 668
Scaffold split for CHEMBL5763 completed. Training set size: 2160, Test set size: 540
Scaffold split for CHEMBL240 completed. Training set size: 2254, Test set size: 563
Scaffold split for CHEMBL4005 completed. Training set size: 2252, Test set size: 562


### Random Split

In [34]:
def random_split(target, test_size=0.2, random_state=42):
    """Randomly split one target's fingerprinted dataset and save both parts.

    Reads ``Target_{target}_Curated_Fingerprinted.csv`` from
    ``fingerprinted_data_directory``, splits it with ``train_test_split`` and
    writes ``{target}_Train_Random.csv`` and ``{target}_Test_Random.csv`` to
    ``random_split_directory``. A one-line summary with both set sizes is
    printed.

    Parameters
    ----------
    target : str
        Target id used in the input and output file names.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to 0.2.
    random_state : int, optional
        Seed forwarded to ``train_test_split``; defaults to 42.
    """
    df = pd.read_csv(os.path.join(fingerprinted_data_directory, f'Target_{target}_Curated_Fingerprinted.csv'))

    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    train_df.to_csv(os.path.join(random_split_directory, f'{target}_Train_Random.csv'), index=False)
    test_df.to_csv(os.path.join(random_split_directory, f'{target}_Test_Random.csv'), index=False)

    print(f"Random split for {target} completed. Training set size: {len(train_df)}, Test set size: {len(test_df)}")

# Write the random train/test split files for every target in `targets`.
for target in targets:
    random_split(target)

Random split for CHEMBL4078 completed. Training set size: 3004, Test set size: 751
Random split for CHEMBL279 completed. Training set size: 2675, Test set size: 669
Random split for CHEMBL5763 completed. Training set size: 2160, Test set size: 540
Random split for CHEMBL240 completed. Training set size: 2253, Test set size: 564
Random split for CHEMBL4005 completed. Training set size: 2251, Test set size: 563
