In [1]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Directory Setup

In [2]:
# Root of the data tree. The default is the location the notebook was written
# against; set the DISSERTATION_DATA_DIR environment variable to point the
# notebook at a different copy of the data without editing this cell.
base_dir = os.environ.get('DISSERTATION_DATA_DIR', '/Users/Avi/Dissertation/Data')

# Inputs read by process_target(): the fingerprinted non-curated data and the
# two test sets (scaffold split and random split).
fingerprinted_data_directory = os.path.join(base_dir, 'Non_Curated/Fingerprinted')
scaffold_test_data_directory = os.path.join(base_dir, 'Curated/Test_Data/Scaffold_Split')
random_test_data_directory = os.path.join(base_dir, 'Curated/Test_Data/Random_Split')

# Output location for the per-target training CSVs; created up front so the
# later to_csv() calls do not fail on a missing directory.
train_data_directory = os.path.join(base_dir, 'Non_Curated/Train_Data')
os.makedirs(train_data_directory, exist_ok=True)

# Assigning Targets

In [3]:
# Target identifiers to process. Each one is substituted into the input and
# output CSV file names, so every entry needs matching files on disk.
targets = [
    'CHEMBL4078',
    'CHEMBL279',
    'CHEMBL5763',
    'CHEMBL240',
    'CHEMBL4005',
]

# Defining Functions

In [4]:
def get_scaffold(smiles):
    """Return the Murcko scaffold of a molecule as a SMILES string.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The SMILES string of the scaffold computed by
        ``MurckoScaffold.GetScaffoldForMol``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None instead of
    # raising. Passing that None on would fail inside RDKit with an opaque
    # argument-type error, so fail here with the offending string instead.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    return Chem.MolToSmiles(scaffold)

def process_target(target):
    """Build and save the non-curated training set for one target.

    Reads the target's fingerprinted non-curated CSV plus its scaffold-split
    and random-split test CSVs, then removes from the non-curated data:

    1. every row whose ``canonical_smiles`` appears in either test set, and
    2. every remaining row whose scaffold (via ``get_scaffold``) matches the
       scaffold of any molecule in the scaffold-split test set.

    The surviving rows are written to ``<train_data_directory>/<target>_Train.csv``
    without the index and without the temporary ``scaffold`` column.

    Args:
        target: Target identifier used to build the input and output file names.

    Returns:
        Number of rows written to the training CSV.
    """
    def _load(directory, filename):
        # All three inputs are plain CSVs read with pandas defaults.
        return pd.read_csv(os.path.join(directory, filename))

    candidates = _load(fingerprinted_data_directory, f'Target_{target}_Non_Curated_Fingerprinted.csv')
    held_out_scaffold = _load(scaffold_test_data_directory, f'{target}_Test_Scaffold.csv')
    held_out_random = _load(random_test_data_directory, f'{target}_Test_Random.csv')

    # Union of both test sets, one row per distinct SMILES string.
    held_out = pd.concat([held_out_scaffold, held_out_random])
    held_out = held_out.drop_duplicates(subset='canonical_smiles')

    # Step 1: drop molecules that appear verbatim in either test set.
    in_test_set = candidates['canonical_smiles'].isin(held_out['canonical_smiles'])
    kept = candidates.loc[~in_test_set].copy()  # copy so the column added below is safe

    # Step 2: drop molecules sharing a scaffold with the scaffold-split test
    # set. Only that test set contributes scaffolds; the random-split set
    # is filtered by exact SMILES match alone.
    excluded_scaffolds = set(held_out_scaffold['canonical_smiles'].apply(get_scaffold))

    kept['scaffold'] = kept['canonical_smiles'].apply(get_scaffold)
    shares_test_scaffold = kept['scaffold'].isin(excluded_scaffolds)
    kept = kept[~shares_test_scaffold]

    # The scaffold column was only needed for filtering; keep it out of the output.
    kept = kept.drop(columns=['scaffold'])

    output_path = os.path.join(train_data_directory, f'{target}_Train.csv')
    kept.to_csv(output_path, index=False)

    return len(kept)

# Main Execution

In [5]:
# Build and save the training set for every configured target, reporting how
# many rows each one retained after filtering.
for target in targets:
    train_size = process_target(target)
    print(
        f"Non-curated training set for {target} prepared. "
        f"Training set size: {train_size}"
    )

Non-curated training set for CHEMBL4078 prepared. Training set size: 2512
Non-curated training set for CHEMBL279 prepared. Training set size: 4717
Non-curated training set for CHEMBL5763 prepared. Training set size: 2104
Non-curated training set for CHEMBL240 prepared. Training set size: 6386
Non-curated training set for CHEMBL4005 prepared. Training set size: 4080
