In [None]:
import ast
import os

import numpy as np
import pandas as pd

from fg_funcs import mol_to_fingerprint, safe_mol_from_smiles, fg_to_array, fp_to_array

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the dataset CSV into `chembl`; stop immediately with a clear error
# if the file is not present in the working directory.
data_path = 'chembl_35_fg_scaf.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {data_path}")
chembl = pd.read_csv(data_path)

In [3]:
# Parse each serialized `fgs` entry back into a Python object.
# ast.literal_eval only accepts Python literals (lists, strings, numbers, ...),
# so, unlike eval(), text read from the CSV can never execute arbitrary code.
# Non-string entries (e.g. NaN, or values already parsed on a re-run) pass through.
# NOTE(review): assumes every string entry is a plain literal such as a list of
# strings -- confirm against how the CSV was produced. Text that is not a valid
# literal raises ValueError/SyntaxError instead of being evaluated.
chembl['fgs'] = chembl['fgs'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Drop rows whose fgs is missing or has length zero
chembl = chembl[chembl['fgs'].notna() & (chembl['fgs'].str.len() > 0)]

In [4]:
# Display the (rows, columns) dimensions of `chembl` at this point in the notebook
chembl.shape

(2310581, 4)

In [5]:
# Build the sorted vocabulary of distinct functional groups.
# Only rows whose `fgs` value is a list contribute; duplicates collapse in the set.
all_fgs = sorted({
    group
    for row_groups in chembl['fgs']
    if isinstance(row_groups, list)
    for group in row_groups
})

In [6]:
# Build fingerprint and functional-group arrays chunk by chunk, writing each
# chunk to a CSV checkpoint so an interrupted run can resume where it stopped,
# then reload every checkpoint and concatenate them into `chembl_processed`.

# Define chunk size
chunk_size = 100000
chunk_dir = "full_chembl_chunks"
fp_length = 2048  # length of the zero vector used when no fingerprint array is available

# Ceiling division: the previous `len // chunk_size + 1` produced an extra,
# empty trailing chunk whenever the row count was an exact multiple of chunk_size.
num_chunks = (len(chembl) + chunk_size - 1) // chunk_size

# Create the checkpoint directory once, up front
os.makedirs(chunk_dir, exist_ok=True)

for i in range(num_chunks):
    chunk_path = f"{chunk_dir}/full_chembl_chunk_{i}.csv"

    # A checkpoint on disk means this chunk was completed by an earlier run
    if os.path.exists(chunk_path):
        print(f"Chunk {i+1}/{num_chunks} already processed. Skipping...")
        continue

    print(f"Processing chunk {i+1}/{num_chunks}...")

    # Extract chunk (copy so that adding columns does not touch `chembl`)
    chunk = chembl.iloc[i*chunk_size : (i+1)*chunk_size].copy()

    chunk['mol'] = chunk['smiles'].apply(safe_mol_from_smiles)

    # Fingerprints and functional group arrays
    chunk['fingerprint'] = chunk['mol'].apply(mol_to_fingerprint)
    chunk['fingerprint_array'] = chunk['fingerprint'].apply(
        lambda x: fp_to_array(x) if x is not None else None
    )
    chunk['fg_array'] = chunk['fgs'].apply(lambda x: fg_to_array(x, all_fgs))

    # Anything that is not already a numpy array is replaced by an all-zero vector
    chunk['fingerprint_array'] = chunk['fingerprint_array'].apply(
        lambda x: x if isinstance(x, np.ndarray) else np.zeros((fp_length,), dtype=int)
    )
    chunk['fg_array'] = chunk['fg_array'].apply(
        lambda x: x if isinstance(x, np.ndarray) else np.zeros((len(all_fgs),), dtype=int)
    )

    # Remove rows where fg_array is all zeros
    chunk = chunk[chunk['fg_array'].apply(lambda x: np.any(x))]

    # Only keep necessary columns
    chunk = chunk[['smiles', 'fgs', 'fingerprint_array', 'fg_array']]

    # Save checkpoint. Write to a temporary name and rename afterwards so that an
    # interrupted write never leaves a partial file that a later run would
    # mistake for a finished chunk.
    # NOTE(review): to_csv stores the ndarray columns via str(); numpy summarizes
    # arrays above its print threshold (default 1000 elements) with "...", so the
    # 2048-element fingerprints are presumably truncated on disk -- verify, and
    # consider a binary format (parquet / npy) for these columns.
    print(f"Saving chunk {i+1}/{num_chunks} to disk...")
    tmp_path = chunk_path + ".tmp"
    chunk.to_csv(tmp_path, index=False)
    os.replace(tmp_path, chunk_path)

# Combine all processed chunks
# load processed chunks
# NOTE(review): after the CSV round trip the list/array columns come back as strings.
processed_chunks = []
for i in range(num_chunks):
    chunk_path = f"{chunk_dir}/full_chembl_chunk_{i}.csv"
    if os.path.exists(chunk_path):
        processed_chunk = pd.read_csv(chunk_path)
        processed_chunks.append(processed_chunk)
    else:
        print(f"Chunk {i+1}/{num_chunks} not found. Skipping...")

# pd.concat raises an opaque ValueError on an empty list; fail with a clear message
if not processed_chunks:
    raise ValueError(f"No processed chunks found in {chunk_dir}; nothing to combine")

chembl_processed = pd.concat(processed_chunks, ignore_index=True)
print(f"Final processed dataset shape: {chembl_processed.shape}")

Chunk 1/24 already processed. Skipping...
Chunk 2/24 already processed. Skipping...
Chunk 3/24 already processed. Skipping...
Chunk 4/24 already processed. Skipping...
Chunk 5/24 already processed. Skipping...
Chunk 6/24 already processed. Skipping...
Chunk 7/24 already processed. Skipping...
Chunk 8/24 already processed. Skipping...
Chunk 9/24 already processed. Skipping...
Chunk 10/24 already processed. Skipping...
Chunk 11/24 already processed. Skipping...
Chunk 12/24 already processed. Skipping...
Chunk 13/24 already processed. Skipping...
Chunk 14/24 already processed. Skipping...
Chunk 15/24 already processed. Skipping...
Chunk 16/24 already processed. Skipping...
Chunk 17/24 already processed. Skipping...
Chunk 18/24 already processed. Skipping...
Chunk 19/24 already processed. Skipping...
Chunk 20/24 already processed. Skipping...
Chunk 21/24 already processed. Skipping...
Processing chunk 22/24...
Saving chunk 22/24 to disk...
Processing chunk 23/24...
Saving chunk 23/24 to di

In [None]:
# Write the combined frame to one CSV file (row index omitted) and report the path
output_file = 'chembl_35_fg_full.csv'
chembl_processed.to_csv(
    output_file,
    index=False,
)
print(f"Processed data saved to {output_file}")

Processed data saved to chembl_35_fg_scaf_full.csv
