In [None]:
import ast
import os

import numpy as np
import pandas as pd
from scipy import sparse

from fg_funcs import mol_to_fingerprint, safe_mol_from_smiles, fg_to_array, fp_to_array

In [None]:
# Read the input CSV into `chembl`, failing fast with a clear error if it is absent
data_path = 'data/chembl_35_fg_scaf.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {data_path}")
chembl = pd.read_csv(data_path)

In [None]:
# Convert each fgs entry from its string form back into a Python list.
# ast.literal_eval only accepts Python literals, so (unlike eval) a crafted or
# corrupted cell in the CSV cannot execute arbitrary code. Values that are
# already non-strings (e.g. NaN, or lists when the cell is re-run) pass through.
chembl['fgs'] = chembl['fgs'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Drop rows whose fgs is missing or empty; `.str.len()` also measures list length.
# `.copy()` detaches the result so later column assignments do not hit a slice.
chembl = chembl[chembl['fgs'].notna() & (chembl['fgs'].str.len() > 0)].copy()

In [None]:
# Display (rows, columns) of `chembl` as a quick size check
chembl.shape

In [None]:
# Flatten every list-valued `fgs` entry into one long list of labels, then keep
# the 50 labels that occur most often (most frequent first).
fgs_list = [
    label
    for entry in chembl['fgs']
    if isinstance(entry, list)
    for label in entry
]
label_counts = pd.Series(fgs_list).value_counts()
sorted_fgs = label_counts.head(50).index.tolist()

print(sorted_fgs)

In [None]:
# Featurise `chembl` in fixed-size chunks, checkpointing each chunk to its own
# CSV so an interrupted run can resume instead of starting over, then reload all
# checkpoints into a single `chembl_processed` frame.
# NOTE: an existing chunk file is reused as-is; delete the checkpoint directory
# after changing `chembl` or `sorted_fgs`, otherwise stale chunks are loaded.
chunk_size = 100000
chunks_dir = "full_chembl_chunks"

# Ceiling division: `len // chunk_size + 1` scheduled an extra, empty trailing
# chunk whenever the row count was an exact multiple of chunk_size.
num_chunks = -(-len(chembl) // chunk_size)

for i in range(num_chunks):
    chunk_path = f"{chunks_dir}/full_chembl_chunk_{i}.csv"
    if os.path.exists(chunk_path):
        print(f"Chunk {i+1}/{num_chunks} already processed. Skipping...")
        continue

    print(f"Processing chunk {i+1}/{num_chunks}...")

    # Extract chunk (copy so the added columns never touch `chembl`)
    chunk = chembl.iloc[i*chunk_size : (i+1)*chunk_size].copy()

    chunk['mol'] = chunk['smiles'].apply(safe_mol_from_smiles)

    # Fingerprints and functional group arrays
    chunk['fingerprint'] = chunk['mol'].apply(mol_to_fingerprint)
    chunk['fingerprint_array'] = chunk['fingerprint'].apply(
        lambda x: fp_to_array(x) if x is not None else None
    )
    chunk['fg_array'] = chunk['fgs'].apply(lambda x: fg_to_array(x, sorted_fgs))

    # Anything that did not come back as an ndarray is replaced by a zero vector.
    # The 2048 length presumably matches what fp_to_array returns — TODO confirm.
    chunk['fingerprint_array'] = chunk['fingerprint_array'].apply(
        lambda x: x if isinstance(x, np.ndarray) else np.zeros((2048,), dtype=int)
    )
    chunk['fg_array'] = chunk['fg_array'].apply(
        lambda x: x if isinstance(x, np.ndarray) else np.zeros((len(sorted_fgs),), dtype=int)
    )

    # Keep only rows with at least one selected functional group, and only the
    # columns needed downstream. The mask is forced to bool dtype so an empty
    # selection is still treated as a boolean mask, and `.copy()` avoids
    # assigning into a slice of the unfiltered chunk below.
    has_fg = chunk['fg_array'].apply(lambda x: bool(np.any(x))).astype(bool)
    chunk = chunk.loc[has_fg, ['smiles', 'fgs', 'fingerprint_array', 'fg_array']].copy()

    # Convert arrays to comma-separated strings so they survive the CSV round trip
    chunk['fingerprint_array'] = chunk['fingerprint_array'].apply(
        lambda x: ','.join(map(str, x.tolist()))
    )
    chunk['fg_array'] = chunk['fg_array'].apply(
        lambda x: ','.join(map(str, x.tolist()))
    )

    # Save checkpoint. Write to a temporary name and move it into place so a
    # run killed mid-write never leaves a truncated file that the next run
    # would mistake for a finished chunk.
    os.makedirs(chunks_dir, exist_ok=True)
    print(f"Saving chunk {i+1}/{num_chunks} to disk...")
    tmp_path = chunk_path + ".tmp"
    chunk.to_csv(tmp_path, index=False)
    os.replace(tmp_path, chunk_path)

# Combine all processed chunks by reloading every checkpoint from disk
processed_chunks = []
for i in range(num_chunks):
    chunk_path = f"{chunks_dir}/full_chembl_chunk_{i}.csv"
    if os.path.exists(chunk_path):
        processed_chunks.append(pd.read_csv(chunk_path))
    else:
        print(f"Chunk {i+1}/{num_chunks} not found. Skipping...")

# pd.concat raises ValueError if no chunk could be loaded
chembl_processed = pd.concat(processed_chunks, ignore_index=True)
print(f"Final processed dataset shape: {chembl_processed.shape}")

In [None]:
# Re-encode the comma-separated array strings as CSR components (values, column
# indices, row pointer, and row width) and drop the dense string columns.
def _csr_from_text(text):
    """Parse a comma-separated numeric string into a one-row CSR matrix."""
    return sparse.csr_matrix(np.fromstring(text, sep=','))


# (output prefix, source column) — functional groups first, then fingerprints,
# which fixes the order in which the new columns are appended.
for prefix, source_col in (('fg', 'fg_array'), ('fp', 'fingerprint_array')):
    row_matrices = chembl_processed[source_col].apply(_csr_from_text)
    chembl_processed[f'{prefix}_data_values'] = row_matrices.apply(lambda m: m.data.tolist())
    chembl_processed[f'{prefix}_indices'] = row_matrices.apply(lambda m: m.indices.tolist())
    chembl_processed[f'{prefix}_indptr'] = row_matrices.apply(lambda m: m.indptr.tolist())
    chembl_processed[f'{prefix}_length'] = row_matrices.apply(lambda m: m.shape[1])

chembl_processed = chembl_processed.drop(columns=['fg_array', 'fingerprint_array'])

In [None]:
import shutil

# Persist the final table as CSV
output_file = 'data/chembl_35_fg_full.csv'
chembl_processed.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")

# Remove the per-chunk checkpoint directory now that the combined file is saved
chunks_dir = "full_chembl_chunks"
if not os.path.exists(chunks_dir):
    print(f"Directory not found: {chunks_dir}")
else:
    shutil.rmtree(chunks_dir)
    print(f"Deleted directory: {chunks_dir}")

In [None]:
# Preview the first row of the final table to sanity-check its columns
chembl_processed.head(1)