In [19]:
import ast
import os

import numpy as np
import pandas as pd
from scipy import sparse

from fg_funcs import mol_to_fingerprint, safe_mol_from_smiles, fg_to_array, fp_to_array

In [9]:
# Read the ChEMBL functional-group table, failing fast with a clear message
# when the CSV is not where we expect it.
data_path = 'data/chembl_35_fg_scaf.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {data_path}")
chembl = pd.read_csv(data_path)

In [10]:
def _parse_fgs(raw):
    """Turn one serialized ``fgs`` cell back into a Python object.

    String cells are parsed with ``ast.literal_eval``, which only accepts
    Python literals (lists, tuples, strings, numbers, ...). The previous
    ``eval`` call would have executed arbitrary code embedded in the CSV.
    Non-string cells (e.g. NaN for missing values) are returned unchanged.

    Raises:
        ValueError / SyntaxError: if a string cell is not a valid literal.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


# Convert each fgs entry to a list
chembl['fgs'] = chembl['fgs'].apply(_parse_fgs)

# Drop rows with missing or empty fgs (``.str.len()`` also measures lists)
chembl = chembl[chembl['fgs'].notna() & (chembl['fgs'].str.len() > 0)]

In [11]:
# Row/column count after dropping rows with missing or empty fgs.
chembl.shape

(2310581, 4)

In [12]:
# Build the functional-group vocabulary: every distinct entry that appears in
# any list-valued `fgs` cell, in sorted order so positions are reproducible.
all_fgs = sorted({
    group
    for row_groups in chembl['fgs']
    if isinstance(row_groups, list)
    for group in row_groups
})

In [13]:
# Featurise the dataset in fixed-size chunks, checkpointing each chunk to disk
# so an interrupted run can resume where it stopped.
chunk_size = 100000
# Ceiling division: the previous `len // chunk_size + 1` scheduled an extra,
# empty chunk whenever the row count was an exact multiple of chunk_size.
num_chunks = (len(chembl) + chunk_size - 1) // chunk_size


def _array_to_text(vec):
    """Serialize a 1-D numpy array as a comma-separated string for CSV storage."""
    return ','.join(map(str, vec.tolist()))


def _featurize_chunk(chunk, fg_vocab, fp_size=2048):
    """Compute fingerprint and functional-group vectors for one slice of rows.

    Args:
        chunk: DataFrame slice with at least `smiles` and `fgs` columns.
        fg_vocab: ordered list of functional groups passed to `fg_to_array`.
        fp_size: length of the all-zero placeholder used when no fingerprint
            array is available (assumed to match the length produced by
            `fp_to_array` — TODO confirm against fg_funcs).

    Returns:
        A new DataFrame with columns `smiles`, `fgs`, `fingerprint_array` and
        `fg_array`, the two array columns serialized as comma-separated
        strings. Rows whose functional-group vector is all zeros are removed.
    """
    # Work on a private copy so column assignments never touch the caller's frame.
    chunk = chunk.copy()

    chunk['mol'] = chunk['smiles'].apply(safe_mol_from_smiles)

    # Fingerprints and functional group arrays
    chunk['fingerprint'] = chunk['mol'].apply(mol_to_fingerprint)
    chunk['fingerprint_array'] = chunk['fingerprint'].apply(
        lambda fp: fp_to_array(fp) if fp is not None else None
    )
    chunk['fg_array'] = chunk['fgs'].apply(lambda groups: fg_to_array(groups, fg_vocab))

    # Anything that is not a numpy array (e.g. None from a failed fingerprint)
    # is replaced by an all-zero placeholder vector.
    chunk['fingerprint_array'] = chunk['fingerprint_array'].apply(
        lambda arr: arr if isinstance(arr, np.ndarray) else np.zeros((fp_size,), dtype=int)
    )
    chunk['fg_array'] = chunk['fg_array'].apply(
        lambda arr: arr if isinstance(arr, np.ndarray) else np.zeros((len(fg_vocab),), dtype=int)
    )

    # Remove rows where fg_array is all zeros and keep only the needed columns.
    # The explicit .copy() avoids SettingWithCopyWarning on the assignments below.
    has_fg = chunk['fg_array'].apply(lambda arr: bool(np.any(arr)))
    chunk = chunk.loc[has_fg, ['smiles', 'fgs', 'fingerprint_array', 'fg_array']].copy()

    # Convert arrays to comma-separated strings before saving
    chunk['fingerprint_array'] = chunk['fingerprint_array'].apply(_array_to_text)
    chunk['fg_array'] = chunk['fg_array'].apply(_array_to_text)
    return chunk


# Create the checkpoint directory once, up front.
os.makedirs("full_chembl_chunks", exist_ok=True)

for i in range(num_chunks):
    chunk_path = f"full_chembl_chunks/full_chembl_chunk_{i}.csv"
    if os.path.exists(chunk_path):
        print(f"Chunk {i+1}/{num_chunks} already processed. Skipping...")
        continue

    print(f"Processing chunk {i+1}/{num_chunks}...")
    chunk = _featurize_chunk(chembl.iloc[i*chunk_size : (i+1)*chunk_size], all_fgs)

    # Save checkpoint. Write to a temporary name and rename afterwards so a
    # failed write (e.g. a full disk) never leaves a truncated file that a
    # later run would mistake for a finished chunk.
    print(f"Saving chunk {i+1}/{num_chunks} to disk...")
    tmp_path = chunk_path + ".tmp"
    chunk.to_csv(tmp_path, index=False)
    os.replace(tmp_path, chunk_path)

# Combine all processed chunks: reload every checkpoint from disk
processed_chunks = []
for i in range(num_chunks):
    chunk_path = f"full_chembl_chunks/full_chembl_chunk_{i}.csv"
    if os.path.exists(chunk_path):
        processed_chunks.append(pd.read_csv(chunk_path))
    else:
        print(f"Chunk {i+1}/{num_chunks} not found. Skipping...")

if not processed_chunks:
    # pd.concat would raise a less helpful "No objects to concatenate".
    raise ValueError("No processed chunks were found in full_chembl_chunks/")

chembl_processed = pd.concat(processed_chunks, ignore_index=True)
print(f"Final processed dataset shape: {chembl_processed.shape}")

Processing chunk 1/24...
Saving chunk 1/24 to disk...
Processing chunk 2/24...
Saving chunk 2/24 to disk...
Processing chunk 3/24...
Saving chunk 3/24 to disk...
Processing chunk 4/24...
Saving chunk 4/24 to disk...
Processing chunk 5/24...
Saving chunk 5/24 to disk...
Processing chunk 6/24...
Saving chunk 6/24 to disk...
Processing chunk 7/24...
Saving chunk 7/24 to disk...
Processing chunk 8/24...
Saving chunk 8/24 to disk...
Processing chunk 9/24...
Saving chunk 9/24 to disk...
Processing chunk 10/24...
Saving chunk 10/24 to disk...
Processing chunk 11/24...
Saving chunk 11/24 to disk...
Processing chunk 12/24...
Saving chunk 12/24 to disk...
Processing chunk 13/24...
Saving chunk 13/24 to disk...
Processing chunk 14/24...
Saving chunk 14/24 to disk...
Processing chunk 15/24...
Saving chunk 15/24 to disk...
Processing chunk 16/24...
Saving chunk 16/24 to disk...
Processing chunk 17/24...
Saving chunk 17/24 to disk...
Processing chunk 18/24...
Saving chunk 18/24 to disk...
Processing

In [18]:
# Peek at the first rows; after the CSV round-trip each fingerprint is stored
# as a comma-separated string rather than a numpy array.
chembl_processed['fingerprint_array'].head()

0    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
1    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
2    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
3    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
4    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
Name: fingerprint_array, dtype: object

In [None]:
def _sparse_components(text):
    """Split one comma-separated vector into its CSR building blocks.

    The string is parsed with ``np.fromstring(text, sep=',')`` and wrapped in a
    single-row ``scipy.sparse.csr_matrix``.

    Returns:
        A 4-tuple ``(data, indices, indptr, length)`` where the first three are
        the matrix's ``data``/``indices``/``indptr`` arrays as Python lists and
        ``length`` is the number of columns (``shape[1]``).
    """
    row = sparse.csr_matrix(np.fromstring(text, sep=','))
    return row.data.tolist(), row.indices.tolist(), row.indptr.tolist(), row.shape[1]


# Replace the dense string columns with their sparse components. Both vector
# columns go through the same helper, and the CSR matrix for each row is built
# once and discarded instead of being kept in an intermediate column.
_component_suffixes = ('data_values', 'indices', 'indptr', 'length')
for _prefix, _source_col in (('fg', 'fg_array'), ('fp', 'fingerprint_array')):
    _parts = chembl_processed[_source_col].map(_sparse_components)
    for _pos, _suffix in enumerate(_component_suffixes):
        # Bind _pos as a default so each lambda picks its own tuple slot.
        chembl_processed[f'{_prefix}_{_suffix}'] = _parts.map(lambda p, pos=_pos: p[pos])

chembl_processed = chembl_processed.drop(columns=['fg_array', 'fingerprint_array'])

In [16]:
# Save the processed data to a CSV file.
# The frame is written to a temporary sibling file and only renamed to its
# final name once the write has fully succeeded. If the write fails (for
# example with "No space left on device"), the partial file is removed and the
# error re-raised, so a truncated CSV is never left at `output_file`.
output_file = 'data/chembl_35_fg_full.csv'
partial_file = output_file + '.partial'
try:
    chembl_processed.to_csv(partial_file, index=False)
    os.replace(partial_file, output_file)
except OSError:
    if os.path.exists(partial_file):
        os.remove(partial_file)
    raise
print(f"Processed data saved to {output_file}")

OSError: [Errno 28] No space left on device