In [1]:
from rdkit import Chem
import selfies as sf
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt


In [2]:
# Path to the raw input file that the processing cell further down reads line by line.
input_file = "../downloads/CID-SMILES"

In [8]:
import csv
from rdkit import Chem
from joblib import Parallel, delayed

def canonicalize_smiles(smiles: str) -> str:
    """
    Return the RDKit canonical form of a SMILES string.

    The input is parsed with ``Chem.MolFromSmiles``. When parsing yields
    ``None`` (RDKit could not build a molecule), this function returns
    ``None`` too; otherwise the molecule is serialized back with
    ``Chem.MolToSmiles(..., canonical=True)``.

    NOTE: the return annotation says ``str``, but ``None`` is a possible
    result, so callers must check for it.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        # Unparseable input: signal failure instead of raising.
        return None
    return Chem.MolToSmiles(parsed, canonical=True)

def parse_and_canonicalize_line(line: str) -> tuple:
    """
    Parse one tab-separated input line and canonicalize its SMILES field.

    The line is stripped of surrounding whitespace and split on tabs; the
    second field is treated as the SMILES string (e.g. "ID<TAB>SMILES").

    Returns ``(stripped_line, canonical_smiles)`` on success, or
    ``(None, None)`` when the line is blank, has fewer than two
    tab-separated fields, or the SMILES cannot be canonicalized.
    """
    failure = (None, None)

    stripped = line.strip()
    # A blank line yields no fields, so it falls into the same rejection
    # path as a line without a tab-separated second column.
    fields = stripped.split("\t") if stripped else []
    if len(fields) < 2:
        return failure

    canonical = canonicalize_smiles(fields[1].strip())
    return failure if canonical is None else (stripped, canonical)

def process_chunk(lines, unique_smiles_set, csv_writer):
    """
    Canonicalize a batch of lines in parallel and write the new ones to CSV.

    Every line is passed to ``parse_and_canonicalize_line`` through joblib
    (``n_jobs=-1``, ``loky`` backend). For each result with a canonical
    SMILES not yet in ``unique_smiles_set``, a row
    ``[original_line, canonical_smiles]`` is written with ``csv_writer`` and
    the canonical SMILES is added to the set. The set is mutated in place,
    so passing the same set on every call deduplicates across chunks.
    """
    # For a single-threaded run, replace the two statements below with a plain
    # list comprehension over parse_and_canonicalize_line.
    tasks = (delayed(parse_and_canonicalize_line)(raw) for raw in lines)
    parsed = Parallel(n_jobs=-1, backend="loky")(tasks)

    for source_line, canonical in parsed:
        # Skip lines that failed to parse and SMILES already written.
        if canonical is None or canonical in unique_smiles_set:
            continue
        csv_writer.writerow([source_line, canonical])
        unique_smiles_set.add(canonical)

# -----------------------------------------------------------------------------

# Driver: stream the input file in fixed-size chunks, canonicalize each chunk,
# and write every distinct canonical SMILES (with its source line) to a CSV.
input_file = "../downloads/CID-SMILES"  # Plain text file with lines to parse
output_file = "cleaned_smiles.csv"
chunk_size = 100_000  # Number of input lines handed to process_chunk at once

# Global set of canonical SMILES to handle deduplication across chunks.
# It holds every unique canonical string seen so far, so it grows with the
# number of distinct molecules in the input.
unique_smiles = set()

# Explicit UTF-8 on both files keeps reading and writing independent of the
# platform's default locale encoding.
with open(output_file, mode="w", newline="", encoding="utf-8") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["original_line", "canonical_smiles"])  # Adjust columns as needed

    # Accumulate lines in a buffer, then process them one chunk at a time so
    # the whole input never has to be held in memory.
    buffer_lines = []
    with open(input_file, "r", encoding="utf-8") as infile:
        for line in infile:
            buffer_lines.append(line)
            if len(buffer_lines) >= chunk_size:
                # Process this chunk, then reuse the same list for the next one
                process_chunk(buffer_lines, unique_smiles, writer)
                buffer_lines.clear()

        # Process any leftover lines in the final partial chunk
        if buffer_lines:
            process_chunk(buffer_lines, unique_smiles, writer)


[14:56:04] Explicit valence for atom # 1 Br, 3, is greater than permitted
[14:56:04] Explicit valence for atom # 1 Br, 5, is greater than permitted
[14:56:04] Explicit valence for atom # 1 Cl, 3, is greater than permitted
[14:56:04] Explicit valence for atom # 1 Cl, 7, is greater than permitted
[14:56:06] Explicit valence for atom # 5 Cl, 2, is greater than permitted
[14:56:06] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[14:56:06] Explicit valence for atom # 1 Cl, 5, is greater than permitted
[14:56:07] Explicit valence for atom # 4 Cl, 3, is greater than permitted
[14:56:08] Explicit valence for atom # 11 Br, 2, is greater than permitted
[14:56:08] Explicit valence for atom # 1 Br, 2, is greater than permitted
[14:56:08] Explicit valence for atom # 1 Kr, 2, is greater than permitted
[14:56:08] Explicit valence for atom # 6 Br, 2, is greater than permitted
[14:56:08] Explicit valence for atom # 1 I, 7, is greater than permitted
[14:56:11] Explicit valence for atom #