In [None]:
# Setup
from pathlib import Path
from loguru import logger
from tqdm import tqdm

# Working directory for files produced by this notebook. It is created on
# demand (including parents) and reused if it already exists.
scratch_dir = Path("scratch")
scratch_dir.mkdir(parents=True, exist_ok=True)

# Register a file sink for loguru inside the scratch directory. The sink is
# configured to rotate at "1 MB" with retention=3 (see loguru's `rotation`
# and `retention` options for the exact clean-up semantics).
logger.add(scratch_dir / "analysis.log", rotation="1 MB", retention=3)
logger.info("Notebook started")



In [None]:
from __future__ import annotations

import json
import tarfile
from pathlib import Path
from typing import List, Optional

import pandas as pd
from loguru import logger
from tqdm import tqdm

# Optional RDKit import for SDF handling.
# If the import fails for any reason, the failure is logged as a warning and
# recorded in _HAS_RDKIT instead of aborting the cell; read_sdf_smiles checks
# that flag before touching Chem.
try:
    from rdkit import Chem  # type: ignore
    _HAS_RDKIT = True
except Exception as rdkit_exc:  # pragma: no cover
    # Deliberately broad: any exception raised while importing rdkit is
    # treated as "RDKit unavailable".
    logger.warning("RDKit not available: {}. SDF parsing will not work.", rdkit_exc)
    # Bind the name anyway so later references to Chem do not raise NameError.
    Chem = None  # type: ignore
    _HAS_RDKIT = False


def read_csv_column(path: Path, column: str) -> pd.DataFrame:
    """Read a CSV and return only the requested column as a DataFrame.

    The header row is parsed on its own first so that ``column`` can be
    validated (and the available names reported) without loading any data.
    The data pass then loads just that one column, so a wide file is not
    fully materialised only to discard every other column.

    Parameters
    ----------
    path : Path
        Path to the CSV file.
    column : str
        Column name to select.

    Returns
    -------
    pd.DataFrame
        A one-column frame named ``column`` with missing values dropped and
        the index renumbered from 0.

    Raises
    ------
    KeyError
        If ``column`` is not one of the header names in ``path``.
    """
    logger.info("Loading CSV: {} (column={})", path, column)

    # nrows=0 parses only the header line; enough to check the column name.
    header = pd.read_csv(path, nrows=0)
    if column not in header.columns:
        raise KeyError(f"Column '{column}' not found in {path}. Available: {list(header.columns)}")

    # usecols restricts parsing/storage to the single column we return.
    df = pd.read_csv(path, usecols=[column])
    return df[[column]].dropna().reset_index(drop=True)


def read_xls_column(path: Path, column: str) -> pd.DataFrame:
    """Load an Excel workbook and return a single named column.

    Parameters
    ----------
    path : Path
        Path to the Excel file (read with ``pd.read_excel`` defaults).
    column : str
        Column name to select.

    Returns
    -------
    pd.DataFrame
        A one-column frame named ``column`` with missing values dropped and
        the index renumbered from 0.

    Raises
    ------
    KeyError
        If ``column`` is not among the parsed column names.
    """
    logger.info("Loading Excel: {} (column={})", path, column)
    sheet = pd.read_excel(path)

    if column in sheet.columns:
        selected = sheet[[column]]
        without_gaps = selected.dropna()
        return without_gaps.reset_index(drop=True)

    raise KeyError(f"Column '{column}' not found in {path}. Available: {list(sheet.columns)}")


def read_dat_delimited_column(path: Path, column: str, sep: Optional[str] = None) -> pd.DataFrame:
    """Read a .dat file and return only the requested column.

    Tries tab, comma, and pipe separators (in that order) if none is
    provided. For each candidate separator only the header line is parsed
    first; the full file is read, restricted to ``column``, only once a
    separator is found whose header contains that column. This avoids
    parsing the whole file once per wrong separator.

    Parameters
    ----------
    path : Path
        Path to the delimited text file.
    column : str
        Column name to select.
    sep : str, optional
        Separator to use. When omitted (or empty), the default candidates
        are tried in turn.

    Returns
    -------
    pd.DataFrame
        A one-column frame named ``column`` with missing values dropped and
        the index renumbered from 0.

    Raises
    ------
    RuntimeError
        If no candidate separator yields ``column``. The last underlying
        error is included in the message and attached as ``__cause__``.
    """
    logger.info("Loading DAT: {} (column={})", path, column)
    seps = [sep] if sep else ["\t", ",", "|"]
    last_err: Optional[Exception] = None
    for s in seps:
        try:
            # Header-only probe: cheap way to reject a wrong separator.
            header = pd.read_csv(path, sep=s, nrows=0)
            if column not in header.columns:
                last_err = KeyError(f"Column '{column}' not in columns parsed with sep '{s}'")
                continue
            df = pd.read_csv(path, sep=s, usecols=[column])
            return df[[column]].dropna().reset_index(drop=True)
        except Exception as e:
            # Best-effort by design: remember the failure and try the next
            # separator; the last one is surfaced if all candidates fail.
            last_err = e
    raise RuntimeError(
        f"Failed to parse {path} for column '{column}'. Last error: {last_err}"
    ) from last_err


def read_sdf_smiles(path: Path, smiles_field: str = "canonical_smiles") -> pd.DataFrame:
    """Collect one SMILES string per readable record of an SDF file.

    For every record the SD property ``smiles_field`` is used when it is
    present and non-empty; otherwise a SMILES is generated from the parsed
    molecule. Records the supplier yields as ``None`` are skipped, as are
    records for which neither source produces a value.

    Parameters
    ----------
    path : Path
        Path to the SDF file.
    smiles_field : str
        Name of the SD property expected to hold the SMILES string.

    Returns
    -------
    pd.DataFrame
        A single column, always named ``canonical_smiles`` regardless of
        ``smiles_field``, with the index renumbered from 0.

    Raises
    ------
    ImportError
        If RDKit could not be imported when this module was loaded.
    """
    if not _HAS_RDKIT:
        raise ImportError("RDKit is required to parse SDF files. Please install rdkit-pypi.")
    logger.info("Loading SDF: {} (field={})", path, smiles_field)

    def _smiles_of(record):
        # Prefer the stored property; fall back to generating a SMILES.
        stored = record.GetProp(smiles_field) if record.HasProp(smiles_field) else None
        if stored:
            return stored
        try:
            return Chem.MolToSmiles(record)  # type: ignore[attr-defined]
        except Exception:
            # Generation failure means "no SMILES for this record".
            return None

    molecules = Chem.SDMolSupplier(str(path), removeHs=False)  # type: ignore[attr-defined]
    collected: List[str] = []
    for record in tqdm(molecules, desc=f"Reading {path.name}"):
        if record is None:
            continue
        smiles = _smiles_of(record)
        if smiles:
            collected.append(smiles)

    frame = pd.DataFrame({"canonical_smiles": collected})
    return frame.dropna().reset_index(drop=True)


def read_glycan_csv_fourth_column(path: Path) -> pd.DataFrame:
    """Return the fourth column (position 3) of a CSV as ``glycan_field``.

    Parameters
    ----------
    path : Path
        Path to the CSV file; its first row is treated as the header.

    Returns
    -------
    pd.DataFrame
        A single column renamed to ``glycan_field`` with missing values
        dropped and the index renumbered from 0.

    Raises
    ------
    ValueError
        If the file has fewer than four columns.
    """
    logger.info("Loading Glycan CSV: {} (4th column)", path)
    table = pd.read_csv(path, header=0)

    n_columns = table.shape[1]
    if n_columns < 4:
        raise ValueError(f"Expected at least 4 columns in {path}, got {n_columns}")

    # Positional selection; the original header name is replaced below.
    fourth = table.iloc[:, [3]]
    result = fourth.dropna().reset_index(drop=True)
    result.columns = ["glycan_field"]
    return result


def read_rips_from_tar_json(path: Path, json_field: str = "translation") -> pd.DataFrame:
    """Gather one top-level field from every ``.json`` member of a tar archive.

    Each regular-file member whose name ends in ``.json`` (case-insensitive)
    is parsed. When the document is a JSON object containing a truthy
    ``json_field``, that value is stringified and collected. Members that
    fail to parse are logged as warnings and skipped.

    Parameters
    ----------
    path : Path
        Path to the tar archive.
    json_field : str
        Top-level key to extract from each JSON document.

    Returns
    -------
    pd.DataFrame
        A single column named ``json_field`` holding the collected strings,
        with the index renumbered from 0.
    """
    logger.info("Loading RIPs from TAR JSON: {} (field={})", path, json_field)
    collected: List[str] = []

    with tarfile.open(path, "r") as archive:
        json_members = []
        for entry in archive.getmembers():
            if entry.isfile() and entry.name.lower().endswith(".json"):
                json_members.append(entry)

        for entry in tqdm(json_members, desc=f"Reading {path.name}"):
            handle = archive.extractfile(entry)
            if handle is None:
                continue
            # The context manager closes the extracted member on every path.
            with handle:
                try:
                    payload = json.load(handle)
                    is_object = isinstance(payload, dict)
                    if is_object and json_field in payload and payload[json_field]:
                        collected.append(str(payload[json_field]))
                except Exception as exc:
                    logger.warning("Failed to parse {} inside {}: {}", entry.name, path.name, exc)

    frame = pd.DataFrame({json_field: collected})
    return frame.dropna().reset_index(drop=True)

# Logged once this cell has finished defining the loader helpers above.
logger.info("Data loader utilities ready.")



In [None]:
# Define paths
# One absolute input path per dataset; each is passed to the matching
# reader below. The variable names are module-level and may be used by
# later cells, so they are kept exactly as they are.
SM = Path("/fsx/data/raw/drugbank/DrugBank_SM_drugs.csv")
oligos = Path("/fsx/data/raw/DNA.RNA_seq/random_DNA_RNA_sequences_10000.csv")
can_peptides = Path("/fsx/data/raw/TPDB/main.xls")
noncan_peptides = Path("/fsx/data/raw/NCPbook_noncanonicals/NCP.book_Homo_sapiens.dat")
cyclic_pep_lariat = Path("/fsx/data/raw/CycPeptMPDB/CycPeptMPDB_Peptide_Shape_Lariat.csv")
cyclic_pep_circle = Path("/fsx/data/raw/CycPeptMPDB/CycPeptMPDB_Peptide_Shape_Circle.csv")
nat_prod = Path("/fsx/data/raw/supernatural/supernatural3-11-2025.sdf")
glycans = Path("/fsx/data/raw/Glytoucan/glycan.csv")
RIPs = Path("/fsx/data/raw/mibig/mibig_json_4.0.tar")

# Load datasets into DataFrames
# Every reader returns a single-column DataFrame with missing values dropped.
# Any reader that raises stops the cell at that point.
logger.info("Starting dataset imports...")

# Approved drugs (SMs): column 'moldb_smiles'
df_sm = read_csv_column(SM, "moldb_smiles")

# Oligos/Nucleotides: column 'Sequence'
df_oligos = read_csv_column(oligos, "Sequence")

# Peptides (canonical): column 'Sequence'
df_can_peptides = read_xls_column(can_peptides, "Sequence")

# Non-canonical peptides: column 'NCP_sequence'
# No separator is passed, so the reader tries tab, comma and pipe in turn.
df_noncan_peptides = read_dat_delimited_column(noncan_peptides, "NCP_sequence")

# CycPeptMPDB (Macrocycles-lariat): column 'SMILES'
df_cyclic_pep_lariat = read_csv_column(cyclic_pep_lariat, "SMILES")

# CycPeptMPDB (Macrocycles-circular): column 'SMILES'
df_cyclic_pep_circle = read_csv_column(cyclic_pep_circle, "SMILES")

# Natural products: SDF field 'canonical_smiles'
# Requires RDKit; read_sdf_smiles raises ImportError when it is unavailable.
df_nat_prod = read_sdf_smiles(nat_prod, smiles_field="canonical_smiles")

# Glycans: 4th column
# The returned column is renamed to 'glycan_field' by the reader.
df_glycans = read_glycan_csv_fourth_column(glycans)

# RIPs: 'translation' field in each JSON inside tar
df_rips = read_rips_from_tar_json(RIPs, json_field="translation")

# Basic summary
# Row count per loaded DataFrame, keyed by the variable name that holds it.
summary = {
    "df_sm": len(df_sm),
    "df_oligos": len(df_oligos),
    "df_can_peptides": len(df_can_peptides),
    "df_noncan_peptides": len(df_noncan_peptides),
    "df_cyclic_pep_lariat": len(df_cyclic_pep_lariat),
    "df_cyclic_pep_circle": len(df_cyclic_pep_circle),
    "df_nat_prod": len(df_nat_prod),
    "df_glycans": len(df_glycans),
    "df_rips": len(df_rips),
}
logger.info("Import summary: {}", summary)
# Bare expression left as the cell's last statement (a notebook displays it
# as the cell output).
summary
