In [None]:
# Deal with the existing human datasets first

import readfcs
import numpy
import shutil
from typing import List, Dict, Optional
import pandas
import os
import re
import logging

# Base directory of the data; later cells build their paths from it.
root = r"Y:/g/data/eu59/data_flowmop"
# NOTE: changes the process-wide working directory for every later cell.
os.chdir(root)

# Names of the entries currently present in the human_evals folder.
datasets = os.listdir(root + "/human_evals")
# Benchmarker acronym as it appears (lowercased, between underscores) in a file
# name -> numeric ID used in the output name "benchmarker<ID>_sample<N>.fcs".
# IDs 5, 6 and 7 are hard-coded by the copy loops in later cells.
benchmarker = {
    "fmw" : 1,
    "rj" : 2,
    "nr" : 3,
    "sc" : 4
}

# Substring looked for in a lowercased file name/path -> name of the dataset
# folder under human_evals. Lookups take the first key that matches, in the
# order written here, so keys must be lowercase and must not shadow each other.
dataset_dicts = {
    "human_liver" : "human_liver",
    "t_cell" : "human_t_cell_diff",
    "lizard_blood" : "lizard_blood",
    "lizard_heart" : "lizard_heart",
    "lizard_brain" : "lizard_brain",
    "lizard_sperm" : "lizard_sperm",
    "lizard_liver" : "lizard_liver",
    "mouse_blood" : "mouse_blood",
    "mouse_spleen" : "mouse_spleen",
    "mouse_brain" : "mouse_brain",
    "mouse_cns" : "mouse_cns",
    "mouse_colon" : "mouse_colon",
    "mouse_skin" : "mouse_skin",
    "drg" : "mouse_drg",
    "mouse_neutbonemarrow" : "mouse_neutrophil",
    "mouse_bonemarrow" : "mouse_bonemarrow",
    "mouse_smallintestine" : "mouse_smallintestine",
}

In [55]:
def organize_fcs_files(
    source_dir: str,
    dest_root: str,
    datasets: Dict[str, str],
    benchmarkers: Dict[str, int]
) -> None:
    """
    Find, classify and copy FCS files into a ``<dataset>/<sample_type>`` tree.

    Walks ``source_dir`` recursively. A file is considered only if its name ends
    with ``.fcs`` and its full path contains ``figure_2`` (both checks are
    case-insensitive). Everything else is derived from the lowercased file name:

    * dataset     - value of the first key of ``datasets`` (in dict order) that
                    occurs in the name;
    * benchmarker - a key of ``benchmarkers`` enclosed in underscores, e.g. ``_nr_``;
    * sample      - the first run of digits in the name;
    * sample type - ``debris`` if the name contains "debris", otherwise ``time``
                    if it contains "time", otherwise ``doublets`` if it contains
                    "single" or "doublet".

    A file for which any of these cannot be determined is skipped with a warning.
    Accepted files are copied to
    ``dest_root/<dataset>/<sample_type>/benchmarker<ID>_sample<N>.fcs``. An
    existing destination file is overwritten; when two source files of the same
    run map to the same destination a warning is logged before the overwrite.
    Copy failures are logged and do not stop the run.

    Args:
        source_dir (str): The directory to search for source FCS files.
        dest_root (str): The root directory where organized files will be stored.
        datasets (Dict[str, str]): Lowercase file-name keyword -> dataset folder name.
        benchmarkers (Dict[str, int]): Lowercase benchmarker acronym -> numeric ID.

    Returns:
        None. Problems are reported through ``logging`` rather than raised.
    """
    if not os.path.exists(source_dir):
        logging.error(f"Source directory '{source_dir}' does not exist. Aborting.")
        return

    if not benchmarkers:
        # An empty alternation would compile to "_()_", which matches any "__"
        # in a file name and then fails the ID lookup with KeyError('').
        logging.error("No benchmarkers were supplied. Aborting.")
        return

    # One pattern for all acronyms; keys are escaped so that regex
    # metacharacters in an acronym are matched literally.
    benchmarker_keys = '|'.join(re.escape(key) for key in benchmarkers)
    benchmarker_pattern = re.compile(f"_({benchmarker_keys})_")

    # Destination path -> source path copied there during this run, used to
    # report two inputs that collapse onto the same output file name.
    copied_from: Dict[str, str] = {}

    logging.info(f"Starting file search in: {source_dir}")
    for dirpath, _, filenames in os.walk(source_dir):
        for filename in filenames:
            # Keep the on-disk path for copying; match on a lowercased name.
            full_path = os.path.join(dirpath, filename)
            name = filename.lower()

            # Rule 1: Must be an FCS file and contain "figure_2" in the path.
            if not name.endswith('.fcs') or "figure_2" not in full_path.lower():
                continue

            # Rule 5: Determine the dataset (first matching key wins).
            dataset_name: Optional[str] = next(
                (value for key, value in datasets.items() if key in name),
                None,
            )
            if not dataset_name:
                logging.warning(f"Dataset not found for: {name}")
                continue

            # Rule 2: Find the benchmarker string.
            benchmarker_match = benchmarker_pattern.search(name)
            if not benchmarker_match:
                logging.warning(f"Benchmarker not found in: {name}")
                continue
            benchmarker_id = benchmarkers[benchmarker_match.group(1)]

            # Rule 3: Get the sample number (first integer found).
            sample_match = re.search(r'\d+', name)
            if not sample_match:
                logging.warning(f"Sample number not found in: {name}")
                continue
            sample_number = sample_match.group(0)

            # Rule 4: Determine the file type; the order of the checks matters.
            sample_type: Optional[str] = None
            if 'debris' in name:
                sample_type = 'debris'
            elif 'time' in name:
                sample_type = 'time'
            elif 'single' in name or 'doublet' in name:
                sample_type = 'doublets'

            if not sample_type:
                logging.warning(f"Sample type could not be determined for: {name}")
                continue

            # Rule 6: Construct the destination path and copy the file.
            dest_dir = os.path.join(dest_root, dataset_name, sample_type)
            new_filename = f"benchmarker{benchmarker_id}_sample{sample_number}.fcs"
            dest_path = os.path.join(dest_dir, new_filename)

            previous_source = copied_from.get(dest_path)
            if previous_source is not None:
                logging.warning(
                    f"'{full_path}' overwrites the copy of '{previous_source}' at '{dest_path}'"
                )

            try:
                os.makedirs(dest_dir, exist_ok=True)
                shutil.copy(full_path, dest_path)
            except OSError as e:
                # Covers missing files, permissions and shutil.SameFileError.
                logging.error(f"Failed to copy '{full_path}' to '{dest_path}': {e}")
            else:
                copied_from[dest_path] = full_path
    logging.info("File organization process finished.")

# Sort the human benchmarkers' figure-2 files into the human_evals tree.
organize_fcs_files(
    source_dir=root + "/human_benchmarkers",
    dest_root=root + "/human_evals",
    datasets=dataset_dicts,
    benchmarkers=benchmarker,
)

INFO:root:Starting file search in: Y:/g/data/eu59/data_flowmop/human_benchmarkers
ERROR:root:Failed to copy 'Y:/g/data/eu59/data_flowmop/human_benchmarkers\nadia\figure_2_nadia_files\lizard_liver_3_NR_time.fcs' to 'Y:/g/data/eu59/data_flowmop/human_evals\lizard_liver\time\benchmarker3_sample3.fcs': 'Y:/g/data/eu59/data_flowmop/human_benchmarkers\\nadia\\figure_2_nadia_files\\lizard_liver_3_NR_time.fcs' and 'Y:/g/data/eu59/data_flowmop/human_evals\\lizard_liver\\time\\benchmarker3_sample3.fcs' are the same file
ERROR:root:Failed to copy 'Y:/g/data/eu59/data_flowmop/human_benchmarkers\nadia\figure_2_nadia_files\mouse_spleen_5_NR_debris.fcs' to 'Y:/g/data/eu59/data_flowmop/human_evals\mouse_spleen\debris\benchmarker3_sample5.fcs': 'Y:/g/data/eu59/data_flowmop/human_benchmarkers\\nadia\\figure_2_nadia_files\\mouse_spleen_5_NR_debris.fcs' and 'Y:/g/data/eu59/data_flowmop/human_evals\\mouse_spleen\\debris\\benchmarker3_sample5.fcs' are the same file
ERROR:root:Failed to copy 'Y:/g/data/eu59/

In [58]:
# Folder that is searched for FlowMOP FCS files. "seperated" is presumably the
# on-disk spelling (the copy cell uses the same spelling) - do not correct it
# here without renaming the folder.
flowmop_dir = r"Y:/g/data/eu59/data_flowmop/seperated_flowmop_fig_2"
# Root of the per-dataset output folders (the same location that
# organize_fcs_files is asked to write to).
dataset_dir = root + "/human_evals"

In [60]:
def find_fcs_files(directory: str) -> List[str]:
    """
    Collect the paths of all FCS files below a directory.

    The whole tree under ``directory`` is walked; a file qualifies when its name
    ends with ``.fcs``, compared case-insensitively. Paths are returned in the
    order ``os.walk`` yields them, each joined onto the folder it was found in.

    Args:
        directory (str): Top of the directory tree to scan.

    Returns:
        List[str]: One path per matching file; empty when nothing matches.

    Example:
        >>> paths = find_fcs_files("Y:/g/data/eu59/data_flowmop/seperated_flowmop_fig_2")
        >>> print(paths)
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith(".fcs")
    ]

# Gather every .fcs file found anywhere under flowmop_dir; the next cell
# iterates over this list.
flowmop_files = find_fcs_files(flowmop_dir)


In [61]:
# Copy the files listed in flowmop_files into human_evals as "benchmarker 5".
for src_path in flowmop_files:
    # Match on lowercased copies only; src_path keeps its on-disk spelling so
    # the copy reads exactly the file that was found.
    path_lower = src_path.lower()
    name_lower = os.path.basename(path_lower)

    # Looked up fresh for every file: a file with no matching key must be
    # skipped, not filed under the dataset of the previous file.
    dataset = next(
        (value for key, value in dataset_dicts.items() if key in path_lower),
        None,
    )
    if dataset is None:
        logging.warning(f"Dataset not found for: {src_path}")
        continue

    # Sample number = first run of digits in the file name.
    sample_match = re.search(r'\d+', name_lower)
    if sample_match is None:
        logging.warning(f"Sample number not found in: {src_path}")
        continue
    new_filename = f"benchmarker5_sample{sample_match.group()}.fcs"

    # The full path is tested so that the debrispass/timepass/doubletpass
    # folder names count; the order of the checks matters.
    if "debris" in path_lower:
        sample_type = "debris"
    elif "time" in path_lower:
        sample_type = "time"
    elif "doublet" in path_lower:
        sample_type = "doublets"
    else:
        logging.warning(f"Sample type could not be determined for: {src_path}")
        continue

    dest_dir = root + "/human_evals/" + dataset + "/" + sample_type
    os.makedirs(dest_dir, exist_ok=True)
    shutil.copy(src_path, dest_dir + "/" + new_filename)

In [71]:
# Folder whose entries are copied by the next cell (flowCut results, judging by
# the folder name).
flowcut_root = r"Y:/g/data/eu59/data_flowmop/cleaned_compiled_fig_2_dataset_flowcut"
# Root of the per-dataset output folders.
dataset_dir = root + "/human_evals"

In [73]:
# Copy the FCS files in flowcut_root into <dataset>/time/ as "benchmarker 6".
for entry in os.listdir(flowcut_root):
    # Match on a lowercased copy only; the original spelling is kept for the
    # source path so the copy also works on case-sensitive file systems.
    entry_lower = entry.lower()

    # Everything copied is stored under a ".fcs" name, so only FCS files qualify.
    if not entry_lower.endswith(".fcs"):
        logging.warning(f"Skipping non-FCS entry: {entry}")
        continue

    # Looked up fresh for every file: a file with no matching key must be
    # skipped, not filed under the dataset of the previous file.
    dataset = next(
        (value for key, value in dataset_dicts.items() if key in entry_lower),
        None,
    )
    if dataset is None:
        logging.warning(f"Dataset not found for: {entry}")
        continue

    # Sample number = first run of digits in the file name.
    sample_match = re.search(r'\d+', entry_lower)
    if sample_match is None:
        logging.warning(f"Sample number not found in: {entry}")
        continue

    new_dir = os.path.join(dataset_dir, dataset, "time")
    os.makedirs(new_dir, exist_ok=True)
    new_filename = f"benchmarker6_sample{sample_match.group()}.fcs"
    shutil.copy(os.path.join(flowcut_root, entry), os.path.join(new_dir, new_filename))

In [83]:
# Folder whose entries are copied by the next cell (PeacoQC results, judging by
# the folder name).
peacoqc_root = r"Y:/g/data/eu59/data_flowmop/cleaned_compiled_fig_2_dataset_peacoqc"
# Root of the per-dataset output folders.
dataset_dir = root + "/human_evals"

In [84]:
# Copy the FCS files in peacoqc_root into <dataset>/time/ as "benchmarker 7".
for entry in os.listdir(peacoqc_root):
    # Match on a lowercased copy only; the original spelling is kept for the
    # source path so the copy also works on case-sensitive file systems.
    entry_lower = entry.lower()

    # Everything copied is stored under a ".fcs" name, so only FCS files qualify.
    if not entry_lower.endswith(".fcs"):
        logging.warning(f"Skipping non-FCS entry: {entry}")
        continue

    # Looked up fresh for every file: a file with no matching key must be
    # skipped, not filed under the dataset of the previous file.
    dataset = next(
        (value for key, value in dataset_dicts.items() if key in entry_lower),
        None,
    )
    if dataset is None:
        logging.warning(f"Dataset not found for: {entry}")
        continue

    # Sample number = first run of digits in the file name.
    sample_match = re.search(r'\d+', entry_lower)
    if sample_match is None:
        logging.warning(f"Sample number not found in: {entry}")
        continue

    new_dir = os.path.join(dataset_dir, dataset, "time")
    os.makedirs(new_dir, exist_ok=True)
    new_filename = f"benchmarker7_sample{sample_match.group()}.fcs"
    shutil.copy(os.path.join(peacoqc_root, entry), os.path.join(new_dir, new_filename))