# MM pipeline: Run the StarDist2D segmentation model over all folders

This notebook loads a pretrained StarDist2D segmentation model and applies the segmentation prediction on all folders within the masterfolder mainf (defined in 2nd code cell). Only microscopy chamber data containing folders should be within mainf. The segmentation is applied onto all images that end with *_PH.tif* and the segmentation image is saved into a newly created folder within each image folder named *seg_sd2*. For the moment, it assumes single-page tif files and saves single-page tif files with the exact same name as the input image used for segmentation prediction.

### Load main config file. Adapt directory

In [1]:
#mainconfigname = 'jbanalysisconfig_mmrev';
mainconfigname = 'config_example_matched';
configdir = 'G://GitHub/microfluidics-image-processing/MM_pipeline';

if not mainconfigname.endswith('.json'):
    mainconfigname += '.json'
    
if not configdir.endswith('/'):
    configdir += '/'

import json
# Read JSON data
with open(configdir+mainconfigname, 'r') as file:
    data = json.load(file)

# Assign each key-value pair as a variable
for key, value in data.items():
    globals()[key] = value

### Load various packages

In [2]:
import numpy as np
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
from tifffile import imread, imwrite
from datetime import datetime
from csbdeep.utils import Path, normalize
from skimage.measure import regionprops_table
from skimage import io
from skimage import segmentation
from skimage import color
from stardist.matching import matching_dataset
from stardist import fill_label_holes, random_label_cmap, relabel_image_stardist, calculate_extents, gputools_available, _draw_polygons
from stardist.models import Config2D, StarDist2D, StarDistData2D
import os
from tensorflow import keras
import cv2
import pandas as pd
from datetime import date
import re
from pathlib import Path
np.random.seed(42)
lbl_cmap = random_label_cmap()

def add_prefix(props, prefix):
    return {f"{prefix}_{key}" if 'intensity' in key else key: value for key, value in props.items()}

### Check if GPU can be accessed

In [3]:
gputools_available()

__init__.py (276): Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.


If you want to compute separable approximations, please install it with
pip install scikit-tensor-py3


True

### Load in meta file and display head. Check if correct

In [4]:
meta = pd.read_csv(os.path.join(masterdir,metacsv), dtype={'stardist': str})
replicates = meta.replicate.unique()
meta.tail()

Unnamed: 0,date,replicate,chip,pos,channel,Process,replicate2,Process2,rep2startdifferencemin,rep2firstframe,...,StageY,PxinUmX,PxinUmY,register,stardist,stardist_data,stardist_data_cor,stardist_fails,delta,delta_fails
15,NaT,r10,c1,5,2,3818,r10b,3931.0,228.0,37.0,...,-49316.403313,0.065,0.065,Done,,,,,,
16,NaT,r11,c1,5,1,4014,r11b,4046.0,50.0,8.0,...,-43571.361981,0.065,0.065,Done,,,,,,
17,NaT,r11,c1,17,3,4026,r11b,4058.0,50.0,8.0,...,-39732.806793,0.065,0.065,Done,,,,,,
18,NaT,r11,c1,26,4,4036,r11b,4068.0,50.0,8.0,...,-37792.564308,0.065,0.065,Done,,,,,,
19,NaT,r13,c1,16,6,2567,,,,,...,-37932.46,0.065,0.065,Done,,,,,,


### Load stardist model
Here, the model is loaded. You need to specify the dir which contains a folder named *stardist* in the config file. This *stardist* folder needs to contain the files *weigths_best.h5* as well as the *config.json* and optionally the *thresholds.json*

In [5]:
print(stardistmodeldir)
model = StarDist2D(None, name='stardist', basedir=stardistmodeldir)
axis_norm = (0,1)   # normalize channels independently

G://GitHub/microfluidics-image-processing/stardist_models/mm
Loading network weights from 'weights_best.h5'.
Loading thresholds from 'thresholds.json'.
Using default values: prob_thresh=0.586968, nms_thresh=0.3.


### Define regionprops parameters. You could add more if you want to

In [6]:
if n_channel>1:
    flims = True;
    prop_list = ['label', 
                'area', 'centroid', 
                'axis_major_length', 'axis_minor_length',
                 'eccentricity',
                'intensity_mean', 'intensity_max']
else:
    flims = False;
    prop_list = ['label', 
                'area', 'centroid', 
                'axis_major_length', 'axis_minor_length',
                 'eccentricity'] 

### Limit GPU RAM usage by StarDist

In [7]:
from csbdeep.utils.tf import limit_gpu_memory
# adjust as necessary: limit GPU memory to be used by TensorFlow to leave some to OpenCL-based computations
limit_gpu_memory(fraction=ramlimit, total_memory=ramsize)
# alternatively, try this:
# limit_gpu_memory(None, allow_growth=True)

Virtual devices cannot be modified after being initialized


## Main segmentation loop
This loop goes over each row in the meta file which is marked with completed preprocessing (Progress == 'Done') and applies the StarDist segmentation model to each position/chamber iteratively. For the moment, not paralellized but could probably benefit from that.

In [8]:
# Patch Keras model's predict to always use verbose=0
orig_predict = model.keras_model.predict
def predict_no_verbose(*args, **kwargs):
    kwargs['verbose'] = 0
    return orig_predict(*args, **kwargs)
model.keras_model.predict = predict_no_verbose

In [9]:
# helper to robustly extract trailing integer from a path basename
def extract_trailing_int_from_basename(path):
    name = os.path.basename(path)
    m = re.search(r'(\d+)$', name)
    return int(m.group(1)) if m else None

In [10]:
# Fixed version of the processing loop to correctly extract pos and chamber numbers
# - Removed the incorrect "+1" adjustment that caused Pos20 -> Pos21 in filenames/columns
# - Robustly extracts trailing integers from folder basenames (handles Pos5, Pos05, Pos20, etc.)
# - Uses the chamber folder as the 'folder' column so the source is exact
# - Keeps the rest of your logic intact (stardist prediction, saving per-chamber CSVs)
#
# Replace your original loop with this code (or drop it into your script),
# it only changes how pos/chamber are derived and what 'folder' is saved.

# --- begin processing (replace your loop body) ---
for i in range(0, meta.shape[0]):
    # reload metadata each iteration if you need the file updated by others,
    # otherwise you can read once before the loop for speed.
    meta = pd.read_csv(os.path.join(masterdir, metacsv), dtype={'stardist': str, 'stardist_data': str})

    if meta.loc[i, 'stardist'] == 'Done' or meta.loc[i, 'Exclude'] == 'excl' or not meta.loc[i, ('register')] == 'Done':
        continue

    main_folder = os.path.join(masterdir, savedirname, meta.replicate[i], 'Chambers')
    save_directory = os.path.join(main_folder, 'stardist2')
    os.makedirs(save_directory, exist_ok=True)

    current_directory = os.path.join(main_folder, f'Pos{str(meta.pos[i]).zfill(2)}')
    if not os.path.exists(current_directory):
        continue

    chambf = [f.path for f in os.scandir(current_directory) if f.is_dir()]
    chambf = [k for k in chambf if 'Chamb' in k]

    fails = []

    for chambi in tqdm(range(0, len(chambf)), desc=meta.replicate[i] + ', Pos ' + str(meta.pos[i]).zfill(2)):
        inputs_folder = chambf[chambi]
        outputs_folder = os.path.join(inputs_folder, "seg_sd2")
        os.makedirs(outputs_folder, exist_ok=True)
        for file in Path(outputs_folder).glob('*tif'):
            os.remove(file)

        images = sorted(Path(inputs_folder).glob('*Ch1*tif'))
        if flims:
            images_fl = sorted(Path(inputs_folder).glob('*Ch2*tif'))
            if n_channel > 2:
                images_fl2 = sorted(Path(inputs_folder).glob('*Ch3*tif'))

        max_frame = meta.loc[i, 'MaxFr']
        frame_list = range(len(images))

        # ----> Create a DataFrame for this chamber
        chamber_frames_df = None

        # derive pos and chamber numbers robustly from folder names (no +1)
        # If you want 1-based numbering, adjust here, but do so consciously.
        pos_number = extract_trailing_int_from_basename(current_directory)
        chamb_number = extract_trailing_int_from_basename(inputs_folder)

        for frame_index in frame_list:
            try:
                if flims:
                    fluorescence_image = imread(images_fl[frame_index])
                    if n_channel > 2:
                        fluorescence_image2 = imread(images_fl2[frame_index])

                main_image = imread(images[frame_index])
                normalized_image = normalize(main_image, 1, 99.8, axis=axis_norm)
                labels, details = model.predict_instances(normalized_image, verbose=0)
                filename_segmentation = os.path.join(outputs_folder, os.path.basename(images[frame_index]))
                imwrite(filename_segmentation, labels, append=False, metadata=None)

                region_props = regionprops_table(labels, intensity_image=fluorescence_image if flims else None, properties=prop_list)
                if flims and n_channel > 2:
                    region_props = add_prefix(region_props, 'fluor1')
                    region_props2 = regionprops_table(labels, intensity_image=fluorescence_image2, properties=prop_list)
                    region_props2 = add_prefix(region_props2, 'fluor2')
                    for key, value in region_props2.items():
                        if 'intensity' in key:
                            region_props[key] = value

                region_props_df = pd.DataFrame(region_props)

                # Insert columns with correct values (no erroneous +1)
                region_props_df.insert(0, 'frame', frame_index + 1)  # keep frames 1-based if desired
                # use pos_number and chamb_number extracted from folder names
                region_props_df.insert(0, 'pos', pos_number if pos_number is not None else meta.pos[i])
                region_props_df.insert(0, 'replicate', meta.replicate[i])
                # insert chamber after replicate and pos to keep a similar layout as before
                region_props_df.insert(2, 'chamber', chamb_number if chamb_number is not None else os.path.basename(inputs_folder))
                # use the actual chamber folder as the folder column (more precise)
                region_props_df['folder'] = inputs_folder

                if chamber_frames_df is None:
                    chamber_frames_df = region_props_df
                else:
                    chamber_frames_df = pd.concat([chamber_frames_df, region_props_df], ignore_index=True)

            except Exception as e:
                fails.append(f"Error processing folder {current_directory}, Chamber {inputs_folder}, Frame {frame_index}: {e}")

        # ----> Save the DataFrame for this chamber after all frames are processed
        if chamber_frames_df is not None:
            # format csv filename using the extracted integers (no +1)
            if pos_number is None:
                pos_str = str(int(meta.pos[i])).zfill(2)
            else:
                pos_str = str(int(pos_number)).zfill(2)
            if chamb_number is None:
                # fallback: extract last two chars then zfill
                chamb_str = os.path.basename(inputs_folder)[-2:].zfill(2)
            else:
                chamb_str = str(int(chamb_number)).zfill(2)

            csv_filename = f"Pos{pos_str}Chamb{chamb_str}.csv"
            chamber_frames_df.to_csv(os.path.join(save_directory, csv_filename), index=False)

    # ----> Update metadata as before
    meta = pd.read_csv(os.path.join(masterdir, metacsv), dtype={'stardist': str})
    meta.at[i, 'stardist'] = 'Done'
    if fails:
        meta.at[i, 'stardist_fails'] = '; '.join(fails)
    meta.to_csv(os.path.join(masterdir, metacsv), index=False)
# --- end processing ---

r07, Pos 02: 100%|██████████| 23/23 [26:23<00:00, 68.85s/it]  


ValueError: could not convert string to float: 'Done'

#DONE

below is a section to ad hoc run the data extraction from the segmented images

In [None]:
meta = pd.read_csv(os.path.join(masterdir, metacsv), dtype={'stardist_data': str})

for i in range(meta.shape[0]):
    meta = pd.read_csv(os.path.join(masterdir, metacsv), dtype={'stardist_data': str})
    if meta.loc[i, 'stardist_data'] == 'Done' or meta.loc[i, 'Exclude'] == 'excl' or not meta.loc[i, 'register'] == 'Done':
        continue

    # Directory setup for current experiment
    main_folder = os.path.join(masterdir, savedirname, meta.replicate[i], 'Chambers')
    save_directory = os.path.join(main_folder, 'stardist2')
    os.makedirs(save_directory, exist_ok=True)

    current_directory = os.path.join(main_folder, f'Pos{str(meta.pos[i]).zfill(2)}')
    if not os.path.exists(current_directory):
        continue

    chambf = [f.path for f in os.scandir(current_directory) if f.is_dir()]
    chambf = [k for k in chambf if 'Chamb' in k]

    fails = []

    for chambi in tqdm(range(0, len(chambf)), desc=meta.replicate[i] + ', Pos ' + str(meta.pos[i]).zfill(2)):
        inputs_folder = chambf[chambi]
        seg_outputs_folder = os.path.join(inputs_folder, "seg_sd2")
        if not os.path.exists(seg_outputs_folder):
            fails.append(f"Missing segmentation folder: {seg_outputs_folder}")
            continue

        seg_images = sorted(Path(seg_outputs_folder).glob('*.tif'))
        if len(seg_images) == 0:
            fails.append(f"No segmentation images in {seg_outputs_folder}")
            continue

        # Optional: fluorescence images (if needed; adjust as per your setup)
        if flims:
            images_fl = sorted(Path(inputs_folder).glob('*Ch2*tif'))
            if n_channel > 2:
                images_fl2 = sorted(Path(inputs_folder).glob('*Ch3*tif'))

        # ----> Chamber-level DataFrame
        chamber_frames_df = None

        for frame_index, seg_path in enumerate(seg_images):
            try:
                labels = imread(seg_path)
                if flims:
                    fluorescence_image = imread(images_fl[frame_index])
                    if n_channel > 2:
                        fluorescence_image2 = imread(images_fl2[frame_index])

                region_props = regionprops_table(
                    labels,
                    intensity_image=fluorescence_image if flims else None,
                    properties=prop_list
                )
                # If >2 channels, handle as before (merge dictionaries, etc)

                region_props_df = pd.DataFrame(region_props)
                # 2-digit, 1-based indices for pos and chamber
                pos_str = str(int(os.path.basename(current_directory)[-2:]) + 1).zfill(2)
                chamb_str = str(int(os.path.basename(inputs_folder)[-2:]) + 1).zfill(2)

                region_props_df.insert(0, 'frame', frame_index + 1)
                region_props_df.insert(0, 'pos', int(pos_str))
                region_props_df.insert(0, 'replicate', meta.replicate[i])
                region_props_df.insert(2, 'chamber', int(chamb_str))
                region_props_df['folder'] = current_directory

                if chamber_frames_df is None:
                    chamber_frames_df = region_props_df
                else:
                    chamber_frames_df = pd.concat([chamber_frames_df, region_props_df], ignore_index=True)

            except Exception as e:
                fails.append(f"Error processing {seg_path}: {e}")

        # ----> Save DataFrame for this chamber after all frames are processed
        if chamber_frames_df is not None:
            csv_filename = f"Pos{pos_str}Chamb{chamb_str}.csv"
            chamber_frames_df.to_csv(os.path.join(save_directory, csv_filename), index=False)

    # Update metadata
    meta = pd.read_csv(os.path.join(masterdir, metacsv), dtype={'stardist_data': str})
    meta.at[i, 'stardist_data'] = 'Done'
    if fails:
        meta.at[i, 'stardist_fails'] = '; '.join(fails)
    meta.to_csv(os.path.join(masterdir, metacsv), index=False)

# DONE

# CORRECT MY WRONG LABELLING!

In [None]:
# logging_notebook_snippet.py
# Helper to configure the "fix_stardist" logger for Jupyter Lab (stdout + file)

import logging
import os
import sys
from logging.handlers import RotatingFileHandler
import datetime

def iso_ts():
    return datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

def configure_fix_stardist_logger(masterdir,
                                  level=logging.INFO,
                                  to_file=True,
                                  maxBytes=5*1024*1024,
                                  backupCount=5,
                                  logger_name="fix_stardist"):
    """
    Configure the logger used in the fixer so INFO+ messages are visible in Jupyter cell output
    and optionally saved to a timestamped log file under masterdir/stardist_fix_logs/.
    This removes any existing handlers from the named logger (so previous Jupyter or library
    handlers don't interfere), sets propagate=False, and returns the configured logger.

    Parameters:
      - masterdir: base directory where log folder will be created (used only if to_file=True)
      - level: logging level (logging.INFO recommended)
      - to_file: whether to create a rotating file handler in masterdir/stardist_fix_logs
      - maxBytes, backupCount: rotating file handler params
      - logger_name: name of the logger to configure (default 'fix_stardist')
    """
    log = logging.getLogger(logger_name)

    # Remove existing handlers so previous configuration doesn't interfere
    for h in list(log.handlers):
        log.removeHandler(h)
    # Prevent message propagation to root logger which might filter INFO
    log.propagate = False

    # Set level
    log.setLevel(level)

    # Stream handler to stdout so messages appear in notebook output
    sh = logging.StreamHandler(stream=sys.stdout)
    sh.setLevel(level)
    sh_formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
    sh.setFormatter(sh_formatter)
    log.addHandler(sh)

    if to_file:
        # Ensure directory exists
        log_dir = os.path.join(masterdir, "stardist_fix_logs")
        os.makedirs(log_dir, exist_ok=True)
        log_filename = os.path.join(log_dir, f"fix_stardist_{iso_ts()}.log")
        fh = RotatingFileHandler(log_filename, maxBytes=maxBytes, backupCount=backupCount, encoding="utf-8")
        fh.setLevel(level)
        fh.setFormatter(sh_formatter)
        log.addHandler(fh)
        log.info(f"Logging started. File: {log_filename}")

    # show handler info for quick confirmation in notebook
    def _show_handlers():
        print(f"Logger '{logger_name}' configured. Level: {logging.getLevelName(log.level)}")
        for i, h in enumerate(log.handlers):
            fname = getattr(h, "baseFilename", None)
            print(f" Handler {i}: {h.__class__.__name__}, level={logging.getLevelName(h.level)}, file={fname}")
    _show_handlers()

    return log

In [None]:
# configure logging to both console and a timestamped file
import logging
from logging.handlers import RotatingFileHandler
import os

# place this after you define iso_ts() so iso_ts() can be used for filename
log_dir = os.path.join(masterdir, "stardist_fix_logs")
os.makedirs(log_dir, exist_ok=True)
log_filename = os.path.join(log_dir, f"fix_stardist_{iso_ts()}.log")

# create logger
log = logging.getLogger("fix_stardist")
log.setLevel(logging.INFO)

# remove any existing handlers (useful if reloading in notebook)
for h in list(log.handlers):
    log.removeHandler(h)

# console handler (stream)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

# file handler (rotating, keep up to 5 files of 5MB each)
fh = RotatingFileHandler(log_filename, maxBytes=5*1024*1024, backupCount=5, encoding="utf-8")
fh.setLevel(logging.INFO)

# formatter
fmt = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
ch.setFormatter(fmt)
fh.setFormatter(fmt)

# attach handlers
log.addHandler(ch)
log.addHandler(fh)

log.info(f"Logging started. Log file: {log_filename}")

In [None]:
logger = configure_fix_stardist_logger(masterdir=r"E:\Julian\agr_rev_matched", level=logging.INFO, to_file=True)

In [None]:
#!/usr/bin/env python3
"""
Scan all replicates in meta and fix Pos/Chamber labeling mistakes in CSVs under each replicate's
Chambers/stardist2 folder.

This version updates warning messages to include the current replicate ID so warnings like:
  "CSV Pos38Chamb36.csv is empty (no rows). Skipping correction for safety."
become:
  "replicate r05 WARNING: CSV Pos38Chamb36.csv is empty (no rows). Skipping correction for safety."

Behavior (summary):
 - iterate per replicate found in the meta CSV
 - inspect all CSVs in <masterdir>/<savedirname>/<replicate>/Chambers/stardist2
 - derive authoritative Pos from the 'folder' column if present; otherwise fall back
 - fix chamber by removing the previous +1 error when folder lacks chamber info
 - ensure 'folder' column contains ...\PosXX\ChambYY
 - write corrected CSVs into stardist2/corrected then move them back with conflict handling
 - update meta.stardist_data_cor='Done' for rows with that replicate where stardist_data == 'Done'
 - improved logging: warnings include replicate id
"""

import os
import re
import shutil
import hashlib
import datetime
import logging
from pathlib import Path
import pandas as pd

# ------------- Configuration (edit to match your environment) -------------
#masterdir = r"E:\Julian\agr_rev_matched"   # path to master directory
#metacsv = "matched_meta_processing.csv"                       # metadata filename (inside masterdir)
#savedirname = "savesV1"                    # under masterdir/<savedirname>/<replicate>/Chambers
# ----------------------------------------------------------------------------

# Metadata column names (adjust if your meta uses different names)
COL_REPLICATE = "replicate"
COL_POS = "pos"
COL_STARDIST_DATA = "stardist_data"
COL_STARDIST_DATA_COR = "stardist_data_cor"

# Logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
log = logging.getLogger("fix_stardist")

# Helpers
def iso_ts():
    return datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

def hash_file(path, chunk_size=65536):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

def extract_pos_from_path(folder_path):
    if not folder_path or not isinstance(folder_path, str):
        return None
    # look for 'Pos' followed by digits
    m = re.search(r'Pos0*([0-9]+)', folder_path, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))
    # fallback: any trailing number in basename
    base = os.path.basename(folder_path)
    m2 = re.search(r'(\d+)$', base)
    if m2:
        return int(m2.group(1))
    return None

def extract_chamb_from_path(folder_path):
    if not folder_path or not isinstance(folder_path, str):
        return None
    m = re.search(r'Chamb0*([0-9]+)', folder_path, flags=re.IGNORECASE)
    if m:
        return int(m.group(1))
    return None

def extract_pos_from_filename(fname):
    m = re.search(r'Pos0*([0-9]+)', fname, flags=re.IGNORECASE)
    return int(m.group(1)) if m else None

def extract_chamb_from_filename(fname):
    m = re.search(r'Chamb0*([0-9]+)', fname, flags=re.IGNORECASE)
    return int(m.group(1)) if m else None

def safe_makedirs(path):
    os.makedirs(path, exist_ok=True)

def normalize_folder_sep(p):
    if not isinstance(p, str):
        return p
    return p.replace("/", os.sep).replace("\\", os.sep)

# The main function
def fix_by_replicate(masterdir, metacsv, savedirname, dry_run=False):
    meta_path = os.path.join(masterdir, metacsv)
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"Metadata file not found: {meta_path}")
    meta = pd.read_csv(meta_path, dtype=str).fillna("")
    if COL_REPLICATE not in meta.columns:
        raise KeyError(f"Expected metadata column '{COL_REPLICATE}' not found.")

    replicates = sorted(meta[COL_REPLICATE].unique())
    log.info(f"Found {len(replicates)} unique replicate(s) in metadata.")

    overall_corrections = []
    processed_reps = set()

    for rep in replicates:
        if not rep:
            continue
        log.info(f"Processing replicate: {rep}")
        stardist2 = os.path.join(masterdir, savedirname, rep, "Chambers", "stardist2")
        if not os.path.isdir(stardist2):
            log.info(f"  No stardist2 folder for replicate {rep} at {stardist2}, skipping.")
            continue

        corrected_dir = os.path.join(stardist2, "corrected")
        safe_makedirs(corrected_dir)

        csv_files = list(Path(stardist2).glob("*.csv"))
        # Exclude anything already in corrected (just in case)
        csv_files = [p for p in csv_files if str(p.parent) != str(Path(corrected_dir))]
        log.info(f"  Found {len(csv_files)} CSV(s) to inspect in {stardist2}.")

        rep_corrections = 0
        for csv_path in csv_files:
            csv_path = Path(csv_path)
            fname = csv_path.name
            log.info(f"    Inspecting file: {fname}")
            try:
                df = pd.read_csv(csv_path)
            except Exception as e:
                log.error(f"{rep} ERROR: reading {fname}: {e}")
                overall_corrections.append((rep, fname, None, f"read_error:{e}"))
                continue

            # If dataframe is empty, skip processing to avoid iloc[0] errors.
            if df.empty:
                log.warning(f"{rep} WARNING: CSV {fname} is empty (no rows). Skipping correction for safety.")
                overall_corrections.append((rep, fname, None, "empty_csv_skipped"))
                continue

            # find folder column authoritative value (first non-empty)
            folder_val = None
            if "folder" in df.columns:
                non_empty = df['folder'].dropna().astype(str)
                non_empty = non_empty[non_empty.str.strip() != ""]
                if len(non_empty) > 0:
                    folder_val = non_empty.iloc[0]
            if folder_val:
                folder_val = normalize_folder_sep(folder_val)

            # extract positions/chambers
            pos_from_folder = extract_pos_from_path(folder_val)
            chamb_from_folder = extract_chamb_from_path(folder_val)

            pos_from_fname = extract_pos_from_filename(fname)
            chamb_from_fname = extract_chamb_from_filename(fname)

            # also look into columns if present (safe because df is not empty)
            pos_from_col = None
            chamb_from_col = None
            if 'pos' in df.columns:
                try:
                    pos_from_col = int(df['pos'].iloc[0])
                except Exception:
                    pos_from_col = None
            if 'chamber' in df.columns:
                try:
                    chamb_from_col = int(df['chamber'].iloc[0])
                except Exception:
                    chamb_from_col = None

            log.info(
                f"      extracted -> folder pos: {pos_from_folder}, folder chamb: {chamb_from_folder}, "
                f"fname pos: {pos_from_fname}, fname chamb: {chamb_from_fname}, "
                f"column pos: {pos_from_col}, column chamb: {chamb_from_col}"
            )

            # Decide corrected pos
            corrected_pos = None
            if pos_from_folder is not None:
                corrected_pos = pos_from_folder
            elif pos_from_col is not None:
                corrected_pos = pos_from_col
            elif pos_from_fname is not None:
                corrected_pos = pos_from_fname

            if corrected_pos is None:
                log.warning(f"{rep} WARNING: Could not determine Pos for {fname}; skipping.")
                overall_corrections.append((rep, fname, None, "no_pos_determined"))
                continue

            # Decide corrected chamber
            corrected_chamb = None
            # Prefer folder chamb if present
            if chamb_from_folder is not None:
                corrected_chamb = chamb_from_folder
            else:
                # If folder lacks chamber but filename or column has it, handle the common +1 error.
                source_ch = None
                src = None
                if chamb_from_col is not None:
                    source_ch = chamb_from_col
                    src = "col"
                elif chamb_from_fname is not None:
                    source_ch = chamb_from_fname
                    src = "fname"

                if source_ch is None:
                    # no chamber information anywhere; skip
                    log.warning(f"{rep} WARNING: No chamber info found for {fname}; skipping.")
                    overall_corrections.append((rep, fname, None, "no_chamber_info"))
                    continue

                # If source_ch likely has +1 error, decrement by 1; otherwise keep it.
                # Heuristic: user reported systemic +1 error -> decrement source by 1 (conservative: only if > 0)
                if source_ch > 0:
                    log.info(f"      Using chamber from {src}: {source_ch}. Applying -1 fix because folder lacks chamber info.")
                    corrected_chamb = source_ch - 1
                    if corrected_chamb < 0:
                        corrected_chamb = 0
                else:
                    corrected_chamb = source_ch

            if corrected_chamb is None:
                log.warning(f"{rep} WARNING: Could not determine corrected chamber for {fname}; skipping.")
                overall_corrections.append((rep, fname, None, "no_corrected_chamber"))
                continue

            # Ensure corrected_chamb and corrected_pos are ints
            try:
                corrected_chamb = int(corrected_chamb)
                corrected_pos = int(corrected_pos)
            except Exception as e:
                log.error(f"{rep} ERROR: converting corrected pos/chamber to int for {fname}: {e}")
                overall_corrections.append((rep, fname, None, f"int_conv_error:{e}"))
                continue

            # Compose new filename
            new_fname = f"Pos{str(corrected_pos).zfill(2)}Chamb{str(corrected_chamb).zfill(2)}.csv"
            new_path = os.path.join(corrected_dir, new_fname)

            # Update dataframe pos/chamber columns
            df['pos'] = corrected_pos
            df['chamber'] = corrected_chamb

            # Update folder column to include \ChambNN if not present (and we have a folder base)
            if folder_val:
                pos_match = re.search(r'(.*Pos0*[0-9]+)', folder_val, flags=re.IGNORECASE)
                if pos_match:
                    base_pos_path = pos_match.group(1)
                else:
                    base_pos_path = folder_val
                desired_folder = os.path.join(base_pos_path, f"Chamb{str(corrected_chamb).zfill(2)}")
                desired_folder = normalize_folder_sep(desired_folder)
                df['folder'] = desired_folder
                log.info(f"      Set folder to: {desired_folder}")
            else:
                plausible = os.path.join(masterdir, savedirname, rep, "Chambers", f"Pos{str(corrected_pos).zfill(2)}", f"Chamb{str(corrected_chamb).zfill(2)}")
                plausible = normalize_folder_sep(plausible)
                df['folder'] = plausible
                log.info(f"      No folder in CSV; set folder to plausible path: {plausible}")

            # Save corrected CSV into corrected_dir
            try:
                if dry_run:
                    log.info(f"{rep} [DRY RUN] Would save corrected file to {new_path} and would remove original {csv_path}")
                    overall_corrections.append((rep, fname, new_fname, "dry_run_saved"))
                else:
                    df.to_csv(new_path, index=False)
                    # remove original
                    try:
                        os.remove(csv_path)
                    except Exception as e:
                        log.warning(f"{rep} WARNING: Could not delete original file {csv_path}: {e}")
                    log.info(f"      Saved corrected file as {new_fname} (original {fname} removed).")
                    overall_corrections.append((rep, fname, new_fname, "renamed"))
                    rep_corrections += 1
            except Exception as e:
                log.error(f"{rep} ERROR: saving corrected file {new_fname}: {e}")
                overall_corrections.append((rep, fname, new_fname, f"save_error:{e}"))
                continue

        # After processing all CSVs in replicate, move corrected files back into stardist2 resolving conflicts
        if not dry_run:
            corrected_files = list(Path(corrected_dir).glob("*.csv"))
            if corrected_files:
                log.info(f"  Resolving {len(corrected_files)} corrected file(s) back into {stardist2} (replicate {rep}).")
            for cfile in corrected_files:
                target = os.path.join(stardist2, cfile.name)
                try:
                    if os.path.exists(target):
                        h_corr = hash_file(cfile)
                        h_targ = hash_file(target)
                        if h_corr == h_targ:
                            log.info(f"    Target {cfile.name} already exists and is identical. Deleting corrected duplicate.")
                            cfile.unlink()
                        else:
                            bak_name = f"{os.path.basename(target)}.bak_{iso_ts()}"
                            bak_path = os.path.join(stardist2, bak_name)
                            log.info(f"    Conflict: {cfile.name} differs from existing {target}. Backing up existing -> {bak_name} and replacing.")
                            shutil.move(target, bak_path)
                            shutil.move(str(cfile), target)
                    else:
                        shutil.move(str(cfile), target)
                        log.info(f"    Moved corrected file into place: {cfile.name}")
                except Exception as e:
                    log.error(f"{rep} ERROR: moving corrected file {cfile.name} into place: {e}")
                    overall_corrections.append((rep, cfile.name, os.path.basename(target), f"move_error:{e}"))
                    continue

            # remove corrected_dir if empty
            try:
                if os.path.isdir(corrected_dir) and not any(Path(corrected_dir).iterdir()):
                    os.rmdir(corrected_dir)
                    log.info(f"  Removed empty corrected directory: {corrected_dir}")
            except Exception as e:
                log.warning(f"{rep} WARNING: Could not remove corrected directory {corrected_dir}: {e}")

        processed_reps.add(rep)

        log.info(f"Finished processing replicate {rep}. Corrections applied: {rep_corrections}")

        # Update metadata: mark rows for this replicate as corrected where stardist_data == 'Done'
        if not dry_run:
            mask = (meta[COL_REPLICATE] == rep) & (meta.get(COL_STARDIST_DATA, "") == "Done")
            if mask.any():
                meta.loc[mask, COL_STARDIST_DATA_COR] = "Done"
                log.info(f"  Updated metadata stardist_data_cor='Done' for {mask.sum()} row(s) in replicate {rep}.")

    # Save updated metadata
    if not dry_run:
        meta.to_csv(meta_path, index=False)
        log.info(f"Saved updated metadata to {meta_path}")

    # Final summary
    log.info("Processing complete.")
    log.info(f"Total replicates processed: {len(processed_reps)}")
    log.info(f"Total actions recorded: {len(overall_corrections)}")
    for rec in overall_corrections:
        log.info(f"  RECORD: {rec}")

    return overall_corrections

if __name__ == "__main__":
    # Example run: set dry_run=True to preview actions without changing files
    corrections = fix_by_replicate(masterdir=masterdir, metacsv=metacsv, savedirname=savedirname, dry_run=False)
    print(f"Done. Total records: {len(corrections)}")

In [None]:
corrections = fix_by_replicate(masterdir=masterdir, metacsv=metacsv, savedirname=savedirname, dry_run=True)

In [None]:
corrections = fix_by_replicate(masterdir=masterdir, metacsv=metacsv, savedirname=savedirname, dry_run=False)
print(f"Done. Total records: {len(corrections)}")

DONE?