In [None]:
# ============================================================================
# CONFIGURATION
# Every user-adjustable setting lives in this cell; edit the values below
# before running the rest of the notebook.
# ============================================================================

# --- PATHS ---
# TODO: Point these at your own files and folders.
# Excel workbook that is read as the pipeline input.
INPUT_FILE_PATH = "data/your_strict_threshold_data.xlsx"
# Directory that receives the result.
OUTPUT_DIR = "output/imputed_data"
# File name of the Excel workbook written inside OUTPUT_DIR.
OUTPUT_FILENAME = "Ground_Truth_Imputed.xlsx"

# --- OPTIONS ---
# True  -> drop the T0 samples (TM2A1_y and TM2An1_y) before imputing.
# False -> keep the T0 samples so they take part in the imputation as well.
REMOVE_T0_SAMPLES = True

In [None]:
# ============================================================================
# SCRIPT LOGIC
# A user typically does not need to edit this cell.
# ============================================================================
import pandas as pd
import numpy as np
import re
import os
from typing import Union

def _drop_t0_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` without its T0 replicate columns, printing what was dropped."""
    # Regex: matches labels ending in 'TM2A1_y' or 'TM2An1_y' where y is 1-5.
    t0_pattern = re.compile(r'TM2(A1|An1)_[1-5]$', re.IGNORECASE)
    # Labels are stringified before matching: a non-string column label would
    # otherwise make the regex call raise TypeError.
    t0_columns = [col for col in df.columns if t0_pattern.search(str(col))]

    if not t0_columns:
        print("  No T0 columns found to remove.")
        return df

    trimmed = df.drop(columns=t0_columns)
    print(f"  Removed {len(t0_columns)} T0 columns. New shape: {trimmed.shape}")
    return trimmed


def _group_replicate_columns(columns) -> dict:
    """Map each timepoint prefix (e.g. 'TM2A2') to the replicate columns sharing it."""
    # Regex to capture names like 'TM2A2_5', 'TM2An3_5' for grouping; the part
    # before the '_<replicate>' suffix is the group key.
    timepoint_regex = re.compile(r'(TM2[A-Za-z]+\d+)_[1-5]$', re.IGNORECASE)
    groups = {}
    for col in columns:
        match = timepoint_regex.match(str(col))  # Ensure col is string
        if match:
            groups.setdefault(match.group(1), []).append(col)
    return groups


def _impute_groups_with_row_means(df: pd.DataFrame, groups: dict) -> None:
    """Fill NaNs in each replicate group, in place, with that row's group mean."""
    for cols_in_group in groups.values():
        # Non-numeric cells become NaN so they are imputed like missing values.
        group_df = df[cols_in_group].apply(pd.to_numeric, errors='coerce')
        # Row-wise mean over the replicates that are present; a row whose
        # replicates are all missing yields NaN and therefore stays NaN.
        row_means = group_df.mean(axis=1, skipna=True)
        # Series.fillna aligns on the row index, so each replicate column can
        # be filled directly instead of transposing the whole frame twice.
        df[cols_in_group] = group_df.apply(lambda column: column.fillna(row_means))


def run_imputation_pipeline(input_file: str, output_dir: str, output_filename: str, remove_t0: bool) -> None:
    """
    Loads, optionally removes T0 samples, and imputes missing values within replicate groups.

    Columns named like 'TM2A2_3' (prefix, underscore, replicate 1-5) are grouped
    by prefix; within each group a missing value is replaced by the mean of the
    remaining replicates in the same row. The result is written to
    ``output_dir/output_filename`` as an Excel file.

    Problems (missing input file, unreadable workbook, no matching columns,
    failed save) are reported with ``print`` instead of being raised, and the
    function then returns early or finishes without an output file.

    Args:
        input_file (str): Path to the input Excel file.
        output_dir (str): Path to the directory for saving the output. An empty
            string means the current working directory.
        output_filename (str): Name for the output Excel file.
        remove_t0 (bool): If True, T0 samples are removed before imputation.

    Returns:
        None
    """
    print("--- Starting Imputation Pipeline ---")

    # --- 1. Load Data ---
    if not os.path.exists(input_file):
        print(f"ERROR: File not found at '{input_file}'.")
        return

    try:
        processed_df = pd.read_excel(input_file, na_values=['', ' ', '-', '#N/A', 'NULL', 'null'])
    except Exception as e:
        print(f"Error loading Excel file: {e}")
        return
    print(f"Successfully loaded data from '{os.path.basename(input_file)}'. Shape: {processed_df.shape}")

    # --- 2. (Optional) Remove T0 Columns ---
    if remove_t0:
        print("\nStep 1: Removing T0 samples...")
        processed_df = _drop_t0_columns(processed_df)
    else:
        print("\nStep 1: Skipping T0 sample removal as per configuration.")

    # --- 3. Impute Missing Values ---
    print("\nStep 2: Imputing missing values within replicate groups...")
    timepoint_groups = _group_replicate_columns(processed_df.columns)

    if not timepoint_groups:
        print("  WARNING: No timepoint groups found for imputation. Check column names.")
        return

    print(f"  Identified {len(timepoint_groups)} groups for imputation.")
    nan_before = processed_df.isna().sum().sum()
    print(f"  NaN count before imputation: {nan_before}")

    _impute_groups_with_row_means(processed_df, timepoint_groups)

    nan_after = processed_df.isna().sum().sum()
    print(f"  NaN count after imputation: {nan_after}")
    if nan_after < nan_before:
        print("  Imputation successfully reduced missing values.")
    # Reported independently of the line above: a partial reduction can still
    # leave NaNs behind, and that is when the note matters.
    if nan_after > 0:
        print("  NOTE: Some NaNs may remain if all replicates in a group were missing for a given row.")

    # --- 4. Save Results ---
    print("\nStep 3: Saving processed data...")
    output_path = os.path.join(output_dir, output_filename)

    try:
        # os.makedirs('') raises, so an empty output_dir is treated as "current
        # directory". Directory creation sits inside the try so that its
        # failures are reported the same way as a failed write.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        processed_df.to_excel(output_path, index=False)
        print(f"  Successfully saved imputed data to: '{output_path}'")
    except Exception as e:
        print(f"ERROR: Could not save the output file. Details: {e}")

    print("\n--- Pipeline Finished ---")

In [None]:
# ============================================================================
# EXECUTION
# Runs the pipeline once, feeding it the values set in the Configuration cell.
# ============================================================================

# Collect the configuration under the pipeline's keyword names, then unpack it
# into the call.
pipeline_settings = {
    "input_file": INPUT_FILE_PATH,
    "output_dir": OUTPUT_DIR,
    "output_filename": OUTPUT_FILENAME,
    "remove_t0": REMOVE_T0_SAMPLES,
}
run_imputation_pipeline(**pipeline_settings)