In [1]:
# ============================================================================
# Cell 1: CONFIGURATION
# Every user-adjustable setting lives in this cell.
# ============================================================================

# --- 1. PATHS ---
# Input workbook to read, plus the directory and file name for the result.
INPUT_FILE_PATH = "/users/aranpurdy/desktop/cfps/PCA/rf/MOD_RF_Imputed.xlsx"
OUTPUT_DIR = "/users/aranpurdy/desktop/TEST/IMPUTETEST"
OUTPUT_FILENAME = "Ground_Truth_Imputed.xlsx"

# --- 2. SAMPLE NAMING CONVENTION ---
# Regex describing how replicate columns are grouped. It needs ONE capturing
# group (...) around the part of the sample name shared by all replicates.
#   'TM2A2_3'         -> group 'TM2A2' (captured by the pattern below)
#   'Control-T2-Rep3' -> could be grouped with r'^(Control-T2)-'
REPLICATE_GROUP_REGEX = r'^(TM2[A-Za-z]+\d+)_'

# --- 3. OPTIONS ---
# When True, T0 samples are dropped before imputation.
REMOVE_T0_SAMPLES = True

# Group identifiers (the text captured by the regex above) that count as T0.
# Only consulted when REMOVE_T0_SAMPLES is True.
T0_GROUP_IDENTIFIERS = ['TM2A1', 'TM2An1']

print("--- Configuration Loaded ---")

--- Configuration Loaded ---


In [2]:
# ============================================================================
# Cell 2: SCRIPT LOGIC
# A user typically does not need to edit this cell.
# ============================================================================
import pandas as pd
import numpy as np
import re
import os
from typing import List

def _save_dataframe(frame, output_dir, output_filename):
    """Write *frame* to ``output_dir/output_filename`` as an Excel file.

    The output directory is created when it does not exist yet. Returns the
    path that was written, or None (after printing the error) when the
    directory or the file could not be written.
    """
    output_path = os.path.join(output_dir, output_filename)
    try:
        # An empty output_dir means "current directory"; makedirs('') would raise.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        frame.to_excel(output_path, index=False)
    except Exception as e:
        print(f"ERROR: Could not save the output file. Details: {e}")
        return None
    return output_path


def run_imputation_pipeline(
    input_file: str,
    output_dir: str,
    output_filename: str,
    remove_t0: bool,
    group_regex: str,
    t0_identifiers: List[str]
) -> None:
    """
    Loads, optionally removes T0 samples, and imputes missing values within replicate groups.

    Columns whose name matches ``group_regex`` are grouped by the text of the
    first capturing group. Within each group the values are coerced to numeric
    (non-numeric cells become NaN) and every NaN is replaced by the mean of the
    remaining replicates in the same row. Rows where all replicates of a group
    are missing stay NaN. Progress and errors are reported with ``print``; the
    function returns None in every case.

    Args:
        input_file (str): Path to the input Excel file.
        output_dir (str): Path to the directory for saving the output.
        output_filename (str): Name for the output Excel file.
        remove_t0 (bool): If True, T0 samples are removed before imputation.
        group_regex (str): Regex pattern to identify replicate groups. Must
            contain at least one capturing group; the first one is the group ID.
        t0_identifiers (List[str]): List of group IDs to be considered T0.
    """
    print("--- Starting Imputation Pipeline ---")

    # --- 0. Validate the grouping regex ---
    # Done before any I/O so that a configuration mistake is reported clearly
    # instead of surfacing later as re.error / IndexError from match.group(1).
    try:
        group_pattern = re.compile(group_regex, re.IGNORECASE)
    except re.error as e:
        print(f"ERROR: REPLICATE_GROUP_REGEX is not a valid regular expression: {e}")
        return
    if group_pattern.groups < 1:
        print("ERROR: REPLICATE_GROUP_REGEX must contain a capturing group (...) "
              "that isolates the replicate group identifier.")
        return

    # --- 1. Load Data ---
    if not os.path.exists(input_file):
        print(f"ERROR: File not found at '{input_file}'.")
        return

    try:
        processed_df = pd.read_excel(input_file, na_values=['', ' ', '-', '#N/A', 'NULL', 'null'])
    except Exception as e:
        print(f"Error loading Excel file: {e}")
        return
    print(f"Successfully loaded data from '{os.path.basename(input_file)}'. Shape: {processed_df.shape}")

    # --- 2. (Optional) Remove T0 Columns ---
    # NOTE(review): the pattern matches case-insensitively, but the captured ID
    # is compared to t0_identifiers (and grouped below) case-sensitively.
    if remove_t0:
        print("\nStep 1: Removing T0 samples...")
        t0_ids = set(t0_identifiers or [])
        columns_to_remove = []
        for col in processed_df.columns:
            match = group_pattern.match(str(col))
            # The first captured group is the group ID
            if match and match.group(1) in t0_ids:
                columns_to_remove.append(col)

        if columns_to_remove:
            processed_df = processed_df.drop(columns=columns_to_remove)
            print(f"  Removed {len(columns_to_remove)} T0 columns. New shape: {processed_df.shape}")
        else:
            print("  No T0 columns found to remove based on the provided identifiers.")
    else:
        print("\nStep 1: Skipping T0 sample removal as per configuration.")

    # --- 3. Impute Missing Values ---
    print("\nStep 2: Imputing missing values within replicate groups...")
    timepoint_groups = {}
    for col in processed_df.columns:
        match = group_pattern.match(str(col))
        if match:
            timepoint_groups.setdefault(match.group(1), []).append(col)

    if not timepoint_groups:
        print("  WARNING: No replicate groups found for imputation. Check your REPLICATE_GROUP_REGEX in Cell 1.")
        # Save the unprocessed data if no groups are found
        saved_path = _save_dataframe(processed_df, output_dir, output_filename)
        if saved_path is not None:
            print(f"  Saved the unprocessed data to: '{saved_path}'")
        return

    print(f"  Identified {len(timepoint_groups)} groups for imputation.")
    nan_before = processed_df.isna().sum().sum()
    print(f"  NaN count before imputation: {nan_before}")

    # Perform imputation for each group
    for cols_in_group in timepoint_groups.values():
        group_df = processed_df[cols_in_group].apply(pd.to_numeric, errors='coerce')
        row_means = group_df.mean(axis=1, skipna=True)
        # fillna with a Series aligns on the row index, so each missing cell
        # receives the mean of its own row; no transpose round-trip is needed.
        processed_df[cols_in_group] = group_df.apply(lambda column: column.fillna(row_means))

    nan_after = processed_df.isna().sum().sum()
    print(f"  NaN count after imputation: {nan_after}")
    if nan_after < nan_before:
        print("  Imputation successfully reduced missing values.")
    elif nan_after > 0:
        print("  NOTE: Some NaNs may remain if all replicates in a group were missing for a given row.")

    # --- 4. Save Results ---
    print("\nStep 3: Saving processed data...")
    saved_path = _save_dataframe(processed_df, output_dir, output_filename)
    if saved_path is not None:
        print(f"  Successfully saved imputed data to: '{saved_path}'")

    print("\n--- Pipeline Finished ---")

# --- EXECUTION ---
# Entry point: hand the Cell 1 settings to the pipeline.
if __name__ == "__main__":
    run_imputation_pipeline(
        # Where to read from and write to
        input_file=INPUT_FILE_PATH,
        output_filename=OUTPUT_FILENAME,
        output_dir=OUTPUT_DIR,
        # How replicate columns are recognised and which groups are T0
        group_regex=REPLICATE_GROUP_REGEX,
        t0_identifiers=T0_GROUP_IDENTIFIERS,
        remove_t0=REMOVE_T0_SAMPLES,
    )

--- Starting Imputation Pipeline ---
Successfully loaded data from 'MOD_RF_Imputed.xlsx'. Shape: (115, 51)

Step 1: Removing T0 samples...
  Removed 10 T0 columns. New shape: (115, 41)

Step 2: Imputing missing values within replicate groups...
  Identified 8 groups for imputation.
  NaN count before imputation: 0
  NaN count after imputation: 0

Step 3: Saving processed data...
  Successfully saved imputed data to: '/users/aranpurdy/desktop/TEST/IMPUTETEST/Ground_Truth_Imputed.xlsx'

--- Pipeline Finished ---
