In [None]:
# ============================================================================
# CONFIGURATION
# This cell contains all the settings a user needs to change.
# ============================================================================

# --- PATHS ---
# TODO: Update this path to point to your complete, "ground truth" data file.
# The pipeline loads it with pandas.read_excel, so it must be an Excel workbook.
INPUT_FILE_PATH = "data/Ground_Truth_Imputed.xlsx"

# The directory where the new datasets with missing values will be saved.
# It is created if it does not exist. The pipeline writes mcar_data.xlsx,
# mar_data.xlsx and mnar_data.xlsx into it.
OUTPUT_DIR = "output/ArtificialMissingness"


# --- PARAMETERS ---
# Base per-cell probability (0 to 1) of turning a numeric value into NaN.
# MCAR uses exactly this probability for every cell. MAR and MNAR scale it:
# rows in the "more likely missing" group use 1.5x this rate and all other
# rows use 0.5x, so the realised overall proportion depends on the group sizes
# and is generally not equal to this number.
MISSING_RATE = 0.15

# For the MAR pattern, which column should be used to determine missingness in others?
# Rows whose value in this column is above its 75th percentile get the higher rate.
# If set to None, or to a name that is not a numeric column of the input,
# the script uses the first numeric column instead.
MAR_CONDITIONAL_COLUMN = None  # or e.g., "Specific_Column_Name"

# For the MNAR pattern: a quantile expressed as a fraction between 0 and 1
# (0.25 = 25th percentile). In each numeric column, values at or below this
# quantile are more likely to be made missing than values above it.
MNAR_THRESHOLD_PERCENTILE = 0.25

In [None]:
# ============================================================================
# SCRIPT LOGIC (Functions)
# This cell contains the core logic of the script.
# A user typically runs this once and does not need to edit it.
# ============================================================================
import pandas as pd
import numpy as np
import os
import time

# --- Helper Functions for Introducing Missingness ---

def introduce_mcar(df: pd.DataFrame, missing_rate: float, target_cols: list) -> pd.DataFrame:
    """Return a copy of *df* with values removed completely at random (MCAR).

    Every cell of every target column is independently replaced by a missing
    value with probability ``missing_rate``; neither the cell's own value nor
    any other column influences the draw.

    Args:
        df: Source data. It is copied and never modified.
        missing_rate: Per-cell probability of becoming missing, in ``[0, 1]``.
        target_cols: Names of the columns to inject missing values into.

    Returns:
        A new DataFrame with missing values injected into ``target_cols``.

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``missing_rate`` does
            not yield valid probabilities and at least one column is targeted.
    """
    df_mcar = df.copy()
    n_rows = len(df_mcar)
    for col in target_cols:
        # Boolean draw per row: True marks a value that will become missing.
        drop = np.random.choice([True, False], size=n_rows, p=[missing_rate, 1 - missing_rate])
        # Series.mask upcasts the column when needed (e.g. int64 -> float64).
        # Writing NaN into an integer column through .loc relies on silent
        # upcasting, which newer pandas versions deprecate.
        df_mcar[col] = df_mcar[col].mask(drop)
    return df_mcar

def introduce_mar(df: pd.DataFrame, missing_rate: float, target_cols: list, conditional_col: str) -> pd.DataFrame:
    """Return a copy of *df* with values removed Missing At Random (MAR).

    Missingness in the target columns depends on the observed value of
    ``conditional_col``: rows whose conditional value is strictly above that
    column's 75th percentile lose each target value with probability
    ``1.5 * missing_rate`` (capped at 1); every other row uses
    ``0.5 * missing_rate``. ``conditional_col`` itself is never modified.

    Note that the realised overall proportion of missing values is therefore
    not ``missing_rate`` itself; when about a quarter of the rows lie above
    the threshold it is roughly ``0.75 * missing_rate``.

    Args:
        df: Source data. It is copied and never modified.
        missing_rate: Base per-cell probability of becoming missing.
        target_cols: Names of the columns to inject missing values into.
        conditional_col: Numeric column that drives the missingness.

    Returns:
        A new DataFrame. If ``conditional_col`` is absent or not numeric, a
        warning is printed and an unmodified copy of ``df`` is returned.
    """
    df_mar = df.copy()
    if conditional_col not in df_mar.columns or not pd.api.types.is_numeric_dtype(df_mar[conditional_col]):
        print(f"  WARNING: MAR conditional column '{conditional_col}' not found or not numeric. Skipping MAR generation.")
        # Hand back the copy so the result never aliases the caller's frame.
        return df_mar

    prob_high = min(missing_rate * 1.5, 1.0)
    prob_low = max(missing_rate * 0.5, 0.0)
    threshold = df_mar[conditional_col].quantile(0.75)

    # The conditional column is never altered below, so the per-row
    # probabilities are the same for every target column; compute them once.
    high_mask = df_mar[conditional_col] > threshold
    row_prob = np.where(high_mask, prob_high, prob_low)
    n_rows = len(df_mar)

    for col in target_cols:
        if col == conditional_col:
            continue
        # One uniform draw per row, compared with that row's probability.
        drop = np.random.rand(n_rows) < row_prob
        # Series.mask upcasts integer columns cleanly when NaN is introduced.
        df_mar[col] = df_mar[col].mask(drop)

    return df_mar

def introduce_mnar(df: pd.DataFrame, missing_rate: float, target_cols: list, threshold_percentile: float) -> pd.DataFrame:
    """Return a copy of *df* with values removed Missing Not At Random (MNAR).

    Missingness in each target column depends on that column's own values:
    values at or below the column's ``threshold_percentile`` quantile are
    removed with probability ``1.5 * missing_rate`` and values above it with
    probability ``0.5 * missing_rate`` (both clamped to ``[0, 1]``).

    Note that the realised overall proportion of missing values is therefore
    generally not ``missing_rate``; it depends on ``threshold_percentile``.

    Args:
        df: Source data. It is copied and never modified.
        missing_rate: Base per-cell probability of becoming missing.
        target_cols: Names of the columns to process. Columns that are not
            numeric, or that contain no non-missing value, are left as is.
        threshold_percentile: Quantile as a fraction in ``[0, 1]`` (it is
            passed to ``Series.quantile``), e.g. ``0.25``.

    Returns:
        A new DataFrame with missing values injected into the eligible columns.
    """
    df_mnar = df.copy()

    prob_high_val = min(max(missing_rate * 0.5, 0.0), 1.0)
    prob_low_val = min(max(missing_rate * 1.5, 0.0), 1.0)

    for col in target_cols:
        series = df_mnar[col]
        if not pd.api.types.is_numeric_dtype(series) or series.dropna().empty:
            continue

        threshold = series.quantile(threshold_percentile)
        # Rows that are not strictly above the threshold (including rows that
        # are already missing) fall into the low-value, higher-rate group.
        row_prob = np.where(series > threshold, prob_high_val, prob_low_val)
        drop = np.random.rand(len(series)) < row_prob
        # Series.mask upcasts integer columns cleanly when NaN is introduced.
        df_mnar[col] = series.mask(drop)

    return df_mnar

# --- Main Pipeline Function ---

def _save_dataset(df, output_dir, filename, label):
    """Write *df* to ``output_dir/filename`` as an Excel file and report the path."""
    output_path = os.path.join(output_dir, filename)
    df.to_excel(output_path, index=False)
    print(f"  {label} data saved to: {output_path}")


def run_pattern_generation(input_file, output_dir, missing_rate, mar_conditional_col, mnar_threshold):
    """Generate MCAR, MAR and MNAR variants of a complete dataset and save them.

    The input workbook is read with ``pd.read_excel``; every numeric column is
    targeted. One Excel file per pattern (``mcar_data.xlsx``, ``mar_data.xlsx``,
    ``mnar_data.xlsx``) is written to ``output_dir``, which is created if
    needed. Progress is reported with ``print``.

    Args:
        input_file: Path to the complete ("ground truth") Excel file.
        output_dir: Directory that receives the generated files.
        missing_rate: Base missingness probability passed to each generator.
        mar_conditional_col: Column driving the MAR pattern. If ``None`` or
            not one of the numeric columns, the first numeric column is used.
        mnar_threshold: Quantile fraction passed to the MNAR generator.

    Returns:
        None. The function returns early, after printing an error, when the
        input file is not found or contains no numeric column.
    """
    print("--- Starting Missing Data Generation Pipeline ---")

    os.makedirs(output_dir, exist_ok=True)
    print(f"Output will be saved to: '{output_dir}'")

    try:
        df_complete = pd.read_excel(input_file)
    except FileNotFoundError:
        print(f"ERROR: The file was not found at '{input_file}'")
        return
    print(f"Successfully loaded data from '{os.path.basename(input_file)}'. Shape: {df_complete.shape}")

    numeric_cols = df_complete.select_dtypes(include=np.number).columns.tolist()
    if not numeric_cols:
        print("ERROR: No numeric columns found in the dataset. Exiting.")
        return
    print(f"Found {len(numeric_cols)} numeric columns to target.")

    # --- Introduce MCAR ---
    print("\n1. Introducing MCAR pattern...")
    start_time = time.time()
    df_mcar = introduce_mcar(df_complete, missing_rate, numeric_cols)
    print(f"  Completed in {time.time() - start_time:.2f} seconds.")
    _save_dataset(df_mcar, output_dir, "mcar_data.xlsx", "MCAR")

    # --- Introduce MAR ---
    print("\n2. Introducing MAR pattern...")
    if len(numeric_cols) > 1:
        start_time = time.time()
        if mar_conditional_col is None:
            mar_conditional_col = numeric_cols[0]
        elif mar_conditional_col not in numeric_cols:
            # Tell the user their setting was not honoured instead of
            # silently switching columns (e.g. after a typo in the name).
            print(
                f"  WARNING: MAR conditional column '{mar_conditional_col}' not found or not numeric. "
                f"Falling back to '{numeric_cols[0]}'."
            )
            mar_conditional_col = numeric_cols[0]

        print(f"  Using '{mar_conditional_col}' as the conditional column.")
        df_mar = introduce_mar(df_complete, missing_rate, numeric_cols, mar_conditional_col)
        print(f"  Completed in {time.time() - start_time:.2f} seconds.")
        _save_dataset(df_mar, output_dir, "mar_data.xlsx", "MAR")
    else:
        print("  Skipping MAR generation: Not enough numeric columns (need at least 2).")

    # --- Introduce MNAR ---
    print("\n3. Introducing MNAR pattern...")
    start_time = time.time()
    df_mnar = introduce_mnar(df_complete, missing_rate, numeric_cols, mnar_threshold)
    print(f"  Completed in {time.time() - start_time:.2f} seconds.")
    _save_dataset(df_mnar, output_dir, "mnar_data.xlsx", "MNAR")

    print("\n--- Pipeline Finished ---")

In [None]:
# ============================================================================
# EXECUTION
# Runs the whole pipeline with the values chosen in the Configuration cell.
# ============================================================================

# Paths go in positionally (input first, then output); tuning knobs by keyword.
run_pattern_generation(
    INPUT_FILE_PATH,
    OUTPUT_DIR,
    missing_rate=MISSING_RATE,
    mar_conditional_col=MAR_CONDITIONAL_COLUMN,
    mnar_threshold=MNAR_THRESHOLD_PERCENTILE,
)