### Conda Environment

Using Kernel: `dan-dev-py312-r433`

-----

#### Stage the data

- Staging Murphy Lab Data from Supplemental Materials to **original_source_data**

**Files:**
> - Murphy_TS.xlsx

---- 

- **Helper functions**


In [None]:
import pandas as pd
from pathlib import Path

import pandas as pd

def read_excel_sheets(file_path):
    """
    Load every sheet of an Excel workbook.

    Returns a dictionary mapping each sheet name to its DataFrame. If the
    workbook cannot be read for any reason, the error is printed and an
    empty dictionary is returned instead of raising.
    """
    try:
        # sheet_name=None asks pandas for all sheets, keyed by sheet name.
        return pd.read_excel(file_path, sheet_name=None)
    except Exception as read_error:
        # Best-effort by design: report the failure and fall through.
        print(f"Error reading Excel file: {read_error}")
    return {}
            

def write_gene_ids_to_excel(sheet_dict, output_file):
    """
    Writes a new Excel file with one sheet per key in the dictionary.

    Each sheet contains only the 'WormBase Gene ID' column from the original
    DataFrame, renamed to 'Wormbase ID'. Sheets that lack that column are
    skipped with a printed warning.

    Args:
        sheet_dict: Mapping of sheet name -> DataFrame (for example the
            result of ``read_excel_sheets``).
        output_file: Path of the workbook to create. Missing parent
            directories are created.

    Raises:
        ValueError: If no sheet in ``sheet_dict`` has a 'WormBase Gene ID'
            column (this includes an empty ``sheet_dict``). Nothing is
            written to disk in that case.
    """
    source_column = 'WormBase Gene ID'

    # Collect the sheets to write first, so we know whether there is anything
    # to save before creating directories or opening the writer.
    gene_id_sheets = {}
    for sheet_name, df in sheet_dict.items():
        if source_column in df.columns:
            gene_id_sheets[sheet_name] = df[[source_column]].rename(
                columns={source_column: 'Wormbase ID'}
            )
        else:
            print(f"Warning: 'WormBase Gene ID' column not found in sheet '{sheet_name}'. Skipping.")

    # A workbook with zero sheets cannot be saved by openpyxl; closing the
    # writer would fail with an unrelated-looking IndexError. Fail early
    # with a message that names the actual problem.
    if not gene_id_sheets:
        raise ValueError(
            f"No sheets with a '{source_column}' column to write to '{output_file}'."
        )

    directory = Path(output_file).parent
    directory.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for sheet_name, gene_ids_df in gene_id_sheets.items():
            gene_ids_df.to_excel(writer, sheet_name=sheet_name, index=False)
                
                

-----

- **Preprocess**

In [None]:
# Rebuild Murphy_TS.xlsx from the supplemental source workbook:
# read every sheet, then write out only the gene ID column of each sheet.

murphy_source_xlsx = "../source_data/murphy_lab/pgen.1007559.s010.xlsx"
murphy_output_xlsx = "../derived_data/murphy_lab/Murphy_TS.xlsx"

sheet_dict = read_excel_sheets(murphy_source_xlsx)
write_gene_ids_to_excel(sheet_dict, murphy_output_xlsx)