## Environment
Using Kernel: `dan-dev-py312-r433`

-----


#### Stage the data

- Stage the Walker Lab supplemental tables by copying them into **original_source_data**, prefixing each file name with the Wormcat release it belongs to (`WC1_` or `WC2_`).

**Files:**
> - WC1_Supplemental_Table_11.xlsx
> - WC1_Supplemental_Table_12.xlsx
> - WC1_Supplemental_Table_3.xlsx
> - WC1_Supplemental_Table_5.xlsx
> - WC2_Table_S8.xlsx


In [5]:
# Copy each supplemental workbook into ../original_source_data, prefixing the
# file name with WC1_ or WC2_ according to its wormcat_1 / wormcat_2 source folder.
# The paths are relative, so these commands depend on the notebook's working directory.
!cp ../walker_lab/wormcat_1/Supplemental_Table_3.xlsx ../original_source_data/WC1_Supplemental_Table_3.xlsx
!cp ../walker_lab/wormcat_1/Supplemental_Table_5.xlsx ../original_source_data/WC1_Supplemental_Table_5.xlsx
!cp ../walker_lab/wormcat_1/Supplemental_Table_11.xlsx ../original_source_data/WC1_Supplemental_Table_11.xlsx
!cp ../walker_lab/wormcat_1/Supplemental_Table_12.xlsx ../original_source_data/WC1_Supplemental_Table_12.xlsx
# The source name contains a space, hence the quoting.
!cp ../walker_lab/wormcat_2/"Table S8.xlsx" ../original_source_data/WC2_Table_S8.xlsx


In [6]:
# What version of wormcat3 are we using?
# Printing it records the installed package version in the notebook output.
import wormcat3
print(f"Wormcat3 {wormcat3.__version__}")

Wormcat3 0.1.5


In [7]:
import os
from pathlib import Path

# Root of the paper repository. The default is the original author's checkout;
# set the WORMCAT3_PAPER_ROOT environment variable to run the notebook from a
# different location without editing this cell.
root_dir_path = Path(
    os.environ.get("WORMCAT3_PAPER_ROOT", "/Users/dan/Code/Python/wormcat3_paper")
)
# Folder holding the supplemental Excel workbooks that get extracted.
excel_root_path = root_dir_path / "original_source_data"
# Folder that receives the CSV files extracted from those workbooks.
csv_root_path = root_dir_path / "derived_csv_data"

# Preprocess

### Stage the original source data

- The staged Excel workbooks are converted to CSV files, a format that Wormcat3 can consume directly.

In [9]:
# Helper functions to stage the original source data 
import glob
import os
import re
import pandas as pd
import shutil
from pathlib import Path
from wormcat3.wormcat_excel import WormcatExcel


def delete_directory_recursive(csv_root_path):
    """Remove the directory at csv_root_path together with everything below it.

    A one-line report is printed either way. Anything that is not an existing
    directory (including a regular file) is reported as not existing and is
    left untouched.
    """
    target = Path(csv_root_path)
    if not (target.exists() and target.is_dir()):
        print(f"Directory does not exist: {csv_root_path}")
        return

    shutil.rmtree(target)
    print(f"Deleted directory and all contents: {csv_root_path}")
        

def rename_file(file_path: Path, new_name: str) -> Path:
    """Rename the file at file_path to new_name within the same directory.

    Args:
        file_path: Current location of the file.
        new_name: New file name (a bare name, not a path).

    Returns:
        The renamed path on success. If file_path does not exist, the target
        path is returned as well, on the assumption that an earlier run already
        renamed it. If the rename itself fails, the failure is printed and the
        original file_path is returned.
    """
    new_path = file_path.with_name(new_name)
    if not file_path.exists():
        # Assume it was already renamed by a previous run.
        return new_path

    # Guard only the rename: a problem while printing the success message must
    # not be reported as a failed rename that returns the stale path.
    try:
        file_path.rename(new_path)
    except OSError as e:
        print(f"Failed to rename {file_path}: {e}")
        return file_path  # fallback

    print(f"Renamed: {file_path.name} → {new_name}")
    return new_path
    
    
def move_filtered_files(pattern, destination, regex_pattern=None):
    """
    Move every file matching the glob `pattern` into the `destination` folder.

    The destination is created when missing. If `regex_pattern` is given, only
    files whose base name matches it from the start of the name are moved.
    A file that cannot be moved is reported and skipped.
    """
    target_dir = str(destination)
    os.makedirs(target_dir, exist_ok=True)

    candidates = glob.glob(str(pattern))
    if regex_pattern:
        name_filter = re.compile(regex_pattern)
        candidates = [
            path for path in candidates
            if name_filter.match(os.path.basename(path))
        ]

    for path in candidates:
        try:
            shutil.move(path, target_dir)
        except Exception as e:
            print(f"Failed to move {path}: {e}")
            

def extract_all_excels_to_csv(excel_dir: Path, csv_root_dir: Path):
    """
    Export the sheets of every .xlsx workbook in excel_dir as CSV files.

    Each workbook gets its own subdirectory of csv_root_dir, named after the
    workbook's file stem; the export itself is delegated to
    WormcatExcel.extract_csv_files. Only direct children of excel_dir whose
    suffix is ".xlsx" (compared case-insensitively) are processed.
    """
    source_dir = Path(excel_dir)
    output_root = Path(csv_root_dir)
    output_root.mkdir(parents=True, exist_ok=True)

    for workbook in source_dir.iterdir():
        if workbook.suffix.lower() == ".xlsx":
            print(f"Extracting {workbook.name}…")
            sheet_dir = output_root / workbook.stem
            sheet_dir.mkdir(parents=True, exist_ok=True)
            WormcatExcel.extract_csv_files(workbook, sheet_dir)
        
def remove_index_column(directory):
    """
    Drop a leftover index column from every CSV file under `directory`.

    Each `*.csv` file found recursively is loaded with pandas. When its first
    column is named 'Unnamed: 0' (the name pandas gives a column whose header
    cell is empty), that column is dropped and the file is rewritten in place;
    otherwise the file is left untouched. A one-line report is printed per file.

    Note that a rewritten file is round-tripped through pandas, so the textual
    formatting of its values may change.

    Args:
        directory: Folder to scan, given as a str or a Path.
    """
    # Accept plain strings as well as Path objects, like the sibling helpers do.
    directory = Path(directory)

    for csv_file in directory.rglob("*.csv"):
        df = pd.read_csv(csv_file)

        # Check if the first column is 'Unnamed: 0'
        if df.columns[0] == 'Unnamed: 0':
            df = df.drop(columns='Unnamed: 0')
            df.to_csv(csv_file, index=False)
            print(f"Cleaned and saved: {csv_file.name}")
        else:
            print(f"No change needed: {csv_file.name}")
            
def remove_prefix_and_rename_csvs(directory, pattern=r"^\d{1,2} (.+)"):
    """
    Recursively rename CSV files in a directory by removing a prefix
    matched by the given regex pattern from the filename.

    By default, removes a leading number and space (e.g., '01 filename.csv' → 'filename.csv').

    Args:
        directory: Folder to scan recursively, given as a str or a Path.
        pattern: Regex applied from the start of each file name. Its first
            capture group becomes the new file name, so it must define one.

    Files whose name does not match are left alone. A rename that fails is
    reported and skipped; every successful rename is reported as well.
    """
    directory = Path(directory)
    regex = re.compile(pattern)

    # Snapshot the listing before renaming anything: changing directory entries
    # while the tree is still being scanned leaves it unspecified whether the
    # freshly renamed files are visited again.
    for file in list(directory.rglob("*.csv")):
        match = regex.match(file.name)
        if not match:
            continue

        new_name = match.group(1)
        new_path = file.with_name(new_name)
        # Guard only the rename so that a reporting problem is never mistaken
        # for a failed rename.
        try:
            file.rename(new_path)
        except OSError as e:
            print(f"Failed to rename {file}: {e}")
        else:
            print(f"Renamed: {file.relative_to(directory)} → {new_name}")
                

In [10]:
# The three steps below mutate the files under csv_root_path and must run in this order.

# 1. Extract all the Excels to CSV Files
#    (each workbook gets its own subdirectory under csv_root_path)
extract_all_excels_to_csv(excel_root_path, csv_root_path)

# 2. If a CSV file has a column for the pandas index remove it
remove_index_column(csv_root_path)

# 3. Remove leading number and space from files (e.g., '01 filename.csv' → 'filename.csv').
remove_prefix_and_rename_csvs(csv_root_path)  
#    Second pass: allows one extra character between the number and the space.
#    NOTE(review): the '.' in this pattern is unescaped, so it matches any
#    character, not only a literal dot — confirm that is intended.
remove_prefix_and_rename_csvs(csv_root_path, pattern=r"^\d{1,2}. (.+)") 


Extracting WC1_Supplemental_Table_5.xlsx…
Extracting WC1_Supplemental_Table_12.xlsx…
Extracting WC1_Supplemental_Table_3.xlsx…
Extracting WC2_Table_S8.xlsx…
Extracting WC1_Supplemental_Table_11.xlsx…
Cleaned and saved: 13 D_Rifa_Allan_down_genes.csv
Cleaned and saved: 18T_Rifa_Psora_Allan_down_genes.csv
No change needed: 3 Single Down Cat3.csv
Cleaned and saved: 4 S_Psora_down_genes.csv
No change needed: 2 Single Down Cat2.csv
No change needed: 10 Double Down Cat3.csv
No change needed: 15 Triple Down Cat1.csv
Cleaned and saved: 14 D_Rifa_Psora_down_genes.csv
Cleaned and saved: 5 S_Rapa_down_genes.csv
Cleaned and saved: 19 T_Rifa_Rapa_Allan_down_genes.csv
No change needed: 16 Triple Down Cat2.csv
Cleaned and saved: 6 S_Rifa_down_genes.csv
No change needed: 9 Double Down Cat2.csv
No change needed: 1 Single Down Cat1.csv
Cleaned and saved: 20 T_Rifa_Rapa_Psora_down_genes.csv
No change needed: 17 Triple Down Cat3.csv
Cleaned and saved: 11 D_Rapa_Psora_down_genes.csv
Cleaned and saved: 12 D

In [11]:
# Supplemental Table 3: move the Random_*_cat* CSVs into the Wormcat1_Results subfolder

supplemental_table_3 = csv_root_path / "WC1_Supplemental_Table_3"
destination = supplemental_table_3 / "Wormcat1_Results"

move_filtered_files(
    pattern=supplemental_table_3 / "Random_*_cat*.csv",
    destination=destination,
)

In [12]:
# Supplemental Table 5: collect the Cat* and Table* CSVs in the Wormcat1_Results subfolder

supplemental_table_5 = csv_root_path / "WC1_Supplemental_Table_5"
destination = supplemental_table_5 / "Wormcat1_Results"

for results_glob in ("Cat*.csv", "Table*.csv"):
    move_filtered_files(supplemental_table_5 / results_glob, destination)

In [13]:
# Supplemental Table 11: sort the extracted CSVs into one subfolder per Wormcat batch run

supplemental_table_11 = csv_root_path / "WC1_Supplemental_Table_11"

# (subfolder, glob selecting its gene lists, [(extracted name, batch name), ...])
up_gene_batches = [
    ("Double_Up", "D_*.csv", [
        ("D_Rapa_Psora_up_genes.csv", "D_Rapa_Psora_up.csv"),
        ("D_Rapa_Rifa_up_genes.csv", "D_Rapa_Rifa_up.csv"),
    ]),
    ("Single_Up", "S_*.csv", [
        ("S_Psora_up_genes.csv", "S_Psora_up.csv"),
        ("S_Rapa_up_genes.csv", "S_Rapa_up.csv"),
        ("S_Rifa_up_genes.csv", "S_Rifa_up.csv"),
    ]),
    ("Triple_Up", "T_*.csv", [
        ("T_Rifa_Psora_Allan_up_genes.csv", "T_Rifa_Psora_Allan_up.csv"),
        ("T_Rifa_Rapa_Allan_up_genes.csv", "T_Rifa_Rapa_Allan_up.csv"),
        ("T_Rifa_Rapa_Psora_up_genes.csv", "T_Rifa_Rapa_Psora_up.csv"),
    ]),
]

# Move each group into its subfolder first, then shorten the names there.
for subfolder, gene_list_glob, renames in up_gene_batches:
    destination = supplemental_table_11 / subfolder
    move_filtered_files(supplemental_table_11 / gene_list_glob, destination)
    for extracted_name, batch_name in renames:
        rename_file(destination / extracted_name, batch_name)

# Move the Results for the Original Paper into a Subdirectory
destination = supplemental_table_11 / "Wormcat1_Results"
for results_glob in ("*Cat*.csv", "Table*.csv"):
    move_filtered_files(supplemental_table_11 / results_glob, destination)



Renamed: D_Rapa_Psora_up_genes.csv → D_Rapa_Psora_up.csv
Renamed: D_Rapa_Rifa_up_genes.csv → D_Rapa_Rifa_up.csv
Renamed: S_Psora_up_genes.csv → S_Psora_up.csv
Renamed: S_Rapa_up_genes.csv → S_Rapa_up.csv
Renamed: S_Rifa_up_genes.csv → S_Rifa_up.csv
Renamed: T_Rifa_Psora_Allan_up_genes.csv → T_Rifa_Psora_Allan_up.csv
Renamed: T_Rifa_Rapa_Allan_up_genes.csv → T_Rifa_Rapa_Allan_up.csv
Renamed: T_Rifa_Rapa_Psora_up_genes.csv → T_Rifa_Rapa_Psora_up.csv


In [14]:
# Supplemental Table 12: sort the extracted CSVs into one subfolder per Wormcat batch run

supplemental_table_12 = csv_root_path / "WC1_Supplemental_Table_12"

# (subfolder, glob selecting its gene lists,
#  optional (old name, new name) fix applied in the table folder before the move,
#  [(extracted name, batch name), ...])
down_gene_batches = [
    ("Double_Down", "D_*.csv", None, [
        ("D_Rapa_Psora_down_genes.csv", "D_Rapa_Psora_down.csv"),
        ("D_Rapa_Rifa_down_genes.csv", "D_Rapa_Rifa_down.csv"),
        ("D_Rifa_Allan_down_genes.csv", "D_Rifa_Allan_down.csv"),
        ("D_Rifa_Psora_down_genes.csv", "D_Rifa_Psora_down.csv"),
    ]),
    ("Single_Down", "S_*.csv", None, [
        ("S_Allan_down_genes.csv", "S_Allan_down.csv"),
        ("S_Psora_down_genes.csv", "S_Psora_down.csv"),
        ("S_Rapa_down_genes.csv", "S_Rapa_down.csv"),
        ("S_Rifa_down_genes.csv", "S_Rifa_down.csv"),
    ]),
    # This file still carries its '18' prefix (there is no space after the
    # number), so it is renamed first to be picked up by the T_*.csv glob.
    ("Triple_Down", "T_*.csv",
     ("18T_Rifa_Psora_Allan_down_genes.csv", "T_Rifa_Psora_Allan_down_genes.csv"), [
        ("T_Rifa_Rapa_Allan_down_genes.csv", "T_Rifa_Rapa_Allan_down.csv"),
        ("T_Rifa_Rapa_Psora_down_genes.csv", "T_Rifa_Rapa_Psora_down.csv"),
        ("T_Rifa_Psora_Allan_down_genes.csv", "T_Rifa_Psora_Allan_down.csv"),
    ]),
]

for subfolder, gene_list_glob, prefix_fix, renames in down_gene_batches:
    if prefix_fix is not None:
        rename_file(supplemental_table_12 / prefix_fix[0], prefix_fix[1])
    destination = supplemental_table_12 / subfolder
    move_filtered_files(supplemental_table_12 / gene_list_glob, destination)
    for extracted_name, batch_name in renames:
        rename_file(destination / extracted_name, batch_name)

# Move the Results for the Original Paper into a Subdirectory
destination = supplemental_table_12 / "Wormcat1_Results"
for results_glob in ("*Cat*.csv", "Table*.csv"):
    move_filtered_files(supplemental_table_12 / results_glob, destination)

Renamed: D_Rapa_Psora_down_genes.csv → D_Rapa_Psora_down.csv
Renamed: D_Rapa_Rifa_down_genes.csv → D_Rapa_Rifa_down.csv
Renamed: D_Rifa_Allan_down_genes.csv → D_Rifa_Allan_down.csv
Renamed: D_Rifa_Psora_down_genes.csv → D_Rifa_Psora_down.csv
Renamed: S_Allan_down_genes.csv → S_Allan_down.csv
Renamed: S_Psora_down_genes.csv → S_Psora_down.csv
Renamed: S_Rapa_down_genes.csv → S_Rapa_down.csv
Renamed: S_Rifa_down_genes.csv → S_Rifa_down.csv
Renamed: 18T_Rifa_Psora_Allan_down_genes.csv → T_Rifa_Psora_Allan_down_genes.csv
Renamed: T_Rifa_Rapa_Allan_down_genes.csv → T_Rifa_Rapa_Allan_down.csv
Renamed: T_Rifa_Rapa_Psora_down_genes.csv → T_Rifa_Rapa_Psora_down.csv
Renamed: T_Rifa_Psora_Allan_down_genes.csv → T_Rifa_Psora_Allan_down.csv


In [15]:
# Table S8: sort the extracted CSVs into subfolders for the Wormcat batch run

supplemental_table_s8 = csv_root_path / "WC2_Table_S8"

# Gather the *_cat.csv files in the Cat subfolder, then drop the '_cat' suffix from their names.
destination = supplemental_table_s8 / "Cat"
move_filtered_files(supplemental_table_s8 / "*_cat.csv", destination)
for extracted_name, batch_name in (
    ("aging.change_cat.csv", "aging.change.csv"),
    ("all_detected_peptides_cat.csv", "all_detected_peptides.csv"),
    ("Cytoplasm_cat.csv", "Cytoplasm.csv"),
):
    rename_file(destination / extracted_name, batch_name)

# Move the Results for the Original Paper into a Subdirectory
destination = supplemental_table_s8 / "Wormcat2_Results"
for results_glob in ("Cat*.csv", "Legend.csv"):
    move_filtered_files(supplemental_table_s8 / results_glob, destination)

Renamed: aging.change_cat.csv → aging.change.csv
Renamed: all_detected_peptides_cat.csv → all_detected_peptides.csv
Renamed: Cytoplasm_cat.csv → Cytoplasm.csv
