## Environment
Using Kernel: `dan-dev-py312-r433`

-----


# Walker Lab Preprocess

#### Stage the data

- Stage data from **source_data** into **derived_data**: each Excel sheet is exported as its own CSV file.

**Derived Directories:**
> - low_sam_s009
> - low_sam_s012
> - cut_and_tag
> - wc1_random
> - wc1_sams
> - wc2_peptides


-----
- **Helper functions**

In [1]:
# Functions used to Map Sequence IDs to Wormbase IDs

from pathlib import Path
import pandas as pd
from pub_worm.wormbase import wormbase_util as wb
import os

def get_gene_ids_dict(working_dir_path):
    """
    Build a lookup dictionary from the WormBase geneIDs table.

    The table for the current WormBase release is cached as a CSV inside
    working_dir_path; it is downloaded and converted only when that CSV is
    missing.

    Args:
        working_dir_path: Directory that holds (or will receive) the cached
            geneIDs CSV.

    Returns:
        dict mapping every non-empty, upper-cased 'Wormbase_Id', 'Gene_name'
        and 'Sequence_id' value to the full row (as a dict) it came from.
    """
    wormbase_version = wb.current_wormbase_version()

    gene_ids_csv = Path(f"{working_dir_path}/c_elegans.PRJNA13758.{wormbase_version}.geneIDs.csv")
    if not gene_ids_csv.exists():
        gene_ids_txt = wb.download_gene_ids(wormbase_version, working_dir_path)
        gene_ids_csv = wb.gene_ids_to_csv(wormbase_version, working_dir_path, status_live=False)
        # The downloaded text file is only an intermediate; keep just the CSV.
        if os.path.exists(gene_ids_txt):
            os.remove(gene_ids_txt)

    # Missing cells become '' so every identifier can be handled as a string.
    gene_ids_df = pd.read_csv(gene_ids_csv).fillna('')

    gene_ids_dict = {}
    for record in gene_ids_df.to_dict(orient='records'):
        for key in ('Wormbase_Id', 'Gene_name', 'Sequence_id'):
            id_val = str(record[key]).upper()
            if not id_val:
                # A blank identifier must not become the '' key, otherwise an
                # empty lookup would resolve to an arbitrary gene record.
                continue
            # Each alias gets its own copy of the row, as before.
            gene_ids_dict[id_val] = dict(record)

    return gene_ids_dict

def lookup_wormbase_id(sequence_id, gene_ids_dict):
    """
    Resolve an identifier to its WormBase gene ID.

    The identifier is coerced to a string and handed to
    wb._lookup_wormbase_id together with gene_ids_dict.

    Returns:
        The 'Wormbase_Id' value of the matched record, or None when the
        helper reports no match.
    """
    match = wb._lookup_wormbase_id(str(sequence_id), gene_ids_dict)
    return None if match is None else match['Wormbase_Id']

def read_csvs_to_dict(root_dir):
    """
    Recursively find all CSV files in root_dir and read them into a dictionary of DataFrames.

    The key is the relative path (as string) from root_dir. Files are visited
    in sorted path order, so the dictionary (and anything iterating over it)
    is ordered the same way on every run. Files whose name ends with
    '.geneIDs.csv' are skipped, and a file that cannot be read is reported
    and left out instead of aborting the whole scan.

    Args:
        root_dir: Directory to search (str or Path).

    Returns:
        dict of {relative path string: DataFrame}.
    """
    root_path = Path(root_dir)

    dataframes = {}
    # rglob yields entries in filesystem order, which is arbitrary; sort them.
    for file_path in sorted(root_path.rglob('*.csv')):
        if file_path.name.endswith('.geneIDs.csv'):
            continue  # skip the cached gene ID table
        try:
            relative_path = file_path.relative_to(root_path)
            dataframes[str(relative_path)] = pd.read_csv(file_path)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Failed to read {file_path}: {e}")

    return dataframes

def process_csvs(dest_dir, gene_ids=None):
    """
    Prepend a 'Wormbase_Id' column to every CSV under dest_dir, in place.

    Each file's 'ID' column is resolved through lookup_wormbase_id; a short
    match summary is printed per file and the file is then overwritten.

    Args:
        dest_dir: Directory to scan recursively for CSV files (str or Path).
        gene_ids: Lookup dictionary passed to lookup_wormbase_id. When omitted,
            the notebook-level gene_ids_dict is used.

    Notes:
        - A file without an 'ID' column is reported and left untouched.
        - An existing 'Wormbase_Id' column is replaced, so the function can be
          run repeatedly on the same directory.
    """
    dest_dir = Path(dest_dir)
    if gene_ids is None:
        # Fall back to the dictionary the notebook builds at module level.
        gene_ids = gene_ids_dict

    csvs_dict = read_csvs_to_dict(dest_dir)
    for file_path, df in csvs_dict.items():
        output_file = dest_dir / file_path

        if 'ID' not in df.columns:
            print(f"Skipping {output_file.name}: no 'ID' column.")
            print("="*40)
            continue

        if 'Wormbase_Id' in df.columns:
            # Left over from a previous run; recompute rather than fail on insert.
            df = df.drop(columns='Wormbase_Id')

        df.insert(
            0,  # position as the first column
            'Wormbase_Id',
            df['ID'].apply(lambda sequence_id: lookup_wormbase_id(sequence_id, gene_ids))
        )

        num_not_found = int(df['Wormbase_Id'].isna().sum())
        total = len(df)
        num_found = total - num_not_found
        # An empty file has nothing to match; avoid dividing by zero.
        percent_found = num_found / total * 100 if total else 0.0

        print(output_file.name)
        print(f"Found     {num_found:>6,} genes.")
        print(f"Not Found {num_not_found:>6,} genes.")
        print(f"Processed {total:>6,} genes.  {percent_found:.2f}% matched.")
        print("="*40)

        df.to_csv(output_file, index=False)



-----

- **Preprocess**

In [2]:
# Location of data from and to
# - source_data_path:  root folder the input Excel workbooks are read from
# - derived_data_path: root folder the per-sheet CSV files are written to
# Both are relative paths, so they resolve against the kernel's working directory.
from pathlib import Path

source_data_path = Path("../source_data/walker_lab")
derived_data_path = Path("../derived_data/walker_lab")


In [3]:
# To map Sequence IDs to Wormbase IDs we build a gene ID dictionary for quick lookups
import pandas as pd

# The geneIDs CSV is looked for (and created if missing) in the parent of
# derived_data_path. process_csvs reads this module-level name, so this cell
# must run before any cell that calls it.
gene_ids_dict = get_gene_ids_dict(derived_data_path.parent)

In [4]:
import pandas as pd

# Stage pgen.1007812.s009: export every sheet of the workbook as a CSV under
# low_sam_s009, then prepend a Wormbase_Id column to each exported file.
excel_file = source_data_path / "low_sam" / "pgen.1007812.s009.xlsx"
dest_dir   = derived_data_path / "low_sam_s009"

# Sheet name -> CSV path relative to dest_dir
sheet_map = {
    'sams-1 ALL genes': 'all_detected/sams-1_all.csv',
    'sams-1 UP': 'basal_conditions/sams-1_up.csv',
    'sams-1 DOWN': 'basal_conditions/sams-1_down.csv',
    'set-2 ALL': 'all_detected/set-2_all.csv',
    'set-2 UP': 'basal_conditions/set-2_up.csv',
    'set-2 DOWN': 'basal_conditions/set-2_down.csv',
    'set-16 ALL': 'all_detected/set-16_all.csv',
    'set-16 UP': 'basal_conditions/set-16_up.csv',
    'set-16 DOWN': 'basal_conditions/set-16_down.csv'
}

# The context manager closes the workbook's file handle once all sheets are exported.
with pd.ExcelFile(excel_file) as input_excel:
    for sheet in input_excel.sheet_names:
        sheet_df = input_excel.parse(sheet)
        output_file = dest_dir / sheet_map[sheet]
        print(output_file.name)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        sheet_df.to_csv(output_file, index=False)

# Look up each row's 'ID' and rewrite the CSVs with Wormbase_Id as first column.
process_csvs(dest_dir)

sams-1_all.csv
sams-1_up.csv
sams-1_down.csv
set-2_all.csv
set-2_up.csv
set-2_down.csv
set-16_all.csv
set-16_up.csv
set-16_down.csv
set-16_all.csv
Found     16,581 genes.
Not Found    125 genes.
Processed 16,706 genes.  99.25% matched.
set-2_all.csv
Found     16,513 genes.
Not Found    128 genes.
Processed 16,641 genes.  99.23% matched.
sams-1_all.csv
Found     17,729 genes.
Not Found    139 genes.
Processed 17,868 genes.  99.22% matched.
sams-1_down.csv
Found      2,316 genes.
Not Found      5 genes.
Processed  2,321 genes.  99.78% matched.
set-2_up.csv
Found          1 genes.
Not Found      0 genes.
Processed      1 genes.  100.00% matched.
set-16_up.csv
Found         30 genes.
Not Found      0 genes.
Processed     30 genes.  100.00% matched.
sams-1_up.csv
Found      1,198 genes.
Not Found      5 genes.
Processed  1,203 genes.  99.58% matched.
set-2_down.csv
Found          0 genes.
Not Found      2 genes.
Processed      2 genes.  0.00% matched.
set-16_down.csv
Found         49 genes.

In [5]:

# Stage pgen.1007812.s012 (heat-shock tables): export every sheet of the
# workbook as a CSV under low_sam_s012/heat_shock.
excel_file = source_data_path / "low_sam" / "pgen.1007812.s012.xlsx"
dest_dir   = derived_data_path / "low_sam_s012"

# Sheet name -> CSV path relative to dest_dir
sheet_map = {
    'control UP Heat'  :'heat_shock/control_up.csv',
    'control DOWN Heat':'heat_shock/control_down.csv',
    'sams UP Heat'     :'heat_shock/sams-1_up.csv',
    'sams DOWN Heat'   :'heat_shock/sams-1_down.csv',
    'set2 UP Heat'     :'heat_shock/set-2_up.csv',
    'set2 DOWN Heat'   :'heat_shock/set-2_down.csv',
    'set16 UP Heat'    :'heat_shock/set-16_up.csv',
    'set16 DOWN Heat'  :'heat_shock/set-16_down.csv'
}

# The context manager closes the workbook's file handle once all sheets are exported.
with pd.ExcelFile(excel_file) as input_excel:
    for sheet in input_excel.sheet_names:
        sheet_df = input_excel.parse(sheet)
        sheet_df = sheet_df.rename(columns={'wormbase_id': 'Wormbase_Id'})
        if 'Wormbase_Id' in sheet_df.columns:
            # Keep 'Wormbase_Id' as the first column, like the other staged datasets.
            cols = ['Wormbase_Id'] + [col for col in sheet_df.columns if col != 'Wormbase_Id']
            sheet_df = sheet_df[cols]
        output_file = dest_dir / sheet_map[sheet]
        print(output_file.name)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        sheet_df.to_csv(output_file, index=False)



control_up.csv
sams-1_up.csv
set-2_up.csv
set-16_up.csv
control_down.csv
sams-1_down.csv
set-2_down.csv
set-16_down.csv


In [6]:

# Stage elife-79511-supp3-v3: export every sheet of the workbook as a CSV
# under cut_and_tag, normalising the column names on the way.
excel_file = source_data_path / "cut_and_tag" / "elife-79511-supp3-v3.xlsx"
dest_dir   = derived_data_path / "cut_and_tag"

# Sheet name -> CSV path relative to dest_dir
sheet_map = {
    'Legend'        :'cut_and_tag_results/legend.csv',
    'A. Control_15' :'cut_and_tag/control_15.csv',
    'B. Control_37' :'cut_and_tag/control_37.csv',
    'C. S1_15'      :'cut_and_tag/sams-1_15.csv',
    'D. S1_37'      :'cut_and_tag/sams-1_37.csv',
    'E. S4_15'      :'cut_and_tag/sams-4_15.csv',
    'F. S4_37'      :'cut_and_tag/sams-4_37.csv',
    'G. Cat1'       :'cut_and_tag_results/cat1.csv',
    'H. Cat2'       :'cut_and_tag_results/cat2.csv',
    'I. Cat3'       :'cut_and_tag_results/cat3.csv'
}

# The context manager closes the workbook's file handle once all sheets are exported.
with pd.ExcelFile(excel_file) as input_excel:
    for sheet in input_excel.sheet_names:
        sheet_df = input_excel.parse(sheet)
        print(sheet)
        if 'Unnamed: 0' in sheet_df.columns:
            sheet_df = sheet_df.drop(columns='Unnamed: 0')

        # Trim header whitespace and replace spaces with underscores.
        # str() guards against non-string headers (e.g. numeric column labels).
        sheet_df.columns = [str(col).strip().replace(' ', '_') for col in sheet_df.columns]
        if 'Wormbase_ID' in sheet_df.columns:
            # Rename the column
            sheet_df = sheet_df.rename(columns={'Wormbase_ID': 'Wormbase_Id'})
            # Move 'Wormbase_Id' to the first column
            cols = ['Wormbase_Id'] + [col for col in sheet_df.columns if col != 'Wormbase_Id']
            sheet_df = sheet_df[cols]
        output_file = dest_dir / sheet_map[sheet]
        output_file.parent.mkdir(parents=True, exist_ok=True)
        sheet_df.to_csv(output_file, index=False)

Legend
A. Control_15
B. Control_37
C. S1_15
D. S1_37
E. S4_15
F. S4_37
G. Cat1
H. Cat2
I. Cat3


In [7]:

# Stage Supplemental_Table_3: export every sheet of the workbook as a CSV
# under wc1_random.
excel_file = source_data_path / "wormcat_1" / "Supplemental_Table_3.xlsx"
dest_dir   = derived_data_path / "wc1_random"

# Sheet name -> CSV path relative to dest_dir
sheet_map = {
    'Table S3 Legend'      :'wormcat1_results/legend.csv',
    '1 Random_100_genes'   :'random/random_100.csv',
    '2 Random_100_cat1'    :'wormcat1_results/random_100_cat1.csv',
    '3 Random_100_cat2'    :'wormcat1_results/random_100_cat2.csv',
    '4 Random_100_cat3'    :'wormcat1_results/random_100_cat3.csv',
    '5 Random_500_genes'   :'random/random_500.csv',
    '6 Random_500_cat1'    :'wormcat1_results/random_500_cat1.csv',
    '7 Random_500_cat2'    :'wormcat1_results/random_500_cat2.csv',
    '8 Random_500_cat3'    :'wormcat1_results/random_500_cat3.csv',
    '9 Random_1000_genes'  :'random/random_1000.csv',
    '10 Random_1000_cat1'  :'wormcat1_results/random_1000_cat1.csv',
    '11 Random_1000_cat2'  :'wormcat1_results/random_1000_cat2.csv',
    '12 Random_1000_cat3'  :'wormcat1_results/random_1000_cat3.csv',
    '13 Random_1500_genes' :'random/random_1500.csv',
    '14 Random_1500_cat1'  :'wormcat1_results/random_1500_cat1.csv',
    '15 Random_1500_cat2'  :'wormcat1_results/random_1500_cat2.csv',
    '16 Random_1500_cat3'  :'wormcat1_results/random_1500_cat3.csv'
}

# The context manager closes the workbook's file handle once all sheets are exported.
with pd.ExcelFile(excel_file) as input_excel:
    for sheet in input_excel.sheet_names:
        sheet_df = input_excel.parse(sheet)
        print(sheet)
        if 'Unnamed: 0' in sheet_df.columns:
            sheet_df = sheet_df.drop(columns='Unnamed: 0')
        if 'Wormbase.ID' in sheet_df.columns:
            # Rename the column and move it to the first position
            sheet_df = sheet_df.rename(columns={'Wormbase.ID': 'Wormbase_Id'})
            cols = ['Wormbase_Id'] + [col for col in sheet_df.columns if col != 'Wormbase_Id']
            sheet_df = sheet_df[cols]
        output_file = dest_dir / sheet_map[sheet]
        output_file.parent.mkdir(parents=True, exist_ok=True)
        sheet_df.to_csv(output_file, index=False)

Table S3 Legend
1 Random_100_genes
2 Random_100_cat1
3 Random_100_cat2
4 Random_100_cat3
5 Random_500_genes
6 Random_500_cat1
7 Random_500_cat2
8 Random_500_cat3
9 Random_1000_genes
10 Random_1000_cat1
11 Random_1000_cat2
12 Random_1000_cat3
13 Random_1500_genes
14 Random_1500_cat1
15 Random_1500_cat2
16 Random_1500_cat3


In [8]:
# Process Supplemental_Table_5 Data of Wormcat Paper 1
# Every sheet of the workbook is exported as a CSV under wc1_sams.

excel_file = source_data_path / "wormcat_1" / "Supplemental_Table_5.xlsx"
dest_dir   = derived_data_path / "wc1_sams"

# Sheet name -> CSV path relative to dest_dir
sheet_map = {
    'Table S5 Legend'      :'wormcat1_results/legend.csv',
    '1 Cat1'               :'wormcat1_results/cat1.csv',
    '2 Cat2'               :'wormcat1_results/cat2.csv',
    '3 Cat3'               :'wormcat1_results/cat3.csv',
    '4 sams_up_genes'      :'sams/sams_up.csv',
    '5 sams_down_genes'    :'sams/sams_down.csv',
    '6 sams_up_CH_genes'   :'sams/sams_up_ch.csv',
    '7 sams_down_CH_genes' :'sams/sams_down_ch.csv'
}

# The context manager closes the workbook's file handle once all sheets are exported.
with pd.ExcelFile(excel_file) as input_excel:
    for sheet in input_excel.sheet_names:
        sheet_df = input_excel.parse(sheet)
        print(sheet)
        if 'Unnamed: 0' in sheet_df.columns:
            sheet_df = sheet_df.drop(columns='Unnamed: 0')
        if 'Wormbase.ID' in sheet_df.columns:
            # Rename the column and move it to the first position
            sheet_df = sheet_df.rename(columns={'Wormbase.ID': 'Wormbase_Id'})
            cols = ['Wormbase_Id'] + [col for col in sheet_df.columns if col != 'Wormbase_Id']
            sheet_df = sheet_df[cols]
        output_file = dest_dir / sheet_map[sheet]
        output_file.parent.mkdir(parents=True, exist_ok=True)
        sheet_df.to_csv(output_file, index=False)

Table S5 Legend
1 Cat1
2 Cat2
3 Cat3
4 sams_up_genes
5 sams_down_genes
6 sams_up_CH_genes
7 sams_down_CH_genes


In [9]:
# Group Supplemental_Table_S8 Data into Subdirectories for Wormcat Batch Execution
# Every sheet of the workbook is exported as a CSV under wc2_peptides.

excel_file = source_data_path / "wormcat_2" / "Table S8.xlsx"
dest_dir   = derived_data_path / "wc2_peptides"

# Sheet name -> CSV path relative to dest_dir
sheet_map = {
    'Legend'                       :'wormcat2_results/legend.csv',
    '1. Cat1'                      :'wormcat2_results/cat1.csv',
    '2. Cat2'                      :'wormcat2_results/cat2.csv',
    '3. Cat3'                      :'wormcat2_results/cat3.csv',
    '4. all_detected_peptides_cat' :'peptides/all_detected_peptides.csv',
    '5. aging.change_cat'          :'peptides/aging_change.csv',
    '6. Cytoplasm_cat'             :'peptides/cytoplasm.csv'
}

# The context manager closes the workbook's file handle once all sheets are exported.
with pd.ExcelFile(excel_file) as input_excel:
    for sheet in input_excel.sheet_names:
        sheet_df = input_excel.parse(sheet)
        print(sheet)
        if 'Unnamed: 0' in sheet_df.columns:
            sheet_df = sheet_df.drop(columns='Unnamed: 0')
        if 'Wormbase.ID' in sheet_df.columns:
            # Rename the column and move it to the first position
            sheet_df = sheet_df.rename(columns={'Wormbase.ID': 'Wormbase_Id'})
            cols = ['Wormbase_Id'] + [col for col in sheet_df.columns if col != 'Wormbase_Id']
            sheet_df = sheet_df[cols]
        output_file = dest_dir / sheet_map[sheet]
        output_file.parent.mkdir(parents=True, exist_ok=True)
        sheet_df.to_csv(output_file, index=False)
    

Legend
1. Cat1
2. Cat2
3. Cat3
4. all_detected_peptides_cat
5. aging.change_cat
6. Cytoplasm_cat
