## Environment
Using Kernel: `dan-dev-py312-r433`

-----


# Kenyon Lab Preprocess

#### Stage the data

- Stage the data from **source_data** into **derived_data**, adding a `Wormbase_Id` column mapped from each row's sequence ID

**Files Derived:**
> - kenyon.csv

-----
- **Helper functions**

In [None]:

# Functions used to Map Sequence IDs to Wormbase IDs

from pathlib import Path
import pandas as pd
from pub_worm.wormbase import wormbase_util as wb
import os

def get_gene_ids_dict(working_dir_path):
    """Build a lookup table from gene identifiers to gene-ID records.

    The gene-ID CSV for the current WormBase release is looked for in
    ``working_dir_path``. When it is missing, the gene-ID file is downloaded
    and converted to CSV through ``pub_worm``, and the downloaded
    intermediate file is deleted afterwards.

    Args:
        working_dir_path: Directory (str or Path) that holds, or will
            receive, ``c_elegans.PRJNA13758.<version>.geneIDs.csv``.

    Returns:
        dict: Maps the upper-cased ``Wormbase_Id``, ``Gene_name`` and
        ``Sequence_id`` of every CSV row to that row as a plain dict.
        Blank identifiers are skipped, so an empty lookup key never
        resolves to an unrelated gene. All keys of one row share the same
        dict object.
    """
    wormbase_version = wb.current_wormbase_version()

    gene_ids_csv = Path(working_dir_path) / f"c_elegans.PRJNA13758.{wormbase_version}.geneIDs.csv"
    if not gene_ids_csv.exists():
        gene_ids_txt = wb.download_gene_ids(wormbase_version, working_dir_path)
        # NOTE(review): the converter's return value is used as the CSV path
        # from here on -- presumably the file it just wrote; confirm in pub_worm.
        gene_ids_csv = wb.gene_ids_to_csv(wormbase_version, working_dir_path, status_live=False)
        # Guard against a falsy return so os.path.exists() cannot raise.
        if gene_ids_txt and os.path.exists(gene_ids_txt):
            os.remove(gene_ids_txt)

    # Missing cells become '' so every identifier can be treated as a string.
    gene_ids_df = pd.read_csv(gene_ids_csv).fillna('')

    gene_ids_dict = {}
    # One dict per row, built once, instead of iterrows() + three to_dict() calls.
    for record in gene_ids_df.to_dict(orient='records'):
        for key in ('Wormbase_Id', 'Gene_name', 'Sequence_id'):
            id_val = str(record[key]).upper()
            if id_val:  # skip blanks introduced by fillna('')
                gene_ids_dict[id_val] = record

    return gene_ids_dict

def lookup_wormbase_id(sequence_id, gene_ids_dict):
    """Return the ``Wormbase_Id`` for an identifier, or ``None`` if unmatched.

    The identifier is converted to ``str`` and handed to
    ``wb._lookup_wormbase_id`` together with ``gene_ids_dict``. When that
    helper returns a record, its ``'Wormbase_Id'`` entry is returned;
    when it returns ``None``, so does this function.
    """
    match = wb._lookup_wormbase_id(str(sequence_id), gene_ids_dict)
    return None if match is None else match['Wormbase_Id']


-----

- **Preprocess**

In [14]:
# Relative paths; they resolve against the kernel's working directory.
# Folder the input workbook (mmc5.xlsx) is read from.
source_data_path = "../source_data/kenyon_lab"
# Folder the staged output (kenyon.csv) is written to.
derived_data_path = "../derived_data/kenyon_lab"

In [7]:
# To map Sequence IDs to Wormbase IDs we build a gene ID dictionary for quick lookups.
# The working directory passed in is the parent of derived_data_path
# (i.e. ../derived_data), so the gene-ID CSV lives outside the kenyon_lab subfolder.
import pandas as pd

gene_ids_dict = get_gene_ids_dict(Path(derived_data_path).parent)

In [None]:
# Read the desired Excel (default is the first sheet) and skip the first 6 rows;
# the row after those supplies the column names (pandas' default header row).
# The context manager closes the workbook's file handle as soon as parsing is done.

with pd.ExcelFile(f'{source_data_path}/mmc5.xlsx') as input_excel:
    df = input_excel.parse(skiprows=6)

#df.head()

  warn(msg)


In [10]:
# Map the Sequence IDs to the Wormbase_Id

wormbase_ids = df['Worm.Gene'].apply(lookup_wormbase_id, args=(gene_ids_dict,))

# DataFrame.insert raises ValueError if the column already exists, so drop any
# copy left by a previous run of this cell; this keeps the cell safe to re-run.
df = df.drop(columns='Wormbase_Id', errors='ignore')
df.insert(0, 'Wormbase_Id', wormbase_ids)  # position 0 = first column

In [11]:
# Report how many rows received a Wormbase_Id and how many did not.

total = len(df)
num_found = df['Wormbase_Id'].notna().sum()
num_not_found = total - num_found
percent_found = num_found / total * 100

summary_lines = [
    f"Found     {num_found:>6,} genes.",
    f"Not Found {num_not_found:>6,} genes.",
    f"Processed {total:>6,} genes.  {percent_found:.2f}% matched.",
    "="*40,
]
print("\n".join(summary_lines))

Found        612 genes.
Not Found     15 genes.
Processed    627 genes.  97.61% matched.


In [15]:
# Create the output CSV file with the Wormbase IDs
import os
# Create the output folder if it does not exist yet; a no-op when it already does.
os.makedirs(derived_data_path, exist_ok=True)
output_file = f"{derived_data_path}/kenyon.csv"
# index=False keeps the DataFrame's row index out of the file.
df.to_csv(output_file, index=False)