## Environment
Using Kernel: `dan-dev-py312-r433`

-----


In [1]:
# Where the source data is read from and where the derived data is written to
import os
from pathlib import Path

# The project root can be overridden with the WORMCAT3_PAPER_ROOT environment
# variable so the notebook runs on other machines; the default is the original
# checkout location.
root_dir_path = Path(
    os.environ.get("WORMCAT3_PAPER_ROOT", "/Users/dan/Code/Python/wormcat3_paper")
).expanduser()
source_root_path = root_dir_path / "source_data" / "kenyon_lab"
derived_root_path = root_dir_path / "derived_data" / "kenyon_lab"

# Preprocess

### Stage the original source data

- The source spreadsheet (`mmc5.xlsx`) is annotated with Wormbase IDs and saved as a CSV file that Wormcat3 can consume directly

In [2]:
# Functions used to Map Sequence IDs to Wormbase IDs

from pathlib import Path
import pandas as pd
from pub_worm.wormbase import wormbase_util as wb
import os

def get_gene_ids_dict(working_dir_path):
    """Build an upper-cased ID -> gene-record lookup from the WormBase gene ID table.

    The table for the current WormBase release is cached as a CSV inside
    ``working_dir_path``; it is downloaded and converted only when that CSV
    is not already present.

    Args:
        working_dir_path: Directory (str or Path) that holds, or will receive,
            ``c_elegans.PRJNA13758.<release>.geneIDs.csv``.

    Returns:
        dict: Maps the upper-cased ``Wormbase_Id``, ``Gene_name`` and
        ``Sequence_id`` of every row to that row as a dict. Blank IDs are
        skipped. The keys of one row share a single dict object, so treat the
        records as read-only. When two rows yield the same key, the later row
        wins.
    """
    wormbase_version = wb.current_wormbase_version()

    gene_ids_csv = Path(working_dir_path) / f"c_elegans.PRJNA13758.{wormbase_version}.geneIDs.csv"
    if not gene_ids_csv.exists():
        gene_ids_txt = wb.download_gene_ids(wormbase_version, working_dir_path)
        # NOTE(review): status_live=False presumably keeps non-live genes as well -- confirm in pub_worm
        gene_ids_csv = wb.gene_ids_to_csv(wormbase_version, working_dir_path, status_live=False)
        # Remove the file returned by the download step once the CSV exists;
        # tolerate a helper that returns nothing.
        if gene_ids_txt and os.path.exists(gene_ids_txt):
            os.remove(gene_ids_txt)

    # Missing values become '' so every ID column can be handled as a string
    gene_ids_df = pd.read_csv(gene_ids_csv).fillna('')

    gene_ids_dict = {}
    for _, row in gene_ids_df.iterrows():
        record = row.to_dict()  # built once per row instead of once per key
        for key in ('Wormbase_Id', 'Gene_name', 'Sequence_id'):
            id_val = str(record[key]).upper()
            # A blank ID must not become a key: otherwise an empty lookup
            # would resolve to whichever row last had a missing value.
            if id_val:
                gene_ids_dict[id_val] = record

    return gene_ids_dict

def lookup_wormbase_id(sequence_id, gene_ids_dict):
    """Return the Wormbase ID for one identifier, or None when it cannot be resolved.

    Args:
        sequence_id: Identifier to resolve; non-string values are converted
            with ``str`` before the lookup.
        gene_ids_dict: Lookup dictionary passed through to
            ``wb._lookup_wormbase_id``.

    Returns:
        The ``'Wormbase_Id'`` entry of the matched record, or ``None`` when
        the identifier is missing, blank, or has no match.
    """
    # Missing cells reach us as None/NaN/NA; without this guard they would be
    # looked up as the literal strings "None"/"nan".
    if pd.api.types.is_scalar(sequence_id) and pd.isna(sequence_id):
        return None

    sequence_id = str(sequence_id)
    if not sequence_id.strip():
        return None

    found_wormbase_id = wb._lookup_wormbase_id(sequence_id, gene_ids_dict)
    if found_wormbase_id is not None:
        return found_wormbase_id['Wormbase_Id']
    return None



In [3]:
# Build the gene ID dictionary once up front so that mapping each Sequence ID
# to its Wormbase ID is a quick lookup.
import pandas as pd

gene_ids_dict = get_gene_ids_dict(working_dir_path=derived_root_path.parent)

Downloaded: /Users/dan/Code/Python/wormcat3_paper/derived_data/c_elegans.PRJNA13758.WS296.geneIDs.txt.gz
Unzipped: /Users/dan/Code/Python/wormcat3_paper/derived_data/c_elegans.PRJNA13758.WS296.geneIDs.txt.gz
Processed file saved to: /Users/dan/Code/Python/wormcat3_paper/derived_data/c_elegans.PRJNA13758.WS296.geneIDs.csv


In [4]:
import pandas as pd

# Open the workbook in a context manager so its file handle is released as
# soon as the sheet has been parsed.
with pd.ExcelFile(f'{source_root_path}/mmc5.xlsx') as input_excel:
    # Read the desired sheet (default is the first sheet) and skip the first 6 rows
    df = input_excel.parse(skiprows=6)

df.head()

  warn(msg)


Unnamed: 0,Proteins,Log2.Ratio.L.M.normalized.Exp1,Log2.Ratio.L.H.normalized.Exp1,Log2.Ratio.M.H.normalized.Exp1,Log2.Ratio.L.M.normalized.Exp2,Log2.Ratio.L.H.normalized.Exp2,Log2.Ratio.M.H.normalized.Exp2,Log2.Ratio.L.M.normalized.Exp3,Log2.Ratio.L.H.normalized.Exp3,Log2.Ratio.M.H.normalized.Exp3,Mean.p.value,Std.error.of.mean,Log2.Ratio.L.H.normalized.mean,Log2.Ratio.L.M.normalized.mean,Log2.Ratio.M.H.normalized.mean,First.UniProt.ID,Worm.Gene,Human.ENSP,Human.ENSG
0,A3FPJ3,0.119401,-1.422287,-1.227402,-0.28558,-0.86132,0.024972,-0.831472,-2.198023,-1.245009,0.046966,3.07e-06,-1.493877,-0.33255,-0.815813,A3FPJ3,F07G6.10,,
1,A3FPK9,0.738842,1.800274,1.086262,1.070179,2.211465,1.045595,1.396745,2.306288,0.844354,2.3e-05,2.35e-09,2.106009,1.068589,0.992071,A3FPK9,ttr-34,,
2,A4F333,0.114106,1.373271,1.274704,-0.379843,1.535248,1.908117,0.738229,2.237697,1.511999,0.001028,8.72e-07,1.715405,0.157497,1.56494,A4F333,fbxa-37,,
3,A5HU91,1.101536,2.097456,1.006942,1.625222,2.571064,0.965418,2.015841,2.627716,0.651609,4.2e-05,3.48e-09,2.432079,1.580866,0.874656,A5HU91,ZK813.7,,
4,A5JYT3,-0.166458,-1.457384,-1.155231,-0.830783,-1.228295,-0.611692,-0.876723,-1.331447,-0.314523,0.004159,4.83e-07,-1.339042,-0.624655,-0.693815,A5JYT3,C01H6.8,,


In [5]:
# Get the Wormbase IDs

wormbase_ids = df['Worm.Gene'].apply(
    lambda sequence_id: lookup_wormbase_id(sequence_id, gene_ids_dict)
)

# DataFrame.insert raises ValueError when the column already exists, so
# overwrite it in place when this cell is re-run.
if 'Wormbase_Id' in df.columns:
    df['Wormbase_Id'] = wormbase_ids
else:
    df.insert(0, 'Wormbase_Id', wormbase_ids)  # position as the first column

In [6]:
# Show some Summary Stats

total = len(df)
num_not_found = df['Wormbase_Id'].isna().sum()
num_found = total - num_not_found
# An empty table has nothing to match: report 0% instead of dividing by zero
percent_found = num_found / total * 100 if total else 0.0


print(f"Found     {num_found:>6,} genes.")
print(f"Not Found {num_not_found:>6,} genes.")
print(f"Processed {total:>6,} genes.  {percent_found:.2f}% matched.")
print("="*40)

Found        612 genes.
Not Found     15 genes.
Processed    627 genes.  97.61% matched.


In [7]:
# Write the table, now carrying the Wormbase IDs, to the derived-data folder as CSV
derived_root_path.mkdir(parents=True, exist_ok=True)

output_file = f"{derived_root_path}/kenyon.csv"
df.to_csv(output_file, index=False)