In [None]:
# Notebook setup: extend sys.path, import WormbaseAPI, and report which file
# the class was actually loaded from.
import sys
from pathlib import Path

# Add pub_worm directory to the Python path
# Position 0 means this entry is searched before every other sys.path entry.
# NOTE(review): assumes the notebook runs from a directory directly inside the
# pub_worm checkout, so that Path.cwd().parent is the package root — confirm.
sys.path.insert(0, str(Path.cwd().parent))

# Test where we are loading the pub_worm objects from
import inspect
from pub_worm.wormbase.wormbase_api import WormbaseAPI

# inspect.getmodule() can return None; hasattr(None, "__file__") is False, so
# that case also ends up in the else branch below.
module = inspect.getmodule(WormbaseAPI)
if hasattr(module, "__file__"):
    print("WormbaseAPI imported from:", module.__file__)
else:
    print("Could not determine the file path.")

# Map Wormbase IDs

This API maps **Sequence IDs** to the latest **Wormbase Gene IDs**.

### How it works:
- The application first checks if the latest gene ID mapping file is available locally.
- If the file is not found, it downloads the most recent mapping data from Wormbase.
- The Sequence IDs provided in your file are then mapped to their corresponding current Wormbase Gene IDs using this data.


### Installation

To install the package, run:

```bash
pip install pub-worm
```
- **Recommended**: Use within a **Conda environment** running **Python 3.12 or later** for optimal compatibility and performance.

### Available Parameters

- **sequence_ids_file_path** *(required)*  
  Path to the input file containing sequence IDs.

- **column_name** *(optional, default: `'ID'`)*  
  Name of the column in the input file that contains the sequence IDs to be mapped.

- **working_directory** *(optional, default: the directory containing `sequence_ids_file_path`)*  
  Directory where output and intermediate files will be saved.

- **gene_ids_df** *(optional)*  
  A `pandas.DataFrame` containing the gene ID mapping data.  
  If not provided, the latest mapping file is used: a local copy if one is available, otherwise it is downloaded automatically from [wormbase.org](https://wormbase.org).

In [None]:
# The simplest usage of map_wormbase_ids is to pass in sequence_ids.csv with a header of ID
# Only the required argument is supplied, so the documented defaults for the
# optional parameters (see the parameter list above) apply.
from pub_worm.wormbase.wormbase_util import map_wormbase_ids
 
# Relative path: resolved against the directory the notebook is run from.
# This variable is reused by the next code cell.
sequence_ids_file_path = "./wormbase_data/sequence_ids.csv"
map_wormbase_ids(sequence_ids_file_path)

In [None]:
# Same mapping as the previous cell, but with the gene ID table passed in
# explicitly through gene_ids_df instead of letting the function obtain it.
import pandas as pd
# NOTE(review): the file name suggests WormBase release WS296 gene IDs for
# C. elegans (PRJNA13758) — confirm.
wS296_gene_ids_df = pd.read_csv("./wormbase_data/c_elegans.PRJNA13758.WS296.geneIDs.csv")
# map_wormbase_ids and sequence_ids_file_path come from the previous code cell,
# so that cell must be run first.
map_wormbase_ids(sequence_ids_file_path, gene_ids_df=wS296_gene_ids_df)


In [None]:

# Load the input sequence IDs into a DataFrame and show it as the cell output
# (the bare expression on the last line is what the notebook renders).
import pandas as pd
sequence_ids_df =pd.read_csv("./wormbase_data/sequence_ids.csv")
sequence_ids_df

In [None]:
# Load gProfiler_celegans.csv into a DataFrame and show it as the cell output.
# NOTE(review): presumably a g:Profiler result kept for comparison with the
# pub_worm mapping; its role is not shown in this notebook — confirm.
import pandas as pd
gprofiler_df =pd.read_csv("./wormbase_data/gProfiler_celegans.csv")
gprofiler_df

In [None]:


# Load wormbase_ids_not_found.csv and show it as the cell output.
# NOTE(review): presumably written by map_wormbase_ids for IDs it could not
# map — confirm. Relies on `pd` having been imported by an earlier cell.
not_found_pub_worm_df = pd.read_csv('./wormbase_data/wormbase_ids_not_found.csv')
not_found_pub_worm_df

### Running the Application from the Command Line

To use this tool from the command line:

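1. Save the script below as `run_map_wormbase_ids.py`.
2. Run it with `python`, passing the input file path and, optionally, the column name and output directory (see the examples that follow).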

#### Example Usage

- Run with default column (`ID`) and output to the same directory as the input file:  
  `python run_map_wormbase_ids.py path/to/sequence_ids.csv`

- Run with a custom column name (`Wormbase_Id`) and specify an output directory:  
  `python run_map_wormbase_ids.py path/to/sequence_ids.csv Wormbase_Id ./output_dir`

```python
# save this as: run_map_wormbase_ids.py

import sys
from pathlib import Path
from pub_worm.wormbase.wormbase_util import map_wormbase_ids

def main():
    """Run map_wormbase_ids using positional command-line arguments.

    Arguments, in order:
      1. sequence_ids_file_path (required) - path to the input file.
      2. column_name (optional)            - falls back to "ID".
      3. working_directory (optional)      - falls back to the directory
         containing the input file.

    Prints a usage line and exits with status 1 when no input path is given.
    Any arguments after the third are ignored.
    """
    cli_args = sys.argv[1:]

    # Guard clause: nothing to do without the required input path.
    if not cli_args:
        print("Usage: python run_map_wormbase_ids.py <sequence_ids_file_path> [column_name] [working_directory]")
        sys.exit(1)

    sequence_ids_file_path = Path(cli_args[0])

    if len(cli_args) >= 2:
        column_name = cli_args[1]
    else:
        column_name = "ID"

    if len(cli_args) >= 3:
        working_directory = Path(cli_args[2])
    else:
        working_directory = sequence_ids_file_path.parent

    # NOTE(review): the notebook's parameter list names this option
    # `working_directory`, while the keyword passed here is `working_dir_path`;
    # confirm against the map_wormbase_ids signature.
    map_wormbase_ids(
        sequence_ids_file_path=sequence_ids_file_path,
        column_name=column_name,
        working_dir_path=working_directory,
    )

if __name__ == "__main__":
    main()
```