### Download and Extract the Latest Gene IDs from WormBase

This code automates the process of downloading and extracting gene IDs from WormBase. 

1. Get the latest WormBase version
2. Download the gene ID file for that version, following the WormBase file-naming convention
3. Extract the live genes to a CSV-formatted file

In [2]:
from pub_worm.wormbase.wormbase_util import (
    current_wormbase_version,
    download_gene_ids,
    extract_live_gene_ids,
    download_annotation_file,
    annotation_files_list,
)

# Every file fetched or generated below is written under this directory.
output_dir = "./wormbase_data"

# Step 1: ask WormBase which release is current (e.g. "WS293").
wormbase_version = current_wormbase_version()
print(f"The latest Wormbase Version is {wormbase_version}")

# Step 2: fetch the geneIDs file for that release.
download_gene_ids(wormbase_version, output_dir)

# Step 3: reduce the downloaded geneIDs file to live genes only (written as CSV).
extract_live_gene_ids(wormbase_version, output_dir)

# Step 4: fetch the expr_graph annotation file (described in this notebook as
# the life-stage data) for the same release.
download_annotation_file(wormbase_version, "expr_graph.csv.gz", output_dir)

# Optional: uncomment to also fetch the functional descriptions file.
#download_annotation_file(wormbase_version, "functional_descriptions.txt.gz", output_dir)

# Show the names of the other annotation files available for this release.
annotation_files = annotation_files_list(wormbase_version)
print(annotation_files)

The latest Wormbase Version is WS293
Downloaded: ./wormbase_data/c_elegans.PRJNA13758.WS293.geneIDs.txt.gz
Unzipped: ./wormbase_data/c_elegans.PRJNA13758.WS293.geneIDs.txt.gz
Processed file saved to: ./wormbase_data/c_elegans.PRJNA13758.WS293.geneIDs.csv
Downloaded: ./wormbase_data/c_elegans.PRJNA13758.WS293.expr_graph.csv.gz
Unzipped: ./wormbase_data/c_elegans.PRJNA13758.WS293.expr_graph.csv.gz
['RNASeq_controls_FPKM.dat', 'SRA_gene_expression.tar.gz', 'TAR_gene_expression.tar.gz', 'TSS.wig.tar.gz', 'affy_oligo_mapping.txt.gz', 'agil_oligo_mapping.txt.gz', 'alaska_ids.tsv.gz', 'anatomy_association.wb.gz', 'cdna2orf.txt.gz', 'changed_CGC_names.txt', 'confirmed_genes.fa.gz', 'development_association.wb.gz', 'disease_association.daf.txt.gz', 'expr_graph.csv.gz', 'functional_descriptions.txt.gz', 'geneIDs.txt.gz', 'geneOtherIDs.txt.gz', 'gene_association.wb.gz', 'gene_product_info.gpi.gz', 'gene_product_info.gpi2.gz', 'gsc_oligo_mapping.txt.gz', 'interactions.txt.gz', 'interpolated_clones

## Provide some summary info on Gene IDs

In [None]:
import pandas as pd

# Build the path from output_dir (set in the download cell) rather than
# repeating the directory literal, so changing output_dir there keeps this
# cell pointing at the file that was actually written.
gene_ids_csv = f"{output_dir}/c_elegans.PRJNA13758.{wormbase_version}.geneIDs.csv"
gene_ids_df = pd.read_csv(gene_ids_csv)

# Count how many rows fall under each value of the Gene_Type column.
unique_gene_types = gene_ids_df["Gene_Type"].value_counts()
print(unique_gene_types)

### Download WormCat CSV File

This code downloads a WormCat CSV file from a wormcat.com URL and saves it to a designated output directory. 
It ensures that the directory exists before saving the file.


#### Example Execution:
- The function downloads the file `whole_genome_v2_nov-11-2021.csv` from the WormCat website and saves it to the `./wormbase_data` directory (or any other specified directory).

In [11]:
import os
import requests
import shutil

def _download_url(file_url, output_file_path, timeout=30):
    """Stream the content at *file_url* into *output_file_path*.

    Args:
        file_url: URL to fetch with an HTTP GET.
        output_file_path: Destination path; only created once the whole body
            has been received.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the call indefinitely.

    Returns:
        True if the file was downloaded, False if the server answered with a
        status code other than 200 (a message is printed in both cases).

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the destination cannot be written.
    """
    # Download into a sibling temporary file so an interrupted transfer never
    # leaves a truncated file at the final path (callers that skip existing
    # files would otherwise treat it as a completed download).
    partial_path = f"{output_file_path}.part"

    # The context manager releases the streamed connection on every path.
    with requests.get(file_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download: {file_url} (status code: {response.status_code})")
            return False

        try:
            with open(partial_path, 'wb') as f:
                # iter_content applies gzip/deflate content decoding, which a
                # plain copy of response.raw does not.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        except BaseException:
            # Clean up the incomplete file, then let the error propagate.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    os.replace(partial_path, output_file_path)
    print(f"Downloaded: {output_file_path}")
    return True

def download_wormcat_csv(
    output_dir="./",
    url="http://www.wormcat.com/static/download/whole_genome_v2_nov-11-2021.csv",
):
    """Download a WormCat CSV file into *output_dir* unless it is already there.

    Args:
        output_dir: Directory to save the file in; created if missing.
        url: Location of the CSV. The file is saved under the last path
            segment of this URL. Defaults to the whole-genome v2
            (nov-11-2021) annotation file.

    Returns:
        None.

    Raises:
        ValueError: If *url* has no file name after its last "/".
    """
    output_filename = url.split("/")[-1]  # Get the filename from the URL
    if not output_filename:
        raise ValueError(f"Cannot derive a file name from URL: {url}")

    os.makedirs(output_dir, exist_ok=True)
    output_file_path = os.path.join(output_dir, output_filename)

    if os.path.exists(output_file_path):
        print(f"File already exists: {output_file_path}. Skipping download.")
        return

    _download_url(url, output_file_path)

    # _download_url reports a failed request by printing, so confirm the file
    # actually landed before announcing success.
    if os.path.exists(output_file_path):
        print(f"File downloaded to: {output_file_path}")



In [None]:
# Save the WormCat CSV into ./wormbase_data; the download is skipped when the
# file is already present there.
download_wormcat_csv("./wormbase_data")

# Appendix

In [None]:
# Install or upgrade pub_worm, the package that provides the wormbase_util
# helpers imported in the first code cell.
!pip install --upgrade pub_worm