# Create Batches of Wormbase Gene Ids

* This notebook creates functions that are coded and tested on a small sample of genes
* Once the code is validated the code is Lifted and Shifted to Nextflow for concurrent processing

In [38]:
!pip install --upgrade pub_worm

Collecting pub_worm
  Downloading pub_worm-0.2.4.tar.gz (385 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.5/385.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pub_worm
  Building wheel for pub_worm (setup.py) ... [?25ldone
[?25h  Created wheel for pub_worm: filename=pub_worm-0.2.4-py3-none-any.whl size=389854 sha256=1d09c037fd6d3c220bb91e02bfbf426a8009acb8a8087c6a794e61649c9b5e7e
  Stored in directory: /home/dan/.cache/pip/wheels/68/9f/54/07fbeafab5e8f7c105ea2c2126244ce71b449eefa43865c228
Successfully built pub_worm
Installing collected packages: pub_worm
  Attempting uninstall: pub_worm
    Found existing installation: pub-worm 0.2.3
    Uninstalling pub-worm-0.2.3:
      Successfully uninstalled pub-worm-0.2.3
Successfully installed pub_worm-0.2.4


In [7]:
import os
import time
from datetime import datetime
import math
import requests
import json
import csv
import psutil

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Get the API
from pub_worm.wormbase.wormbase_api import WormbaseAPI
from pub_worm.impact_factor.impact_factor_lookup import get_impact_factor
from pub_worm.ncbi.entreze_api import EntrezAPI


Full path to the file: /home/dan/miniconda3/envs/dan-dev-sc/lib/python3.9/site-packages/pub_worm/impact_factor/data/2022_JCR_IF.csv


In [4]:
# Get the most current Wormbase DB
def current_wormbase_version():
    api_url = f'http://rest.wormbase.org//rest/database/version'
    # Absolutley no error checking is done!!
    response = requests.get(api_url)
    json_data = json.loads(response.text)
    if  response.status_code == 200:
        return json_data['data']
    else:
        return {'error':'something is not right'}
    
current_wormbase_version()

'WS291'

# Get Wormbase Data From the Source

In [5]:
%%bash
# Pull down data from Wormbase and unzip
INPUT_DATA="./input_data"

get_wormbase_data() {
    local WORMBASE_VERSION="$1"
    local FILE_ROOT="$2"
    local BASE_FTP="ftp://ftp.wormbase.org/pub/wormbase/releases"
    local SPECIES_DIR="species/c_elegans/PRJNA13758/annotation"
    local FILE_PREFIX="c_elegans.PRJNA13758"

    wget -nv -P ${INPUT_DATA} ${BASE_FTP}/${WORMBASE_VERSION}/${SPECIES_DIR}/${FILE_PREFIX}.${WORMBASE_VERSION}.${FILE_ROOT}
    gunzip -f ${INPUT_DATA}/${FILE_PREFIX}.${WORMBASE_VERSION}.${FILE_ROOT}
}

# Pull down geneIDs.txt
get_geneids() {
    local WORMBASE_VERSION="$1"
    get_wormbase_data $WORMBASE_VERSION "geneIDs.txt.gz"
}

# Pull down functional_descriptions.txt
get_functional_descriptions() {
    local WORMBASE_VERSION="$1"
    get_wormbase_data $WORMBASE_VERSION "functional_descriptions.txt.gz"
}

create_geneids_csv() {
    local WORMBASE_VERSION="$1"
    local FILE_PREFIX="c_elegans.PRJNA13758"
    local FILE_ROOT="geneIDs.txt"
    gene_ids_txt="${INPUT_DATA}/${FILE_PREFIX}.${WORMBASE_VERSION}.${FILE_ROOT}"
    # Create GeneIDs.csv
    gene_ids_csv=$(echo "$gene_ids_txt" | sed 's/.\{4\}$//') # remove .txt
    gene_ids_csv="${gene_ids_csv}.csv"                       # add .csv

    # Drop the first column and Only include Live genes
    awk -F',' '$5=="Live" {print $2","$3","$4","$6}' "$gene_ids_txt" > "$gene_ids_csv"
    # Add Header line  
    sed -i '1iWormbase_Id,Gene_name,Sequence_id,Gene_Type' "$gene_ids_csv"
    echo created $gene_ids_csv    
}

# Get GeneId data from the old version used for WormCat and the latest version on wormbase
get_geneids "WS291"
create_geneids_csv "WS291"

# Get the get_functional_descriptions of genes for the lastest Wormbase version
# This is cool but we use the API as it has more data
#get_functional_descriptions "WS291"


2024-03-12 18:00:02 URL: ftp://ftp.wormbase.org/pub/wormbase/releases/WS291/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS291.geneIDs.txt.gz [416663] -> "./input_data/c_elegans.PRJNA13758.WS291.geneIDs.txt.gz" [1]


created ./input_data/c_elegans.PRJNA13758.WS291.geneIDs.csv


# Create Gene Id Batches from Source Data

* Load geneIDs.csv
* Drop piRNA
* Dead Genes have already been dropped by the `create_geneids_csv` process


In [6]:
# Create Wormbase Batches
input_data="./input_data"
file_name="c_elegans.PRJNA13758.WS291.geneIDs.csv"

gene_ids_df = pd.read_csv(f"{input_data}/{file_name}")
gene_ids_df = gene_ids_df[gene_ids_df["Gene_Type"] != "piRNA_gene"]
gene_ids_df.reset_index(drop=True, inplace=True)

# Calculate the batch size
batch_size = len(gene_ids_df) // 10

# Create a batch ID column
gene_ids_df = gene_ids_df.copy()
gene_ids_df["batch_id"] = (gene_ids_df.index // batch_size) + 1

# Iterate over unique batch IDs
for batch_id in gene_ids_df["batch_id"].unique():
    # Filter DataFrame for the current batch ID
    batch_df = gene_ids_df[gene_ids_df["batch_id"] == batch_id].copy()
    # Drop the batch_id column as it's no longer needed for individual batch files
    batch_df.drop("batch_id", axis=1, inplace=True)
    # Write the batch to a CSV file
    batch_df.to_csv(f"wb_batch_{batch_id}.csv", index=False)
    

# Test pub_worm single shot

In [None]:
# Single shot
wormbase_id = "WBGene00000914"
wormbase_api = WormbaseAPI("field", "gene", "references")
ret_data = wormbase_api.get_wormbase_data(wormbase_id)
pretty_data = json.dumps(ret_data, indent=4)
with open('result.json', 'w') as file:
        file.write(pretty_data)
print(pretty_data)

# Wormbase get reference papers


In [4]:
import logging
import logging.config

try:
    logging.config.fileConfig('logging.config')
except Exception:
    logging.basicConfig(filename='pub_worm_reference_data.log', level=logging.DEBUG)

logger = logging.getLogger(__name__)
# Function to iterate a list of Wormcat items
def get_reference_data(batch_file_nm):
    
    out_file_nm = batch_file_nm.replace(".csv", "_out.csv")
    wormbase_df = pd.read_csv(batch_file_nm)

    log_msg = f"Starting get_reference_data with {len(wormbase_df):,} entries"
    logger.debug(log_msg)

    wormbase_api = WormbaseAPI("field", "gene", "references")

    concatenated_df = pd.DataFrame()
    dfs = []
    index=0
    number_of_rows=len(wormbase_df)
    for df_index, row in wormbase_df.iterrows():
        wormbase_id = row['Wormbase_Id']
        index +=1
        #print(f"{index:<4} of {len(transmembrane_transport_df)} {row['wormbase_id']}")
        ret_data = wormbase_api.get_wormbase_data(wormbase_id)
        if 'references_list' in ret_data:
            if isinstance(ret_data['references_list'], dict):
                references_df = pd.DataFrame(ret_data['references_list'], index=[0])
            else:
                references_df = pd.DataFrame(ret_data['references_list'])
            if 'wbp_abstract' in references_df.columns:
                references_df = references_df.drop(columns=['wbp_abstract'])
            references_df['wormbase_id']=wormbase_id
            dfs.append(references_df)
        else:
            print("-", end='')
            #print(f"Return has no references_list!\n{ret_data}")

        # Concatenate every 100 DataFrames
        # If something crashes we may be able to recover without a full rerun
        if index % 100 == 0:
            log_msg = f"{index:>4} of {number_of_rows} {wormbase_id}"
            logger.debug(log_msg)
            concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
            concatenated_df.to_csv(out_file_nm, index=False)
            dfs = []  # Reset the list for the next batch

    # Concatenate the remaining DataFrames
    if dfs:
        print("final")
        concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
        concatenated_df.to_csv(out_file_nm, index=False)
    return concatenated_df



In [5]:
concatenated_df = get_reference_data("./wb_batch_11.csv")
concatenated_df


-final


Unnamed: 0,wbp_id,wbp_type,wbp_title,wbp_journal,wbp_year,wbp_author,wormbase_id
0,WBPaper00066009,Journal article,azyx-1 is a new gene that overlaps with zyxin ...,PLoS Biol,2023,Parmar BS|Kieswetter A|Geens E|Vandewyer E|Lud...,WBGene00306133


In [9]:
#params.reference_papers_csv, params.number_of_groups, params.number_of_batches
import logging
import logging.config

try:
    logging.config.fileConfig('logging.config')
except Exception:
    logging.basicConfig(filename='pub_worm_reference_data.log', level=logging.DEBUG)

logger = logging.getLogger(__name__)

def create_reference_paper_batches(file_name, output_dir, number_of_batches):
    gene_ids_df = pd.read_csv(file_name)
    gene_ids_df.reset_index(drop=True, inplace=True)

    # Calculate the batch size
    batch_size = len(gene_ids_df) // number_of_batches

    # Create a batch ID column
    gene_ids_df = gene_ids_df.copy()
    gene_ids_df["batch_id"] = (gene_ids_df.index // batch_size) + 1

    # Iterate over unique batch IDs
    for batch_id in gene_ids_df["batch_id"].unique():
        # Filter DataFrame for the current batch ID
        batch_df = gene_ids_df[gene_ids_df["batch_id"] == batch_id].copy()
        # Drop the batch_id column as it's no longer needed for individual batch files
        batch_df.drop("batch_id", axis=1, inplace=True)
        # Write the batch to a CSV file
        batch_df.to_csv(f"wb_batch_{batch_id}.csv", index=False)



In [12]:
reference_papers_csv="./input_data/wormbase_db/wb_reference_papers.csv"
create_reference_paper_batches(reference_papers_csv, "./", 4)


# Wormbase Aggregate Reference data

In [7]:
import os
import pandas as pd

def aggregate_reference_data(dir_to_search):
    # Initialize an empty DataFrame to collect the data
    concatenated_df = pd.DataFrame()
    
    # Search for files matching the pattern
    for filename in os.listdir(dir_to_search):
        if filename.startswith('wb_') and filename.endswith('_out.csv'):
            file_path = os.path.join(dir_to_search, filename)
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            
            # Concatenate the DataFrame to the collector DataFrame
            concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)
    
    # Write the concatenated DataFrame to a new CSV file
    out_file_nm = 'wb_reference_papers.csv'
    concatenated_df.to_csv(out_file_nm, index=False)

# Call the function to aggregate the data
dir_to_search="/home/dan/Code/NextFlow/watson_nextflow/watson_jobs/get_reference_data/results/references.bk"
aggregate_reference_data(dir_to_search)

# Get Pub Med IDs

In [22]:
reference_papers_df = pd.read_csv("./input_data/wormbase_db/wb_reference_papers.csv")
reference_papers_df = reference_papers_df[reference_papers_df['wbp_type']=='Journal article']
len(reference_papers_df)

103651

# Gene Ontology Data

In [15]:
#!/usr/bin/env python3

from pub_worm.wormbase.wormbase_api import WormbaseAPI
import os
import sys
import datetime
import pandas as pd

import logging
import logging.config

try:
    logging.config.fileConfig('logging.config')
except Exception:
    logging.basicConfig(filename='pub_worm_reference_data.log', level=logging.DEBUG)

logger = logging.getLogger(__name__)

def ontology_json_to_dataframe(json_obj, wormbase_id, file_name=None):
    rows = []
    row = []
    for category, cat_lst in json_obj.items():
        #print(f"{category=}")
        #print(f"{cat_lst=}")
        row = [wormbase_id]
        if isinstance(cat_lst, dict):
            row.append(cat_lst['go_id'])
            row.append(category)
            row.append(cat_lst['go_term'])
            rows.append(row)
            row = [wormbase_id]
        else:
            for cat_lst_item in cat_lst:
                #print(f"{cat_lst_item=}")
                row.append(cat_lst_item['go_id'])
                row.append(category)
                row.append(cat_lst_item['go_term'])
                rows.append(row)
                row = [wormbase_id]

    df = pd.DataFrame(rows)
    df.columns=["wormbase_id", "go_id", "go_category", "go_term"]
    if file_name:
        df.to_csv(file_name, index=False)
    return df

def get_ontology_data(batch_file_nm):
    out_file_nm = batch_file_nm.replace(".csv", "_out.csv")
    wormbase_df = pd.read_csv(batch_file_nm)

    log_msg = f"Starting get_reference_data with {len(wormbase_df):,} entries"
    logger.debug(log_msg)
    
    wormbase_api = WormbaseAPI("field", "gene", "gene_ontology_summary")
    
    concatenated_df = pd.DataFrame()
    dfs = []
    index=0
    number_of_rows=len(wormbase_df)
    for df_index, row in wormbase_df.iterrows():
        wormbase_id = row['Wormbase_Id']
        index +=1
        #print(f"{index:<4} of {len(transmembrane_transport_df)} {row['wormbase_id']}")
        ret_data = wormbase_api.get_wormbase_data(wormbase_id)
        if 'gene_ontology_summary' in ret_data:
            df = ontology_json_to_dataframe(ret_data['gene_ontology_summary'], wormbase_id)
            print(df)
            dfs.append(df)

        # Concatenate every 100 DataFrames
        # If something crashes we may be able to recover without a full rerun
        if index % 100 == 0:
            print(f"{index:>4} of {number_of_rows} {wormbase_id}")
            concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
            concatenated_df.to_csv(out_file_nm, index=False)
            dfs = []  # Reset the list for the next batch

    # Concatenate the remaining DataFrames
    if dfs:
        print("Writing")
        concatenated_df = pd.concat([concatenated_df] + dfs, ignore_index=True)
        concatenated_df.to_csv(out_file_nm, index=False)
    return concatenated_df


   
    
    


In [16]:
concatenated_df = get_ontology_data("./wb_batch_11.csv")
concatenated_df



       wormbase_id       go_id         go_category  \
0   WBGene00000914  GO:0000981  Molecular_function   
1   WBGene00000914  GO:0000978  Molecular_function   
2   WBGene00000914  GO:0003677  Molecular_function   
3   WBGene00000914  GO:0003700  Molecular_function   
4   WBGene00000914  GO:0005634  Cellular_component   
5   WBGene00000914  GO:0050829  Biological_process   
6   WBGene00000914  GO:0006357  Biological_process   
7   WBGene00000914  GO:0060179  Biological_process   
8   WBGene00000914  GO:0010468  Biological_process   
9   WBGene00000914  GO:0045944  Biological_process   
10  WBGene00000914  GO:0030182  Biological_process   
11  WBGene00000914  GO:0006355  Biological_process   
12  WBGene00000914  GO:0042427  Biological_process   

                                              go_term  
0   DNA-binding transcription factor activity, RNA...  
1   RNA polymerase II cis-regulatory region sequen...  
2                                         DNA binding  
3           DNA-bin

Unnamed: 0,wormbase_id,go_id,go_category,go_term
0,WBGene00000914,GO:0000981,Molecular_function,"DNA-binding transcription factor activity, RNA..."
1,WBGene00000914,GO:0000978,Molecular_function,RNA polymerase II cis-regulatory region sequen...
2,WBGene00000914,GO:0003677,Molecular_function,DNA binding
3,WBGene00000914,GO:0003700,Molecular_function,DNA-binding transcription factor activity
4,WBGene00000914,GO:0005634,Cellular_component,nucleus
5,WBGene00000914,GO:0050829,Biological_process,defense response to Gram-negative bacterium
6,WBGene00000914,GO:0006357,Biological_process,regulation of transcription by RNA polymerase II
7,WBGene00000914,GO:0060179,Biological_process,male mating behavior
8,WBGene00000914,GO:0010468,Biological_process,regulation of gene expression
9,WBGene00000914,GO:0045944,Biological_process,positive regulation of transcription by RNA po...
