# External Programs Integrated with Pub_worm

* Look at external programs and use pub_worm to solve problems
* Look at BioPython

In [None]:
# System level imports
import sys
import asyncio
import json
import inspect
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils import get_column_letter
from datetime import datetime, timedelta
import os

# Add pub_worm directory to the Python path
sys.path.insert(0, "/Users/dan/Code/Python/pub_worm")

from pub_worm.ncbi.entreze_api import EntrezAPI
from pub_worm.biorxiv.biorxiv_api import biorxiv_recent_posts_filtered

# Report which file EntrezAPI was loaded from, so it is clear whether the
# directory inserted into sys.path above is the copy actually in use.
module = inspect.getmodule(EntrezAPI)
if not hasattr(module, "__file__"):
    print("Could not determine the file path.")
else:
    file_path = module.__file__
    print("EntrezAPI imported from:", file_path)

# BioPython

In [None]:
import os
from Bio import Entrez
from bs4 import BeautifulSoup

# Identify this client to NCBI. The API key comes from the NCBI_API_KEY
# environment variable and is None when that variable is not set.
Entrez.api_key = api_key = os.environ.get('NCBI_API_KEY', None)
Entrez.email = "daniel.higgins@yahoo.com"

# Ask Entrez (through BioPython) for the list of databases it manages.
# The stream is closed in a finally block so it is released even if read() fails.
stream = Entrez.einfo()
try:
    result = stream.read()
finally:
    stream.close()

# The response is parsed as XML; the text of every <DbName> tag is collected.
soup = BeautifulSoup(result, "xml")
db_name_tags = soup.find_all('DbName')
db_names = [db_name_tag.get_text(strip=True) for db_name_tag in db_name_tags]
print(db_names)


In [None]:
# Use BioPython Entrez to get additional details on each NCBI database.
# NOTE: This seems very slow?? One einfo request is made per database.
for db_name in db_names:
    stream = Entrez.einfo(db=db_name)
    try:
        record = Entrez.read(stream)
    finally:
        # Release the connection before moving on to the next database.
        stream.close()
    print(record)
    print("="*40)


# Appendix

In [None]:
# Build a networkx graph 

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

def build_legend_map(edge_labels):
    """Assign each distinct edge label a 1-based legend number.

    Args:
        edge_labels: Mapping whose values are the edge labels. The keys are
            not used.

    Returns:
        dict mapping every distinct label to an int, numbered 1, 2, 3, ...
        in order of first appearance. Empty input gives an empty dict.
    """
    legend_map = {}
    for value in edge_labels.values():
        if value not in legend_map:
            # Numbers are handed out consecutively, so the next one is the
            # current size plus one. No sentinel entry or max() scan is
            # needed, and no label name is reserved.
            legend_map[value] = len(legend_map) + 1
    return legend_map

def plot_network_graph(df):
    """Draw the edge list in *df* as a labelled network diagram.

    Args:
        df: DataFrame with 'source', 'target' and 'edge' columns. Each row
            becomes one edge and its 'edge' value is drawn as the edge label.

    The graph is undirected (nx.Graph), positioned with a spring layout on a
    40x20 figure. The type of the edge-label mapping is printed before the
    labels are drawn, and the figure is shown with plt.show().
    """
    graph = nx.from_pandas_edgelist(
        df,
        source='source',
        target='target',
        edge_attr='edge',
        create_using=nx.Graph(),
    )

    # Large canvas so node labels stay readable.
    plt.figure(figsize=(40, 20))
    layout = nx.spring_layout(graph)  # positions for all nodes

    node_style = dict(
        with_labels=True,
        node_size=4000,
        node_color='skyblue',
        font_size=9,
        font_color='black',
        edge_color='gray',
        linewidths=0.5,
        arrows=False,
    )
    nx.draw(graph, layout, **node_style)

    # Edge labels are drawn verbatim from the 'edge' attribute.
    # NOTE: build_legend_map could replace them with short numbers plus a
    # legend; that variant is not wired in here.
    edge_labels = nx.get_edge_attributes(graph, 'edge')
    print(type(edge_labels))
    nx.draw_networkx_edge_labels(graph, layout, edge_labels=edge_labels)

    plt.show()


In [None]:
# Edge categories to keep in the plotted sub-network.
edge_names = [
    'Phenols',
    'Lactones',
    'Organic carbonic acids and derivatives',
    '6-oxopurines',
]

# Load the edge list, keep the rows whose 'edge' is one of edge_names,
# save that subset, then plot it.
df = pd.read_csv('output/test.csv')
selected_rows = df.loc[df['edge'].isin(edge_names)]
selected_rows.to_csv("output/test1.csv")
plot_network_graph(selected_rows)

In [None]:
import pandas as pd

# Read the table, flip rows and columns, and save the transposed copy.
# The original column names become the index, written under the
# 'motabolite' header (spelling kept as in the existing output file).
slim_metabolite_df = pd.read_csv("output/slim_metabolite.csv")
slim_metabolite_t_df = slim_metabolite_df.transpose()
slim_metabolite_t_df.to_csv(
    "output/slim_motabolite_t.csv",
    index_label='motabolite',
)
slim_metabolite_t_df

In [None]:
import pandas as pd


# One single-key dict per row of the transposed table:
# {row label as str: [the row's non-null values]}.
list_of_dicts = [
    {str(idx): row.dropna().tolist()}
    for idx, row in slim_metabolite_t_df.iterrows()
]

# Print the list of dictionaries
print(list_of_dicts)


In [None]:
# Edge labels that are skipped when linking entries in the loop below
# (presumably because they are too generic to be informative; confirm).
edges_to_ignore = [
    'Chemical entities',
    'Hydrocarbon derivatives',
    'Organic compounds',
    'Organic oxygen compounds',
    'Organooxygen compounds',
    'Organic oxides',
    'Organic acids and derivatives',
    'Organonitrogen compounds',
    'Organopnictogen compounds',
    'Organic nitrogen compounds',
]
def shares_edge(source_edge, target):
    """Tell whether *source_edge* is among the edges of *target*.

    Args:
        source_edge: Edge label to look for.
        target: Dict whose first key maps to the collection of edge labels
            to search. Only the first key is inspected.

    Returns:
        True if the label is present, otherwise False.
    """
    first_key = list(target)[0]
    return source_edge in target[first_key]


# Build one edge record for every pair of entries that share an edge label.
# Each element of list_of_dicts is a single-key dict {name: [edge labels]}.
# An entry is only compared with the entries after it, so each pair is
# visited once.
graph_list = []
for index, list_of_dict in enumerate(list_of_dicts):
    source_nm = list(list_of_dict.keys())[0]
    source_edges = list_of_dict[source_nm]
    targets = list_of_dicts[index+1:]
    for source_edge in source_edges:
        if source_edge in edges_to_ignore:
            continue
        for target in targets:
            target_nm = list(target.keys())[0]
            if shares_edge(source_edge, target):
                graph_list.append({'source': source_nm, 'target': target_nm, 'edge': source_edge})

# Name the columns explicitly so that an empty graph_list still yields a frame
# (and a CSV header) with 'source', 'target' and 'edge'. Without this, an empty
# result writes a header-less file that cannot be read back and filtered on 'edge'.
graph_df = pd.DataFrame(graph_list, columns=['source', 'target', 'edge'])
graph_df.to_csv("output/test.csv", index=False)

### Get full text for Journal Articles related to a Wormbase Id

In [None]:
from pub_worm.ncbi.entreze_api import EntrezAPI
from pub_worm.wormbase.wormbase_api import WormbaseAPI

async def get_pmid_for_wbpid(reference):
    """Look up WormBase paper data for one reference and merge it in.

    Queries WormbaseAPI("field", "paper", "pmid") with reference['wbp_id'] and
    returns a new dict containing the reference's keys overlaid with the keys
    of the lookup result. The input dict is not modified.

    NOTE(review): declared async but contains no await; presumably
    get_wormbase_data is a blocking call. Confirm.
    """
    paper_api = WormbaseAPI("field", "paper", "pmid")
    lookup_result = paper_api.get_wormbase_data(reference['wbp_id'])
    return {**reference, **lookup_result}


In [None]:
def update_unique_wbp_ids(references, unique_references):
    """Register references not seen before and return just those.

    Args:
        references: Iterable of reference dicts. Each needs 'wbp_id' and
            'wbp_title'; 'wbp_abstract' is optional and defaults to "".
        unique_references: Dict keyed by wbp_id holding the references seen
            so far. It is updated in place.

    Returns:
        Tuple ``(new_references, unique_references)``: the references whose
        wbp_id was not yet in ``unique_references`` (each wbp_id at most once,
        first occurrence wins), and the same dict that was passed in.

    Every new entry is stored as {'title', 'abstract', 'pmcid'} with 'pmcid'
    set to 0.
    """
    new_references = []
    for ref in references:
        wbp_id = ref["wbp_id"]
        # Test against the live dict, not a snapshot taken before the loop,
        # so a wbp_id repeated within `references` is only accepted once.
        if wbp_id in unique_references:
            continue
        unique_references[wbp_id] = {
            'title': ref['wbp_title'],
            'abstract': ref.get('wbp_abstract', ""),
            'pmcid': 0,
        }
        new_references.append(ref)

    return new_references, unique_references


In [None]:

async def get_references_wbid(wormbase_id, unique_references):
    """Collect the journal articles referenced by a WormBase gene and save their text.

    Steps:
      1. Fetch the gene's references from WormBase.
      2. Keep the journal articles not already present in ``unique_references``.
      3. Look up the PubMed id of each of them.
      4. Post the PubMed ids to NCBI Entrez, link them to PMC ids and fetch
         the articles.
      5. Write every fetched article to ``./output/PMC<pmcid>.txt``.

    Args:
        wormbase_id: Id passed to the WormBase gene "references" lookup.
        unique_references: Dict keyed by wbp_id. It is updated in place with
            newly seen references and their 'pmid' / 'pmcid' values.

    Returns:
        The same ``unique_references`` dict.
    """
    print(f"Processing {wormbase_id}")
    wormbase_api = WormbaseAPI("field", "gene", "references")

    # 1. Get all the references for the given wormbase_id
    wormbase_data = wormbase_api.get_wormbase_data(wormbase_id)

    # 2a. Collect only journal articles. A single reference arrives as a bare
    #     dict, so wrap it to always work with a list.
    references = wormbase_data['references_list']
    if isinstance(references, dict):
        references = [references]
    journal_articles = [ref for ref in references if ref['wbp_type'] == 'Journal article']

    # 2b. Collect only articles that we have not seen
    journal_articles, unique_references = update_unique_wbp_ids(journal_articles, unique_references)
    if not journal_articles:
        # Nothing new for this gene (e.g. it was already processed), so skip
        # the Entrez calls that would otherwise be made with an empty id list.
        return unique_references

    # 3a. Get the associated PubMed ids.
    #     NOTE(review): get_pmid_for_wbpid contains no await, so gather
    #     presumably runs these lookups one after another. Confirm.
    pmid_for_wbpid_list = await asyncio.gather(*[get_pmid_for_wbpid(ref) for ref in journal_articles])
    # 3b. Create a lookup table from pmid to wbp_id
    pmid_to_bwpid_lookup = {pmid_for_wbpid['pm_id']: pmid_for_wbpid['wbp_id'] for pmid_for_wbpid in pmid_for_wbpid_list}
    # 3c. Add PubMed ids to the unique_references
    for pmid_for_wbpid in pmid_for_wbpid_list:
        wbp_id = pmid_for_wbpid['wbp_id']
        unique_references[wbp_id]['pmid'] = pmid_for_wbpid['pm_id']

    # 4. Extract the PubMed ids into a list
    pmid_list = [pmid_for_wbpid['pm_id'] for pmid_for_wbpid in pmid_for_wbpid_list]

    # 5. Post the list to NCBI Entrez
    ncbi_api = EntrezAPI()
    entreze_epost_result = ncbi_api.entreze_epost(pmid_list)

    # 6. Fetch the full articles (only when the post returned a 'WebEnv')
    if 'WebEnv' in entreze_epost_result:
        # 6a. Link PubMed ids to PMC ids
        elink_result = ncbi_api.entreze_elink_pmid_to_pmcid(entreze_epost_result)
        params = {'db': 'pmc'}
        # 6b. Post the PMC ids
        epost_result = ncbi_api.entreze_epost(elink_result, params)
        # 6c. Fetch the articles based on the PMC ids
        efetch_result = ncbi_api.entreze_efetch(epost_result)

        # 6d. Write the content of each paper to a file
        for article in efetch_result['articles']:
            content = f"Title:{article['title']}\n"
            content += f"Abstract: {article['abstract']}\n"
            content += f"Content: {article['body']}\n"
            file_nm = f"./output/PMC{article['pmcid']}.txt"
            # Explicit encoding so non-ASCII article text is written the same
            # way on every platform instead of depending on the locale default.
            with open(file_nm, 'w', encoding='utf-8') as file:
                file.write(content)
            wbp_id = pmid_to_bwpid_lookup[article['pmid']]
            unique_references[wbp_id]['pmcid'] = article['pmcid']

    return unique_references

def pmc_not_found(unique_references):
    """Write a title/abstract-only file for every reference without a PMC id.

    Args:
        unique_references: Dict whose values are dicts with 'title',
            'abstract', 'pmcid' and 'pmid'. Entries whose 'pmcid' is still 0
            are the ones written out.

    For each such entry a file ``./output/PM<pmid>.txt`` is written with the
    title, the abstract and an empty "Content:" line. Returns None.
    """
    for unique_reference in unique_references.values():
        if unique_reference['pmcid'] != 0:
            continue
        content = (
            f"Title:{unique_reference['title']}\n"
            f"Abstract: {unique_reference['abstract']}\n"
            "Content: \n"
        )
        file_nm = f"./output/PM{unique_reference['pmid']}.txt"
        # Explicit encoding so non-ASCII titles/abstracts are written the same
        # way on every platform instead of depending on the locale default.
        with open(file_nm, 'w', encoding='utf-8') as file:
            file.write(content)
            
            



In [None]:
#gene_set = ["WBGene00008850","WBGene00001463"]
gene_set = ["WBGene00016064", "WBGene00001463", "WBGene00001452", "WBGene00002048", "WBGene00003750", "WBGene00006575", "WBGene00006783",
            "WBGene00019327", "WBGene00008850", "WBGene00019967", "WBGene00001452", "WBGene00001752", "WBGene00002048", "WBGene00003640", 
            "WBGene00007867", "WBGene00007875", "WBGene00008010", "WBGene00008584", "WBGene00008681", "WBGene00009429", "WBGene00016596", 
            "WBGene00019619", "WBGene00010290", "WBGene00000399", "WBGene00001430", "WBGene00010493", "WBGene00004512", "WBGene00004513", 
            "WBGene00004622"]
unique_references = {}
for wormbase_id in gene_set:
    unique_references = await get_references_wbid(wormbase_id, unique_references)

pmc_not_found(unique_references)

print(json.dumps(unique_references, indent=4))
with open("./output/unique_references.json", 'w') as file:
    json.dump(unique_references, file, indent=4)
    

In [None]:
# Stand-alone run of the PubMed -> PMC fetch pipeline for two PubMed ids.
data = ["35036864", "32292107"]

ncbi_api = EntrezAPI()
entreze_epost_result = ncbi_api.entreze_epost(data)

# Continue only when the post returned a 'WebEnv'.
if 'WebEnv' in entreze_epost_result:
    # Link the posted PubMed ids to PMC ids, post those to the pmc database,
    # then fetch the articles.
    elink_result = ncbi_api.entreze_elink_pmid_to_pmcid(entreze_epost_result)
    params = dict(db='pmc')
    epost_result = ncbi_api.entreze_epost(elink_result, params)
    efetch_result = ncbi_api.entreze_efetch(epost_result)

    pretty_data = json.dumps(efetch_result, indent=4)
    print(pretty_data)



## Other

In [None]:
# List the filtered recent bioRxiv posts: date, title and DOI, followed by a
# blank line after each article.
articles = biorxiv_recent_posts_filtered()
for article in articles:
    print(
        f"Archive Date: {article['date']}",
        f"Title:{article['title']}",
        article['doi'],
        "",
        sep="\n",
    )



In [None]:
import requests
import xml.etree.ElementTree as ET

# URL of the bioRxiv XML feed fetched below. The query string asks for
# subject=all (presumably every subject category; confirm against the feed docs).
url = "https://connect.biorxiv.org/biorxiv_xml.php?subject=all"

# Function to fetch and parse the XML
def fetch_biorxiv_data(url, timeout=30):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on an unresponsive
            server.

    Returns:
        The response content as bytes.

    Raises:
        Exception: If the HTTP status code is not 200.
        requests.exceptions.RequestException: If the request itself fails,
            including on timeout.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data: {response.status_code}")
    return response.content
        
# Download the feed, parse it and print every <item> element object,
# then dump the raw payload for inspection.
biorxiv_data = fetch_biorxiv_data(url)
root = ET.fromstring(biorxiv_data)

items = root.findall('.//item')
for item in items:
    print(item)

print(biorxiv_data)

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the bioRxiv XML feed that main() downloads.
url = "https://connect.biorxiv.org/biorxiv_xml.php?subject=all"

# Keywords to search for in titles and descriptions. An article is kept when
# any one of them matches; matching ignores case (see contains_keywords).
keywords = ["caenorhabditis", "elegans"]

# Function to fetch the XML data
def fetch_biorxiv_data(url, timeout=30):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on an unresponsive
            server.

    Returns:
        The response content as bytes.

    Raises:
        Exception: If the HTTP status code is not 200.
        requests.exceptions.RequestException: If the request itself fails,
            including on timeout.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data: {response.status_code}")
    return response.content

# Function to search for keywords in text
def contains_keywords(text, keywords):
    """Return True if any of *keywords* occurs in *text*, ignoring case.

    Args:
        text: String to search.
        keywords: Iterable of strings to look for as substrings.

    Returns:
        True when at least one keyword is found, False otherwise (including
        when *keywords* is empty).
    """
    # Lower-case the text once instead of once per keyword.
    lowered_text = text.lower()
    return any(keyword.lower() in lowered_text for keyword in keywords)

# Function to parse the XML using BeautifulSoup and extract relevant articles
def parse_biorxiv_xml(xml_data, keywords):
    """Extract the feed items whose title or description mentions a keyword.

    Args:
        xml_data: XML document to parse with BeautifulSoup's "xml" parser.
        keywords: Keywords handed to contains_keywords for the match.

    Returns:
        List of dicts with 'title', 'date' (text of <dc:date>) and 'doi'
        ("https://doi.org/" followed by the text of <dc:identifier>), one per
        matching <item>, in document order.
    """
    def tag_text(item, tag_name):
        # Stripped text of the first matching child tag.
        return item.find(tag_name).get_text(strip=True)

    matches = []
    for item in BeautifulSoup(xml_data, "xml").find_all('item'):
        title = tag_text(item, 'title')
        description = tag_text(item, 'description')
        dc_date = tag_text(item, 'dc:date')
        dc_identifier = tag_text(item, 'dc:identifier')

        # Skip items where neither the title nor the description matches.
        if not (contains_keywords(title, keywords) or contains_keywords(description, keywords)):
            continue

        matches.append({
            'title': title,
            'date': dc_date,
            'doi': f"https://doi.org/{dc_identifier}"
        })

    return matches

# Main function to fetch, parse, and filter data
def main():
    """Fetch the bioRxiv feed, filter it by keyword and print the result.

    Uses the module-level ``url`` and ``keywords``. Prints the list of
    matching articles, or a message when there are none.
    """
    articles = parse_biorxiv_xml(fetch_biorxiv_data(url), keywords)

    if not articles:
        print("No articles found with the given keywords.")
        return

    print(articles)


if __name__ == "__main__":
    main()