In [1]:
#mandatory imports
import time
from Bio import Entrez
import pandas as pd
import numpy as np
import spacy
import urllib.request, urllib.error, urllib.parse
import json
print("Done.")

Done.


In [2]:
## Thesaurus declaration
# Maps each canonical query term (the dict key) to the list of strings that count as a mention of it.
# find_comentions() tests `word.text in thes[term]`, i.e. exact, case-sensitive membership of a single
# token's text, which is why several capitalisation variants are spelled out explicitly.
# NOTE(review): multi-word entries ("gamma-aminobutyric acid", "Danio rerio", "MK 933", "L glutamate")
#   can only match if one token carries that exact text - presumably they never match; confirm against
#   the tokenizer in use.
# NOTE(review): "L glutamate" is listed twice under "Glutamate" (harmless for membership tests, but one
#   of them may have been meant to be a different spelling), and "IV" is a very short, ambiguous synonym
#   for Ivermectin - confirm both are intended.
thesaurus = {
    "Ivermectin" : ["IVM", "Ivermectin", "Avermectin", "avermectin", "ivermectin", "stromectol", "Stromectol", "Eqvalan", "Ivomec", "Mectizan", "Dihydroavermectin", "MK 933", "MK-933", "MK933", "C48H74O14", "IV", "IVM-654", "IVR-25", "IV-104", "IVE-11", "IVER-15"],
    "GABA" : ["GABA", "GABAergic", "gamma-aminobutyric acid"],
    "Zebrafish" : ["Zebrafish", "Danio rerio"],
    "COVID-19" : ["COVID-19", "COVID", "SARS-CoV-2"],
    "Glutamate" : ["Glutamate", "glutamate","Glu", "L-(+)-glutamate","L-Glu", "L-Glutamate", "L-glutamate", "L glutamate", "L glutamate"]
}
print("Done.")

Done.


In [3]:
def global_db_search(query_list, genes_list=None):
    """Run an Entrez EGQuery for every query and append the first two result rows to a CSV.

    Arguments:
        * query_list: Iterable of query strings. When genes are supplied, each query is used as a
          prefix and is concatenated directly with the gene name (no separator is inserted).
        * genes_list: Optional iterable of gene names. When non-empty, one query is issued for
          every (query, gene) pair as ``query + gene + "[tiab]"``. When omitted or empty, each
          entry of ``query_list`` is issued unchanged.
    Return Value: None. Results are appended (without header) to 'global_query_res.csv'
    through append_data().
    """
    genes = list(genes_list) if genes_list else []

    # Build the full list of search terms up front so both modes share one request loop.
    if genes:
        search_terms = [prefix + gene + "[tiab]" for prefix in query_list for gene in genes]
    else:
        search_terms = list(query_list)

    for term in search_terms:
        handle = Entrez.egquery(term=term)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the HTTP connection even when parsing fails.
            handle.close()

        # Only the first two rows of the global result are kept, tagged with the term that produced them.
        df = pd.DataFrame(record["eGQueryResult"]).head(2).assign(Query=term)
        append_data(df, 'global_query_res.csv', False)
        # Pause between requests to stay under the E-utilities request rate.
        time.sleep(0.34)
    return


## Function for reading in the df "summary" results
def read_in_results(file_name):
    """Read a saved "summary" CSV back into a DataFrame.

    The list-valued columns were written as the text form of a Python list, so each of them
    is converted back into a list of strings while loading; "Query_Count" is parsed as int.

    Arguments:
        * file_name: Path (or file-like object) handed to pandas.read_csv.
    Return Value: The loaded DataFrame.
    """

    def parse_list_cell(cell):
        # "['a', 'b']" -> "'a', 'b'" -> "a, b" -> ["a", "b"]
        without_brackets = cell.strip("[]")
        without_quotes = without_brackets.replace("'", "")
        return without_quotes.split(", ")

    list_columns = ("MainID_List", "P_Dates", "P_Years", "LinkedID_List")
    column_converters = {column: parse_list_cell for column in list_columns}
    column_converters["Query_Count"] = int

    return pd.read_csv(file_name, converters=column_converters)


def esummary_info(in_webenv_key, in_query_key, db_name):
    """Fetch document summaries for an ID set stored on the Entrez History server.

    Arguments:
        * in_webenv_key: WebEnv value returned by a previous esearch call.
        * in_query_key: QueryKey value returned by that same esearch call.
        * db_name: Entrez database name; "pubmed" is special-cased, any other value is
          handled like "pmc".
    Return Value: A tuple (publ_dates, publ_years, ids_list). For "pubmed" the ids_list holds
    the PMCID of each article (get_pmcids), otherwise it holds the PMID (get_pmids).
    """
    # Obtaining DocSums for a set of IDs that are stored on the Entrez History server.
    handle = Entrez.esummary(db=db_name, webenv=in_webenv_key, query_key=in_query_key)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()

    publ_dates, publ_years = get_published_dates(record)

    # The "linked" ID is the article's identifier in the *other* database.
    ids_list = get_pmcids(record) if db_name == "pubmed" else get_pmids(record)

    return publ_dates, publ_years, ids_list


def get_published_dates(esummary_rec):
    """Collect the raw publication date and the publication year of every summary record.

    Arguments:
        * esummary_rec: Iterable of summary records; each must expose a "PubDate" entry,
          which is often of the form '2021 Nov 26'.
    Return Value: A tuple (dates, years). `dates` holds the raw "PubDate" values; `years`
    holds, position for position, the year as an int, or NaN when no 4-digit token is found.
    """
    retr_dates = []
    retr_years = []
    for article in esummary_rec:
        pub_date = article["PubDate"]
        retr_dates.append(pub_date)
        # The year is looked up afresh for every article (a scan position shared across
        # articles would make every entry repeat the first article's year).
        retr_years.append(_extract_year(pub_date))

    return retr_dates, retr_years


def _extract_year(pub_date):
    """Return the first whitespace-separated 4-digit token of pub_date as int, else NaN."""
    for token in str(pub_date).split():
        if len(token) == 4 and token.isdecimal():
            return int(token)
    return np.nan


def get_pmcids(esummary_rec):
    """Return the PMCID of every summary record, or NaN where the record has none.

    Arguments:
        * esummary_rec: Iterable of summary records, each exposing an "ArticleIds" mapping.
    Return Value: A list with one entry per record, in the same order.
    """
    pmcids_list = []
    for article in esummary_rec:
        article_ids = article["ArticleIds"]
        # If "pmc" is there, then this article also has a PMCID (i.e., it's also found in the PubMed Central db)
        # np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
        pmcids_list.append(article_ids["pmc"] if "pmc" in article_ids else np.nan)

    return pmcids_list


def get_pmids(esummary_rec):
    """Return the PMID of every summary record, or NaN where the record has none.

    Arguments:
        * esummary_rec: Iterable of summary records, each exposing an "ArticleIds" mapping.
    Return Value: A list with one entry per record, in the same order.
    """
    pmids_list = []
    for article in esummary_rec:
        # '0' means that the article has no PMID (i.e., it's not found in the PubMed db);
        # a record without any "pmid" entry is treated the same way instead of raising KeyError.
        pmid = article["ArticleIds"].get("pmid", '0')
        # np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
        pmids_list.append(np.nan if pmid == '0' else pmid)

    return pmids_list


## Function that retrieves summary results from a given set of queries (which don't require a gene list)
def get_query_info_no_genes(query_in, db_name):
    """Run a single Entrez search and summarise its hits in a one-row DataFrame.

    Arguments:
        * query_in: The search term, passed to esearch unchanged.
        * db_name: Entrez database name (e.g. "pubmed" or "pmc").
    Return Value: A one-row DataFrame with columns 'Query', 'Db_Name', 'Query_Count',
    'MainID_List', 'P_Dates', 'P_Years', 'LinkedID_List', or None (after printing
    "No Results.") when the search returns no IDs. Callers must handle the None case.
    """
    # relevance: Records are sorted based on relevance to your search. (Relevance ranking)
    handle = Entrez.esearch(db=db_name, term=query_in, sort="relevance", retmax=5000, usehistory="y")
    try:
        search_results = Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()

    # NEED TO FIRST CHECK IF WE GOT ANY RESULTS FROM THAT QUERY
    if len(search_results["IdList"]) == 0:
        print("No Results.")
        return None

    # With search_results, we will use its WebEnv value and QueryKey value
    # NOTE(review): 'MainID_List' is capped by retmax=5000 while the summaries come from the
    # history server - confirm both lists always have the same length for large result sets.
    p_dates, p_years, ids_list = esummary_info(search_results["WebEnv"], search_results["QueryKey"], db_name)

    # Pause after the request pair to stay under the E-utilities request rate.
    time.sleep(0.34)

    return pd.DataFrame(
        [[query_in, db_name, search_results['Count'], search_results['IdList'], p_dates, p_years, ids_list]],
        columns=['Query', 'Db_Name', 'Query_Count', 'MainID_List', 'P_Dates', 'P_Years', 'LinkedID_List'],
    )


## Function that retrieves summary results from a given set of queries (which requires a gene list)
def get_query_info(query_in, genes, db_name):
    """Run one Entrez search per gene and summarise the hits, one DataFrame row per gene.

    Arguments:
        * query_in: Query prefix; each gene name is concatenated directly onto it (no
          separator is inserted).
        * genes: Iterable of gene names.
        * db_name: Entrez database name. For "pubmed" the Title/Abstract tag "[tiab]" is
          appended to every query; for any other database the query is used without a tag.
    Return Value: A DataFrame with columns 'Query', 'Db_Name', 'Query_Count', 'MainID_List',
    'P_Dates', 'P_Years', 'LinkedID_List'. Genes whose search returns no IDs contribute no
    row, so the frame is empty when nothing matched.
    """
    gene_query = []

    for gene in genes:
        # Example of db_name values in this use case: "pubmed" or "pmc"
        if db_name == "pubmed":
            # PubMed's Search field tag: Title/Abstract [tiab]
            query = query_in + gene + "[tiab]"
        else:
            query = query_in + gene

        # relevance: Records are sorted based on relevance to your search. (Relevance ranking)
        handle = Entrez.esearch(db=db_name, term=query, sort="relevance", retmax=5000, usehistory="y")
        try:
            search_results = Entrez.read(handle)
        finally:
            # Release the HTTP connection even when parsing fails.
            handle.close()

        # NEED TO FIRST CHECK IF WE GOT ANY RESULTS FROM THAT QUERY
        if len(search_results["IdList"]) != 0:
            # With search_results, we will use its WebEnv value and QueryKey value
            p_dates, p_years, ids_list = esummary_info(search_results["WebEnv"], search_results["QueryKey"], db_name)
            gene_query.append([query, db_name, search_results['Count'], search_results['IdList'], p_dates, p_years, ids_list])

        # Pause after every gene, including those without hits, so a run of empty searches
        # cannot exceed the E-utilities request rate.
        time.sleep(0.34)

    return pd.DataFrame(gene_query, columns=['Query', 'Db_Name', 'Query_Count', 'MainID_List', 'P_Dates', 'P_Years', 'LinkedID_List'])


## Function for obtaining citation counts for the set of IDs found in the "summary" df
def cited_cnt_table(df_summary, db_name):
    """Look up, via Entrez ELink, how many articles cite each ID of a "summary" DataFrame.

    Arguments:
        * df_summary: DataFrame with a "Query" column and a "MainID_List" column whose cells
          are lists of IDs.
        * db_name: Entrez database name. "pubmed" uses the "pubmed_pubmed_citedin" link; any
          other value is treated as "pmc" and uses "pmc_pmc_citedby".
    Return Value: A DataFrame with columns "Query", "Db_Name", "Id_List", "Citation_Cnts",
    one row per (query, ID) pair in input order.
    """
    # "pmc" is the other db_name in this use case
    link_name = "pubmed_pubmed_citedin" if db_name == "pubmed" else "pmc_pmc_citedby"

    elink_data = []
    for query_term, id_list in zip(df_summary["Query"], df_summary["MainID_List"]):
        # enumerate gives each ID's position directly; searching the list for the ID would be
        # quadratic and would report the wrong position for an ID that occurs twice.
        for position, id_num in enumerate(id_list, start=1):
            handle = Entrez.elink(id=id_num, dbfrom=db_name, db=db_name, linkname=link_name)
            try:
                record = Entrez.read(handle)
            finally:
                # Release the HTTP connection even when parsing fails.
                handle.close()

            link_sets = record[0]["LinkSetDb"]
            # 'LinkSetDb' key contains empty list when an article has no citation counts
            cited_counts = len(link_sets[0]["Link"]) if len(link_sets) != 0 else 0
            elink_data.append([query_term, db_name, id_num, cited_counts])

            # Pause after every third request to stay under the E-utilities request rate.
            if position % 3 == 0:
                time.sleep(0.34)

    return pd.DataFrame(elink_data, columns=["Query", "Db_Name", "Id_List", "Citation_Cnts"])


## Function that returns the Top-k results (pass in k as an argument to the function, input by the user)
def get_top_k(df, k_val, min_citations=25):
    """For every query, keep the first k IDs whose citation count reaches a threshold.

    IDs are taken in the order in which they appear in `df`; they are not re-sorted by
    citation count.

    Arguments:
        * df: DataFrame with "Query", "Id_List" and "Citation_Cnts" columns (the layout
          produced by cited_cnt_table()).
        * k_val: Maximum number of IDs kept per query. A value below 1 keeps every
          qualifying ID.
        * min_citations: Minimum citation count an ID needs to qualify (default 25).
    Return Value: A DataFrame with columns "Query", "Top_<k_val>_Ids", "Citation_Cnts", one
    row per query that has at least one qualifying ID. The two list-valued columns are
    aligned position for position.
    """
    q_top_k = []

    for q in df["Query"].unique():
        df_query = df[df["Query"] == q]
        qualifying = df_query[df_query["Citation_Cnts"] >= min_citations]
        if k_val > 0:
            qualifying = qualifying.head(k_val)
        if qualifying.empty:
            # Queries without any qualifying ID are left out of the result.
            continue

        matches_ids = [int(id_num) for id_num in qualifying["Id_List"]]
        counts = qualifying["Citation_Cnts"].tolist()
        q_top_k.append([q, matches_ids, counts])

    return pd.DataFrame(q_top_k, columns=["Query", "Top_" + str(k_val) + "_Ids", "Citation_Cnts"])


## Function that appends DataFrame rows to a CSV file
def append_data(df, file_name, is_new_file):
    """Write DataFrame rows to a CSV file, either starting the file or appending to it.

    Arguments:
        * df: The DataFrame whose rows are written (the index is never written).
        * file_name: Path of the CSV file.
        * is_new_file: When truthy the file is written from scratch with a header row;
          otherwise the rows are appended to the existing file without a header.
    Return Value: None.
    """
    write_options = {"index": False}
    if not is_new_file:
        # Existing CSV file: add rows at the end and do not repeat the header.
        write_options.update(mode='a', header=False)

    df.to_csv(file_name, **write_options)
    return

In [4]:
# Load in the model for English
nlp = spacy.load("en_core_web_sm")
# Can't retrieve XML if you don't have a query.
query = "GABA AND Glutamate"
# Contact address attached to the Entrez requests issued below.
# NOTE(review): the ".edu.com" suffix looks like a typo, and a personal address is hard-coded in the
# notebook - confirm the intended address and consider reading it from the environment instead.
Entrez.email = "n01365801@unf.edu.com"
# One-row summary DataFrame for the query; get_query_info_no_genes() returns None instead when the
# search has no hits, which the following cells do not check for.
df_q_pubmed = get_query_info_no_genes(query, "pubmed")
print("Done.")

Done.


In [5]:
def fetch_pubmed(ids):
    """
    Documentation for fetch_pubmed()

    This function's purpose is to use NCBI's E-Utils to get the full records of articles, given their IDs.
    The E-Util used is E-Fetch; one request is issued per ID.
    Future Work on this could be adjusting the argument, to allow for just a list of IDs, instead of a Pandas DataFrame slice.

    Arguments:
        * ids: An iterable of ID lists, e.g. the 'MainID_List' column of a summary DataFrame.
    Return Value: A list of the parsed records, one per ID, in input order.
                  An empty input yields an empty list.
    """
    records_pubmed = []
    # Fetch all records pertaining to our queries.
    for row in ids:
        for uid in row:
            handle = Entrez.efetch(db="pubmed", id=uid, retmode="xml")
            try:
                records_pubmed.append(Entrez.read(handle, validate=False))
            finally:
                # Close every handle, not just the last one, and do so even when parsing fails.
                handle.close()
    # NOTE(review): no pause is taken between requests here, unlike the search helpers in this
    # notebook - confirm this stays within the E-utilities request rate for large ID lists.
    return records_pubmed
print("Done.")

Done.


In [6]:
# fetch_pubmed() issues one efetch request per ID in 'MainID_List', so this cell's runtime grows
# with the number of search hits.
# NOTE(review): df_q_pubmed is None when the search returned no results, in which case the
# subscript below raises - confirm that case cannot occur for the queries used here.
records_pubmed = fetch_pubmed(df_q_pubmed['MainID_List'])
print("Done.")

Done.


In [7]:
#print(records_pubmed[0])

In [8]:
def invert_dict(dictionary):
    """
    Documentation for invert_dict()

    Inverts a dictionary: every value of the input becomes a key of the output, mapped to the
    list of input keys that carried that value (in the input's iteration order).

    Arguments:
        dictionary: The dictionary to be inverted. Its values must be hashable.
    Return Value: The inverted dictionary following the above design.
    """
    inverted = {}
    for key, value in dictionary.items():
        # Start a new list the first time a value is seen, then record the key under it.
        inverted.setdefault(value, []).append(key)

    return inverted
print("Done.")

Done.


In [9]:
def find_comentions(thes, doc):
    """
    Documentation for find_comentions()

    Walks every sentence of `doc` and records each place where a token belonging to one
    thesaurus term follows an earlier token belonging to a different term. For every such
    co-mention the token distance to the previous anchor token and the sentence text are stored.
    Matching is exact and case-sensitive on a token's `text`.

    Future Work for this function includes generalizing it to be able to handle both PubMed and PMC.
    This will likely require some work on reranking(), as it only handles PubMed formatted XML.

    Arguments:
        * thes: A dictionary containing the synonyms of query terms.
        * doc: A SpaCy Doc object that contains the text we are looking at.
    Return Value: A tuple in the form of (sentences, proximity list), where `sentences` is the
                  string form of the list of co-mention sentences (one entry per co-mention).
    """
    comention_sents = []
    distances = []

    for sent in doc.sents:
        anchor_term = None   # thesaurus key of the most recently matched term in this sentence
        anchor_index = 0     # token index that the next distance is measured from

        for token in sent:
            text = token.text

            # A repeat of the anchor term is ignored and does not move the anchor.
            if anchor_term is not None and text in thes[anchor_term]:
                continue

            # First thesaurus term (in dict order) that lists this token's text, if any.
            matched_term = next((term for term in thes.keys() if text in thes[term]), None)
            if matched_term is None:
                continue

            if anchor_term is not None:
                # A different term than the anchor: this is a co-mention.
                distances.append(int(token.i - anchor_index))
                comention_sents.append(str(sent))

            anchor_term = matched_term
            anchor_index = token.i

    return (str(comention_sents), distances)
print("Done.")

Done.


In [66]:
def reranking(records, query_terms, top_n=15):
    """
    Documentation for reranking()

    This function uses find_comentions() to score every record's abstract by how closely the
    query terms are co-mentioned, and returns the best-scoring articles, identified by PMID.
    A record's relevancy score is the sum of 1/distance over all co-mentions found in its
    abstract (0 when there are none).

    Relies on the module-level `thesaurus` and `nlp` objects.

    Future additions to this reranking function include using get_top_k() and cited_cnt_table()
    to add in the 25 citation requirement for credibility.

    Arguments:
        * records: A list of parsed XML records returned by E-Fetch (PubMed format).
        * query_terms: A list of query terms. Each must be a key of the thesaurus (KeyError otherwise).
        * top_n: Maximum number of articles returned (default 15).
    Return Value: The rankings for the records. Type is a Pandas DataFrame with columns
                  "PMID", "Title", "Abstract", "Relevancy Score", "Comention Sentences",
                  sorted by descending score; ties keep their input order.
    """
    # snippet from https://stackoverflow.com/questions/29216889/slicing-a-dictionary
    inner_thesaurus = {k: thesaurus[k] for k in query_terms}

    rows = []
    seen_pmids = set()
    num_processed = 0
    for position, record in enumerate(records, start=1):
        num_processed = position
        try:
            # The whole row is built before anything is stored, so a failure part-way through
            # a record can never leave the output columns misaligned.
            row = _score_record(record, inner_thesaurus)
        except Exception:
            # Best effort: report the malformed record and carry on with the rest.
            print("Record indexed at " + str(position) + " improperly formatted.")
            continue
        # Records without an abstract are skipped; a PMID is only ranked once (first occurrence).
        if row is None or row["PMID"] in seen_pmids:
            continue
        seen_pmids.add(row["PMID"])
        rows.append(row)
    print("Num records proccessed: " + str(num_processed))

    columns = ["PMID", "Title", "Abstract", "Relevancy Score", "Comention Sentences"]
    prerankings = pd.DataFrame(rows, columns=columns)

    # A stable sort keeps input order among equal scores, so the cut-off at top_n is deterministic.
    rankings = (
        prerankings
        .sort_values(by="Relevancy Score", ascending=False, kind="mergesort")
        .head(top_n)
        .reset_index(drop=True)
    )

    return rankings


def _score_record(record, inner_thesaurus):
    """Build the output row for one PubMed record, or return None when it has no abstract."""
    citation = record['PubmedArticle'][0]['MedlineCitation']
    pmid = str(citation['PMID'])
    article = citation['Article']
    if 'Abstract' not in article.keys():
        return None

    title = str(article['ArticleTitle'])
    abstract_text = _abstract_to_text(article['Abstract']['AbstractText'])

    # Process the abstract
    doc = nlp(abstract_text)
    (comention_sents, proximity_count) = find_comentions(doc=doc, thes=inner_thesaurus)

    return {
        "PMID": pmid,
        "Title": title,
        "Abstract": abstract_text,
        # sum() of no distances is 0, which is the score of an abstract without co-mentions.
        "Relevancy Score": sum(1 / count for count in proximity_count),
        # find_comentions() returns the text form of a list; drop its outer brackets for output.
        "Comention Sentences": str(comention_sents).strip("[]"),
    }


def _abstract_to_text(abstract_parts):
    """Join the sections of an abstract into plain text (no list brackets or quotes)."""
    if isinstance(abstract_parts, str):
        return str(abstract_parts)
    return " ".join(str(part) for part in abstract_parts)

# FIXME: Do NOT hardcode the query_terms, pull them in from the query.
# Each term listed here must be a key of `thesaurus`; reranking() looks them up there directly.
rankings = reranking(records_pubmed, ["GABA","Glutamate"])

Record indexed at 4838 improperly formatted.
Num records proccessed: 5000


In [67]:
print("Done.")
# A notebook cell only renders the value of its final expression, so the DataFrame goes last;
# placed before the print it was evaluated and silently discarded.
rankings

Done.


In [68]:
# CSV output
# The file is written relative to the notebook's working directory and is overwritten on re-run.
# Because `index=False` is not passed, the DataFrame index is written as the first, unnamed column.
rankings.to_csv("Glutamate and GABA3.csv")
print("Done.")

Done.
