# Explore the iCite API for gathering Publication data
2023-11-22 ZD  

Relevant Jira Ticket: [INS-790](https://tracker.nci.nih.gov/browse/INS-790)  

Exploratory notebook to investigate gathering Publications data for INS from [the iCite API](https://icite.od.nih.gov/api). This wiil build upon the work to in `notebooks/07_gather_publications.ipynb` and `modules/gather_publication_data.py`.  

The primary goal is to gather metrics specific to iCite: Citation Count and Relative Citation Ratio. The secondary goal is to explore whether the iCite API could replace the Biopython Entrez PubMed API, which is very slow. 

In [15]:
# Method to import from parent directory
import os
import sys
root_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(root_dir)

import requests
import pandas as pd
from tqdm import tqdm

# Get all existing publication functions
import modules.gather_publication_data as gpub

In [5]:
# Test imported functions
gpub.get_pmids_from_nih_reporter_api('R01CA263500', print_meta=True)

R01CA263500: (1/1): {'search_id': None, 'total': 7, 'offset': 0, 'limit': 500, 'sort_field': 'core_project_nums', 'sort_order': 'desc', 'sorted_by_relevance': False, 'properties': {}}


[{'coreproject': 'R01CA263500', 'pmid': 37138086, 'applid': 10679077},
 {'coreproject': 'R01CA263500', 'pmid': 36288726, 'applid': 10679077},
 {'coreproject': 'R01CA263500', 'pmid': 36917953, 'applid': 10679077},
 {'coreproject': 'R01CA263500', 'pmid': 36734849, 'applid': 10679077},
 {'coreproject': 'R01CA263500', 'pmid': 37059069, 'applid': 10679077},
 {'coreproject': 'R01CA263500', 'pmid': 35130560, 'applid': 10679077},
 {'coreproject': 'R01CA263500', 'pmid': 37024595, 'applid': 10679077}]

In [11]:
gpub.get_publication_info_from_pmid('37138086)')

{'publication_id': '37138086)',
 'title': 'Glioblastoma remodelling of human neural circuits decreases survival.',
 'authors': 'Saritha Krishna, Abrar Choudhury, Michael B Keough, Kyounghee Seo, Lijun Ni, Sofia Kakaizada, Anthony Lee, Alexander Aabedi, Galina Popova, Benjamin Lipkin, Caroline Cao, Cesar Nava Gonzales, Rasika Sudharshan, Andrew Egladyous, Nyle Almeida, Yalan Zhang, Annette M Molinaro, Humsa S Venkatesh, Andy G S Daniel, Kiarash Shamardani, Jeanette Hyer, Edward F Chang, Anne Findlay, Joanna J Phillips, Srikantan Nagarajan, David R Raleigh, David Brang, Michelle Monje, Shawn L Hervey-Jumper',
 'publication_year': '2023'}

In [7]:
# Basic iCite python example

response = requests.get(
    "/".join([
        "https://icite.od.nih.gov/api",
        "pubs",
        "23456789",
    ]),
)
pub = response.json()
print(pub)

{'pmid': 23456789, 'year': 2013, 'title': 'Hospital volume is associated with survival but not multimodality therapy in Medicare patients with advanced head and neck cancer.', 'authors': 'Arun Sharma, Stephen M Schwartz, Eduardo Méndez', 'journal': 'Cancer', 'is_research_article': 'Yes', 'relative_citation_ratio': 1.77, 'nih_percentile': 70.8, 'human': 1.0, 'animal': 0.0, 'molecular_cellular': 0.0, 'apt': 0.75, 'is_clinical': 'No', 'citation_count': 45, 'citations_per_year': 4.5, 'expected_citations_per_year': 2.547166821310601, 'field_citation_rate': 5.361749145554551, 'provisional': 'No', 'x_coord': 0.0, 'y_coord': 1.0, 'cited_by_clin': [25488965, 29180076], 'cited_by': [30186960, 34399637, 30220318, 37564472, 34795020, 28606602, 24123512, 36746098, 29100787, 26777060, 26553389, 25488965, 30194691, 35792549, 33556919, 27061951, 24706437, 29794540, 25042524, 28079775, 35547406, 32600116, 24488549, 31334365, 30409307, 35868508, 26868285, 29079897, 33449369, 32191271, 30698823, 25681489

### TRy a one-to-one replacement of the Entrez `get_publicatino_info_from_pmid` function

In [13]:
def get_publication_info_from_pmid_icite(pmid):
    """
    Get publication information for a given PMID using the iCite API.

    :param pmid: PubMed ID (str)
    :return: Dictionary containing publication information
    """
    try:
        # Use the iCite API to get publication data
        response = requests.get(f"https://icite.od.nih.gov/api/pubs/{pmid}")
        pub = response.json()

        # Extract relevant information
        publication_info = {
            'publication_id': pub.get('pmid', ''),
            'title': pub.get('title', ''),
            'authors': pub.get('authors', ''),
            'publication_year': pub.get('year', ''),
            'doi':pub.get('doi', ''),
            'citation_count': pub.get('citation_count', ''),
            'relative_citation_ratio': pub.get('relative_citation_ratio', ''),
        }

        return publication_info

    except Exception as e:
        # Use tqdm.write() instead of print() for long processes
        tqdm.write(f"Error fetching information for PMID {pmid} from iCite API: {e}")
        #print(f"Error fetching information for PMID {pmid} from iCite API: {e}")
        return None


In [28]:
get_publication_info_from_pmid_icite('37138086')

{'publication_id': 37138086,
 'title': 'Glioblastoma remodelling of human neural circuits decreases survival.',
 'authors': 'Saritha Krishna, Abrar Choudhury, Michael B Keough, Kyounghee Seo, Lijun Ni, Sofia Kakaizada, Anthony Lee, Alexander Aabedi, Galina Popova, Benjamin Lipkin, Caroline Cao, Cesar Nava Gonzales, Rasika Sudharshan, Andrew Egladyous, Nyle Almeida, Yalan Zhang, Annette M Molinaro, Humsa S Venkatesh, Andy G S Daniel, Kiarash Shamardani, Jeanette Hyer, Edward F Chang, Anne Findlay, Joanna J Phillips, Srikantan Nagarajan, David R Raleigh, David Brang, Michelle Monje, Shawn L Hervey-Jumper',
 'publication_year': 2023,
 'doi': '10.1038/s41586-023-06036-1',
 'citation_count': 21,
 'relative_citation_ratio': 10.5}

In [18]:
# Checkpoint loading instead of regathering data during development
pmid_filename = 'gathered_pmids_20231110.csv'
df_pmid = pd.read_csv(pmid_filename)

In [22]:
# Iterate through each unique PMID with tqdm progress bar
def get_pub_info_test_loop(df_pmid):

    df_pmid_info = pd.DataFrame()

    for pmid in tqdm(df_pmid['pmid'].unique(), 
                    #total=remaining_pmid_count, 
                    ncols=80):
        try:
            # Use PubMed API to get publication data
            publication_info = get_publication_info_from_pmid_icite(pmid)

            if publication_info:
                # Combine the information with the original DataFrame
                df_current = pd.DataFrame({
                    'pmid': pmid,
                    'title': publication_info['title'],
                    'authors': publication_info['authors'],
                    'publication_year': publication_info['publication_year'],
                    'doi': publication_info['doi'],
                    'citation_count': publication_info['citation_count'],
                    'relative_citation_ratio': publication_info['relative_citation_ratio']
                }, index=[0])

                # Add the current DataFrame to df_pmid_info
                df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)

        except Exception as e:
            print(f"Error processing PMID {pmid}: {e}")
            # Fill in fields with NaN if not available
            df_current = pd.DataFrame({
                'pmid': pmid,
                'title': pd.NA,
                'authors': pd.NA,
                'publication_year': pd.NA,
                'doi': pd.NA,
                'citation_count': pd.NA,
                'relative_citation_ratio': pd.NA
            }, index=[0])

            # Add the current DataFrame to df_pmid_info
            df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)

    return df_pmid_info

In [26]:
df_pmid_info_icite = get_pub_info_test_loop(df_pmid.head(1000))

  df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)
100%|█████████████████████████████████████████| 914/914 [13:41<00:00,  1.11it/s]


In [27]:
df_pmid_info_icite

Unnamed: 0,pmid,title,authors,publication_year,doi,citation_count,relative_citation_ratio
0,36127808,"Genetic ancestry, differential gene expression...","Freddy A Barragan, Lauren J Mills, Andrew R Ra...",2023,10.1002/cam4.5266,4,
1,29074302,Endogenous antibody responses to mucin 1 in a ...,"Janardan P Pandey, Aryan M Namboodiri, Bethany...",2018,10.1016/j.imbio.2017.10.028,2,0.09
2,31387361,Defects in the Exocyst-Cilia Machinery Cause B...,"Diana Fulmer, Katelynn Toomer, Lilong Guo, Kel...",2019,10.1161/CIRCULATIONAHA.119.038376,34,1.95
3,29027980,The Plasticizer Bisphenol A Perturbs the Hepat...,"Ludivine Renaud, Willian A da Silveira, E Star...",2017,10.3390/genes8100269,20,1.17
4,29309429,ShinyGPA: An interactive visualization toolkit...,"Emma Kortemeier, Paula S Ramos, Kelly J Hunt, ...",2018,10.1371/journal.pone.0190949,2,0.08
...,...,...,...,...,...,...,...
909,32276990,Mesenchymal and MAPK Expression Signatures Ass...,"Josh Lewis Stern, Grace Hibshman, Kevin Hu, Sa...",2020,10.1158/1541-7786.MCR-19-1244,17,1.13
910,32525984,Tissue- and development-stage-specific mRNA an...,"Anshuman Panda, Anupama Yadav, Huwate Yeerna, ...",2020,10.1093/nar/gkaa485,12,0.76
911,36371231,Differential regulation of TNFα and IL-6 expre...,"Ida Deichaite, Timothy J Sears, Leisa Sutton, ...",2022,10.1186/s12967-022-03731-x,2,0.57
912,37682073,Transcriptional subtypes of glottic cancer cha...,"Bharat A Panuganti, Christine Carico, Harishan...",2023,10.1002/hed.27514,0,


### Try processing in batches for fewer API calls
The above approach works but is even slower per iteration than the Entrez API (~15min for 1000). Try a batching approach where multiple PMIDs are sent in a single call.

In [37]:
def get_publication_info_from_pmid_icite_batch(pmids):
    """
    Get publication information for a list of PMIDs using the iCite API.

    :param pmids: List of PubMed IDs (str)
    :return: DataFrame containing publication information
    """
    try:
        # Join PMIDs into a comma-separated string
        pmid_str = ','.join(pmids)
        
        # Use the iCite API to get publication data for all PMIDs
        response = requests.get(f"https://icite.od.nih.gov/api/pubs?pmids={pmid_str}")
        pubs = response.json()

        # Initialize an empty list to store publication information for each PMID
        publication_info_list = []

        for pub in pubs:
            # Extract relevant information
            publication_info = {
                'pmid': pub.get('pmid', ''),
                'title': pub.get('title', ''),
                'authors': pub.get('authors', ''),
                'publication_year': pub.get('year', ''),
                'doi': pub.get('doi', ''),
                'citation_count': pub.get('citation_count', ''),
                'relative_citation_ratio': pub.get('relative_citation_ratio', ''),
            }
            
            # Add data to running list
            publication_info_list.append(publication_info)

        return pd.DataFrame(publication_info_list)

    except Exception as e:
        print(f"Error fetching information for PMIDs {pmids} from iCite API: {e}")
        return pd.DataFrame()

In [39]:
def get_pub_info_batched(df_pmid, batch_size=10):
    df_pmid_info = pd.DataFrame()

    # Extract unique PMIDs
    unique_pmids = df_pmid['pmid'].unique()

    # Split PMIDs into batches
    pmid_batches = [
        unique_pmids[i : i + batch_size].astype(str) for i in range(0, len(unique_pmids), batch_size)
    ]

    for batch in tqdm(pmid_batches, ncols=80):
        try:
            # Use iCite API to get publication data for the batch
            batch_info = [get_publication_info_from_pmid_icite(pmid) for pmid in batch]

            # Filter out None results (failed API calls)
            batch_info = [info for info in batch_info if info is not None]

            # Combine the information with the original DataFrame
            df_current = pd.DataFrame(batch_info)

            # Add the current DataFrame to df_pmid_info
            df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)

        except Exception as e:
            print(f"Error processing batch of PMIDs: {e}")

    return df_pmid_info

In [42]:
# No batching
test = get_pub_info_test_loop(df_pmid.head(100))

  df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)
100%|█████████████████████████████████████████| 100/100 [01:22<00:00,  1.22it/s]


In [40]:
# Batch size default 10
test = get_pub_info_batched(df_pmid.head(100))

100%|███████████████████████████████████████████| 10/10 [01:17<00:00,  7.77s/it]


#### Compare timing of gathering iCite PMID info in single vs batched calls
Table gathers data from cells below

| API    | PMIDs | Batch Size| Time (mm:ss)| Rate (s/pmid) |
| ------:| -----:| ---------:| -----------:| -------------:|
| Entrez | 100   | None      | 00:34        | 0.34   |
| iCite  | 100   | None      | 01:22        | 0.82   |
| iCite  | 100   | 1         | 01:18        | 0.78   |
| iCite  | 100   | 5         | 01:18        | 0.78   |
| iCite  | 100   | 10        | 01:18        | 0.78   |
| iCite  | 100   | 50        | 01:19        | 0.79   |
| Entrez | 500   | None      | 02:08        | 0.26   |
| iCite  | 500   | None      | 07:09        | 0.86   |
| iCite  | 500   | 100       | 06:52        | 0.82   |



#### Summary
1. The Entrez API is 3-4 times faster than the iCite API
2. Batching the iCite API calls does not significantly improve performance

In [44]:
# 500 pmids, no batching
test = get_pub_info_test_loop(df_pmid.head(500))

  df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)
 98%|████████████████████████████████████████▎| 486/494 [06:45<00:06,  1.30it/s]

In [43]:
# 500 pmids, batch size 100
test = get_pub_info_batched(df_pmid.head(500), batch_size=100)

100%|█████████████████████████████████████████████| 5/5 [07:09<00:00, 85.85s/it]


In [47]:
# 100 pmids, batch size 5
test = get_pub_info_batched(df_pmid.head(100), batch_size=5)

100%|███████████████████████████████████████████| 20/20 [01:18<00:00,  3.92s/it]


In [48]:
# 100 pmids, batch size 50
test = get_pub_info_batched(df_pmid.head(100), batch_size=50)

100%|█████████████████████████████████████████████| 2/2 [01:18<00:00, 39.35s/it]


In [49]:
# 100 pmids, batch size 1
test = get_pub_info_batched(df_pmid.head(100), batch_size=1)

  df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)
100%|█████████████████████████████████████████| 100/100 [01:18<00:00,  1.27it/s]


In [50]:
# Iterate through each unique PMID with tqdm progress bar
def get_pub_info_test_loop_entrez(df_pmid):

    df_pmid_info = pd.DataFrame()

    for pmid in tqdm(df_pmid['pmid'].unique(), 
                    #total=remaining_pmid_count, 
                    ncols=80):
        try:
            # Use PubMed API to get publication data
            publication_info = gpub.get_publication_info_from_pmid(pmid)

            if publication_info:
                # Combine the information with the original DataFrame
                df_current = pd.DataFrame({
                    'pmid': pmid,
                    'title': publication_info['title'],
                    'authors': publication_info['authors'],
                    'publication_year': publication_info['publication_year'],
                    # 'doi': publication_info['doi'],
                    # 'citation_count': publication_info['citation_count'],
                    # 'relative_citation_ratio': publication_info['relative_citation_ratio']
                }, index=[0])

                # Add the current DataFrame to df_pmid_info
                df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)

        except Exception as e:
            print(f"Error processing PMID {pmid}: {e}")
            # Fill in fields with NaN if not available
            df_current = pd.DataFrame({
                'pmid': pmid,
                'title': pd.NA,
                'authors': pd.NA,
                'publication_year': pd.NA,
                # 'doi': pd.NA,
                # 'citation_count': pd.NA,
                # 'relative_citation_ratio': pd.NA
            }, index=[0])

            # Add the current DataFrame to df_pmid_info
            df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)

    return df_pmid_info

In [51]:
# 100 pmids, Entrez method
test = get_pub_info_test_loop_entrez(df_pmid.head(100))

100%|█████████████████████████████████████████| 100/100 [00:34<00:00,  2.89it/s]


In [53]:
# 100 pmids, Entrez method
test = get_pub_info_test_loop_entrez(df_pmid.head(500))

 89%|████████████████████████████████████▋    | 442/494 [01:52<00:10,  4.75it/s]

Error fetching information for PMID 33579955: list index out of range


 95%|███████████████████████████████████████  | 470/494 [02:02<00:04,  4.98it/s]

Error fetching information for PMID 33574288: list index out of range


100%|█████████████████████████████████████████| 494/494 [02:08<00:00,  3.86it/s]


#### See if pulling fewer fields from iCite is faster

In [61]:
def get_publication_info_from_pmid_icite(pmid, fields='all'):
    """
    Get publication information for a given PMID using the iCite API.

    :param pmid: PubMed ID (str)
    :return: Dictionary containing publication information
    """
    try:
        # Use the iCite API to get publication data
        if fields == ['all']:
            response = requests.get(f"https://icite.od.nih.gov/api/pubs/{pmid}")
        
        # If a list of fields is provided, include only those in the response
        else:
            field_str = ','.join(fields)
            response = requests.get(f"https://icite.od.nih.gov/api/pubs/{pmid}"
                                    f"&fl={field_str}")
        pub = response.json()

        # Extract relevant information
        publication_info = {
            'publication_id': pub.get('pmid', ''),
            # 'title': pub.get('title', ''),
            # 'authors': pub.get('authors', ''),
            # 'publication_year': pub.get('year', ''),
            'doi':pub.get('doi', ''),
            'citation_count': pub.get('citation_count', ''),
            'relative_citation_ratio': pub.get('relative_citation_ratio', ''),
        }

        return publication_info

    except Exception as e:
        # Use tqdm.write() instead of print() for long processes
        tqdm.write(f"Error fetching information for PMID {pmid} from iCite API: {e}")
        #print(f"Error fetching information for PMID {pmid} from iCite API: {e}")
        return None

# Iterate through each unique PMID with tqdm progress bar
def get_pub_info_test_loop(df_pmid, fields='all'):

    df_pmid_info = pd.DataFrame()

    for pmid in tqdm(df_pmid['pmid'].unique(), 
                    #total=remaining_pmid_count, 
                    ncols=80):
        try:
            # Use PubMed API to get publication data
            publication_info = get_publication_info_from_pmid_icite(pmid, fields)

            if publication_info:
                # Combine the information with the original DataFrame
                df_current = pd.DataFrame({
                    'pmid': pmid,
                    # 'title': publication_info['title'],
                    # 'authors': publication_info['authors'],
                    # 'publication_year': publication_info['publication_year'],
                    'doi': publication_info['doi'],
                    'citation_count': publication_info['citation_count'],
                    'relative_citation_ratio': publication_info['relative_citation_ratio']
                }, index=[0])

                # Add the current DataFrame to df_pmid_info
                df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)

        except Exception as e:
            print(f"Error processing PMID {pmid}: {e}")
            # Fill in fields with NaN if not available
            df_current = pd.DataFrame({
                'pmid': pmid,
                # 'title': pd.NA,
                # 'authors': pd.NA,
                # 'publication_year': pd.NA,
                'doi': pd.NA,
                'citation_count': pd.NA,
                'relative_citation_ratio': pd.NA
            }, index=[0])

            # Add the current DataFrame to df_pmid_info
            df_pmid_info = pd.concat([df_pmid_info, df_current], ignore_index=True)

    return df_pmid_info

In [62]:
test = get_pub_info_test_loop(df_pmid.head(100), fields=['pmid', 'citation_count', 'doi', 'relative_citation_ratio'])

100%|█████████████████████████████████████████| 100/100 [01:19<00:00,  1.26it/s]


No notable change in runtime. 1:19m vs 1:22m to complete 100 pmids, gathering either a few or all fields