# Revise Publications
2023-12-06 ZD  

Testing small changes to publication process before finalizing the workflow. 

Part 1: Setup  
Part 2: Fix Medline Publication Date: [INS-806](https://tracker.nci.nih.gov/browse/INS-806)  
Part 3: Add Range Fields [INS-819](https://tracker.nci.nih.gov/browse/INS-819)  

In [1]:
# Method to import from parent directory
import os
import sys
root_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(root_dir)
import config
import re
from datetime import datetime

import requests
import pandas as pd
from tqdm import tqdm
import numpy as np
import json # for parsing full API response

# Get all existing publication functions
import modules.gather_publication_data as gpub

---TIMESTAMP OVERRIDE IN USE---
---Disable this with comments in config.py for default behavior---


# Part 1: Setup

In [2]:
# Load the gathered project/PMID table and order its rows by PMID
df_pmids = pd.read_csv('gathered_pmids_20231110.csv').sort_values(by='pmid')
df_pmids

Unnamed: 0,coreproject,pmid,applid
57090,P30CA016042,1279509,10892417
108165,P30CA043703,1280555,10784804
109340,P30CA043703,1281066,10784804
107499,P30CA042014,1282437,10890420
91299,P30CA021765,1283327,10892444
...,...,...,...
148765,U01CA253858,37947334,10693842
148794,U01CA253911,37947335,10674788
49239,P30CA014520,37947335,10905065
148800,U01CA253911,37947337,10674788


In [3]:
# Set up testing PMIDs
# NOTE: pmid_start is a position in the PMID-sorted list of unique PMIDs,
# not a PMID value.
pmid_start = 100000
test_range = 1

pmid_range = f"{pmid_start}:{pmid_start+test_range}"
unique_pmids = df_pmids['pmid'].unique().tolist()
test_pmids = unique_pmids[pmid_start:(pmid_start+test_range)]

print(f"Unique PMIDs (all):        {len(unique_pmids):>14}")
# Report the number of PMIDs actually selected: a slice that runs past the
# end of the list is silently shortened, so this can be less than test_range.
print(f"Unique PMID test count:    {len(test_pmids):>14}")
print(f"PMID range:                {pmid_range:>14}")

Unique PMIDs (all):                144658
Unique PMID test count:                 1
PMID range:                 100000:100001


### Set up testing that mimics small-scale publication gathering

In [4]:
# Collect one record per PMID and build the DataFrame once at the end;
# concatenating inside the loop re-copies the whole frame on every iteration.
pub_info_records = []

# Pick a range of pmids to test
for pmid in tqdm(test_pmids, ncols=80):
    try:
        # Use PubMed API to get publication data. The call lives inside the
        # try so a failed lookup is recorded as an NA row rather than
        # aborting the loop before the handler below can run.
        publication_info = gpub.get_pubmed_info_from_pmid(pmid)

        # An empty/None result adds no row
        if not publication_info:
            continue

        pub_info_records.append({
            'pmid': pmid,
            'title': publication_info['title'],
            'authors': publication_info['authors'],
            'publication_date': publication_info['publication_date']
        })

    except Exception as e:
        print(f"Error processing PMID {pmid}: {e}")
        # Fill in fields with NA if not available
        pub_info_records.append({
            'pmid': pmid,
            'title': pd.NA,
            'authors': pd.NA,
            'publication_date': pd.NA
        })

df_pub_info = pd.DataFrame(pub_info_records)
df_pub_info

  0%|                                                     | 0/1 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.75it/s]


Unnamed: 0,pmid,title,authors,publication_date
0,30659131,"Cervical Cancer, Version 3.2019, NCCN Clinical...","Wui-Jin Koh, Nadeem R Abu-Rustum, Sarah Bean, ...",2019-01-01


In [5]:
# # Build final publication df
# df_publications, df_removed_publications = gpub.merge_and_clean_project_pmid_info(
#                                             df_pmids[['coreproject','pmid']], 
#                                             df_pub_info)


# Part 2: Fix unexpected year formatting
PMID 37796644 is the working example: its journal `PubDate` holds a free-text `MedlineDate` value (`2023 Sep-Oct 01`).

In [6]:
# Example PMID whose PubDate is a free-text MedlineDate ('2023 Sep-Oct 01')
gpub.get_pubmed_info_from_pmid(37796644)

{'publication_id': 37796644,
 'title': 'Evolution of Response-Based Radiotherapy for Hepatocellular Cancer.',
 'authors': 'Ameer L Elaimy, Yue Cao, Theodore S Lawrence',
 'publication_date': datetime.datetime(2023, 9, 1, 0, 0)}

In [7]:
# Same lookup for the PMID retrieved in Part 1, for comparison
gpub.get_pubmed_info_from_pmid(30659131)

{'publication_id': 30659131,
 'title': 'Cervical Cancer, Version 3.2019, NCCN Clinical Practice Guidelines in Oncology.',
 'authors': 'Wui-Jin Koh, Nadeem R Abu-Rustum, Sarah Bean, Kristin Bradley, Susana M Campos, Kathleen R Cho, Hye Sook Chon, Christina Chu, Rachel Clark, David Cohn, Marta Ann Crispens, Shari Damast, Oliver Dorigo, Patricia J Eifel, Christine M Fisher, Peter Frederick, David K Gaffney, Ernest Han, Warner K Huh, John R Lurain, Andrea Mariani, David Mutch, Christa Nagel, Larissa Nekhlyudov, Amanda Nickles Fader, Steven W Remmenga, R Kevin Reynolds, Todd Tillmanns, Stefanie Ueda, Emily Wyse, Catheryn M Yashar, Nicole R McMillian, Jillian L Scavone',
 'publication_date': datetime.datetime(2019, 1, 1, 0, 0)}

In [8]:
# Fetch the full PubMed record, walk down to the journal issue's PubDate,
# and show its MedlineDate entry ('' when that key is absent)
medline_example_record = gpub.get_full_publication_record('37796644')
medline_example_pubdate = (
    medline_example_record['PubmedArticle'][0]['MedlineCitation']['Article']
                          ['Journal']['JournalIssue']['PubDate']
)
medline_example_pubdate.get('MedlineDate', '')

'2023 Sep-Oct 01'

In [9]:
# Load the logged PubMed gathering errors and rename the 'PMID' column to
# lowercase 'pmid'
df_pubmed_errors = (
    pd.read_csv('pubmed_gathering_errors_20231206.csv')
      .rename(columns={'PMID': 'pmid'})
)
df_pubmed_errors

Unnamed: 0,pmid,Error
0,12025811,list index out of range
1,15602803,list index out of range
2,21225868,list index out of range
3,22431807,list index out of range
4,23847722,list index out of range
...,...,...
178,37471612,Year
179,37796644,Year
180,37796646,Year
181,37963365,Year


In [10]:
# Collect one (pmid, MedlineDate) record per PMID and build the DataFrame once,
# instead of concatenating onto an empty frame on every iteration
year_format_records = []

# Iterate through each unique PMID with year error
year_error_pmids = df_pubmed_errors.loc[
    df_pubmed_errors['Error'] == 'Year', 'pmid'
].unique()

for pmid in year_error_pmids:
    # Get the PubMed record from the API
    record = gpub.get_full_publication_record(pmid)

    # Pull out the MedlineDate; falls back to '' when the PubDate has no such key
    pub_date = record['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
    year_format_records.append({
        'pmid': pmid,
        'date': pub_date.get('MedlineDate', '')
    })

# Explicit columns keep both columns present even when no PMID has a year error
df_year_formats = pd.DataFrame(year_format_records, columns=['pmid', 'date'])

# Display the final DataFrame
df_year_formats

Unnamed: 0,pmid,date
0,1921996,1991 May 29-Jun 12
1,2152373,1990-1991 Winter
2,2983224,1985 Feb 28-Mar 6
3,3024013,1986 Nov 27-Dec 3
4,3025740,1986 Dec 18-31
...,...,...
90,37471612,2023 Jul-Aug 01
91,37796644,2023 Sep-Oct 01
92,37796646,2023 Sep-Oct 01
93,37963365,2023 Nov-Dec 01


In [11]:
# Export as csv for reference and ad hoc use
# (index=False leaves the DataFrame index out of the file)
df_year_formats.to_csv('publication_medline_dates_20231211.csv', index=False)

In [12]:
# List every raw MedlineDate string to survey the formats in use
df_year_formats['date'].tolist()

['1991 May 29-Jun 12',
 '1990-1991 Winter',
 '1985 Feb 28-Mar 6',
 '1986 Nov 27-Dec 3',
 '1986 Dec 18-31',
 '1987 Jan 1-7',
 '1988-1989',
 '1986 Jun 5-11',
 '1986 Dec 4-10',
 '1987 Jan 8-14',
 '1985 Jun 27-Jul 3',
 '1985 Jun 20-26',
 '1986 Feb 6-12',
 '1984 Nov 29-Dec 5',
 '1983-1984',
 '1981-1982',
 '1984 Aug 16-22',
 '1983 Jun 23-29',
 '1983 Nov 10-16',
 '1983 Dec 8-14',
 '1984 May 3-9',
 '1984-1985',
 '1983 Feb 17-23',
 '1983 Dec 22-1984 Jan 4',
 '1984 Aug 16-22',
 '1983 Dec 1-7',
 '1983-1984',
 '1984 May 10-16',
 '1983 Dec 23-30',
 '1984 Feb 23-29',
 '1984 Apr 5-11',
 '1994-1995',
 '1994-1995',
 '1995-1996',
 '1995-1996',
 '1998-1999',
 '1999 Jun 23-30',
 '1998-1999',
 '2000 Jul 27-Aug 10',
 '2002 Oct 23-30',
 '2003-2004',
 '2004 Sep 11-17',
 '2005 Jun 11-17',
 '2010-2011',
 '2011-2012',
 '2011 Dec 5-7',
 '2014 Feb 12-18',
 '2014 Jul 23-30',
 '2016 Mar 22-29',
 '2017 Winter',
 '2017 Winter',
 '2018 Fall',
 '2018 Fall',
 '2021 Mar-Apr 01',
 '2021 Jan-Feb 01',
 '2022 Jan-Feb 01',
 '2

### Write a function to parse the varied Medline date formats into a standard `yyyy-mm-dd` date string

In [13]:
def extract_medline_date_components(date_str):
    """Convert a free-text Medline date string into a 'yyyy-mm-dd' string.

    Each component is taken from the first match of its kind, so a range such
    as '1991 May 29-Jun 12' resolves to its starting date:

    * year  - first standalone 4-digit number (required)
    * month - first 3-letter month abbreviation; otherwise a season name
              (Winter=12, Spring=3, Summer=6, Fall=9); otherwise 1
    * day   - first standalone 1- or 2-digit number; otherwise 1

    Month and season names are matched case-insensitively.

    :param date_str: A string containing date information in the Medline format.
    :return: A formatted date string in the format 'yyyy-mm-dd' or None if an
        error occurs (the error is printed).
    """
    # Keys are lowercase because the season search below ignores case
    season_to_month = {'winter': 12, 'spring': 3, 'summer': 6, 'fall': 9}

    try:
        # Look for the first 4-digit number and store as the year
        year_match = re.search(r'\b\d{4}\b', date_str)
        if year_match:
            year = int(year_match.group())
        else:
            raise ValueError("No year found")

        # Find first month occurrence (Mmm) and store as the numerical month
        month_match = re.search(
                    r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b',
                    date_str, re.I)
        if month_match:
            month = datetime.strptime(month_match.group(), "%b").month
        else:
            # Look for season string (Winter, Spring, Summer, Fall) and store
            # as the numerical month (12, 3, 6, 9).
            season_match = re.search(r'\b(?:Winter|Spring|Summer|Fall)\b',
                                      date_str, re.I)
            if season_match:
                # Normalise the matched text so 'winter' or 'FALL' resolve
                # instead of raising KeyError and returning None
                month = season_to_month[season_match.group().lower()]
            else:
                month = 1  # If no month or season, assign 1.

        # Look for the first standalone 1- or 2-digit number and store as day.
        # The \b anchors stop this from matching inside the 4-digit year.
        day_match_space = re.search(r'\b\d{1,2}\b', date_str)

        if day_match_space:
            day = int(day_match_space.group())
        else:
            day = 1  # If not identified, assign 1.

        # datetime() validates the combination (e.g. rejects Feb 30)
        return datetime(year, month, day).strftime('%Y-%m-%d')

    except Exception as e:
        print(f"Error processing date string '{date_str}': {e}")
        return None
    

    
def format_medline_publication_date(date_str_list):
    """
    Standardise a collection of Medline date strings to 'yyyy-mm-dd'.

    :param date_str_list: An iterable of strings, each holding a date in one
                of several Medline formats.
    :return: A list, in input order, with the formatted date string for each
                entry, or None for an entry that could not be parsed.
    """
    return [extract_medline_date_components(raw_date)
            for raw_date in date_str_list]

In [14]:
# Run the parser over every collected MedlineDate string to inspect the results
format_medline_publication_date(df_year_formats['date'])

['1991-05-29',
 '1990-12-01',
 '1985-02-28',
 '1986-11-27',
 '1986-12-18',
 '1987-01-01',
 '1988-01-01',
 '1986-06-05',
 '1986-12-04',
 '1987-01-08',
 '1985-06-27',
 '1985-06-20',
 '1986-02-06',
 '1984-11-29',
 '1983-01-01',
 '1981-01-01',
 '1984-08-16',
 '1983-06-23',
 '1983-11-10',
 '1983-12-08',
 '1984-05-03',
 '1984-01-01',
 '1983-02-17',
 '1983-12-22',
 '1984-08-16',
 '1983-12-01',
 '1983-01-01',
 '1984-05-10',
 '1983-12-23',
 '1984-02-23',
 '1984-04-05',
 '1994-01-01',
 '1994-01-01',
 '1995-01-01',
 '1995-01-01',
 '1998-01-01',
 '1999-06-23',
 '1998-01-01',
 '2000-07-27',
 '2002-10-23',
 '2003-01-01',
 '2004-09-11',
 '2005-06-11',
 '2010-01-01',
 '2011-01-01',
 '2011-12-05',
 '2014-02-12',
 '2014-07-23',
 '2016-03-22',
 '2017-12-01',
 '2017-12-01',
 '2018-09-01',
 '2018-09-01',
 '2021-03-01',
 '2021-01-01',
 '2022-01-01',
 '2022-01-01',
 '2021-11-01',
 '2021-02-01',
 '2021-11-01',
 '2021-01-01',
 '2021-01-01',
 '2022-01-01',
 '2021-11-01',
 '2021-03-01',
 '2021-10-01',
 '2022-03-

In [15]:
# Keep the standardised dates alongside the raw MedlineDate strings
df_year_formats['std_date'] = format_medline_publication_date(df_year_formats['date'])

In [16]:
# Save the raw and standardised dates together (DataFrame index omitted)
df_year_formats.to_csv('publication_medline_dates_corrected_20231211.csv', index=False)

### Test after moving to `gather_publication_data.py`

In [17]:
# Re-fetch every PMID collected in df_year_formats. Exceptions are not caught
# here, so any lookup that still fails stops the cell.
pmid_info_records = []

for pmid in tqdm(df_year_formats['pmid']):
    publication_info = gpub.get_pubmed_info_from_pmid(pmid)

    pmid_info_records.append({
        'pmid': pmid,
        'title': publication_info['title'],
        'authors': publication_info['authors'],
        'publication_date': publication_info['publication_date']
    })

# Build the DataFrame once rather than concatenating inside the loop
df_pmid_info = pd.DataFrame(pmid_info_records)

df_pmid_info

100%|██████████| 95/95 [00:21<00:00,  4.33it/s]


Unnamed: 0,pmid,title,authors,publication_date
0,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",1991-05-29
1,2152373,The biological significance of the interaction...,"C S Murphy, V C Jordan",1990-12-01
2,2983224,Stable replication of plasmids derived from Ep...,"J L Yates, N Warren, B Sugden",1985-02-28
3,3024013,The v-fms oncogene induces factor independence...,"E F Wheeler, C W Rettenmier, A T Look, C J Sherr",1986-11-27
4,3025740,Predominant use of a V alpha gene segment in m...,"A Winoto, J L Urban, N C Lan, J Goverman, L Ho...",1986-12-18
...,...,...,...,...
90,37471612,Recent Advances in Blood-Based Liquid Biopsy A...,"Andi K Cani, Simpa S Salami",2023-07-01
91,37796644,Evolution of Response-Based Radiotherapy for H...,"Ameer L Elaimy, Yue Cao, Theodore S Lawrence",2023-09-01
92,37796646,Evaluation of the Prognostic Role of Liver Met...,"Jessica J Waninger, Vincent T Ma, Zoey Chopra,...",2023-09-01
93,37963365,From Race to Racism and Disparities to Equity:...,"Katherine Reeder-Hayes, Mya L Roberson, Stepha...",2023-11-01


Looks good. No errors were thrown, and a spot-check shows the dates are parsed correctly.

# Part 3: Add RCR and Citation Range fields
This work was started for [INS-819](https://tracker.nci.nih.gov/browse/INS-819) but later cancelled. These groupings are better handled on the frontend. 

In [3]:
# Load publications.tsv (tab-separated; path is relative to the parent directory)
# NOTE(review): `df_publicatons` is missing an 'i'; the name is left unchanged
# in case cells outside this view reference it.
df_publicatons = pd.read_csv('../'+config.PUBLICATIONS_OUTPUT, sep='\t')
df_publicatons

Unnamed: 0,coreproject,pmid,title,authors,publication_date,citation_count,relative_citation_ratio
0,P30CA016058,10533315,Positive and negative signaling in B lymphocytes.,K M Coggeshall,2000-01-01,5.0,0.07
1,P30CA016056,10570225,Murine B cell differentiation is accompanied b...,"S A Wuensch, R Y Huang, J Ewing, X Liang, J T Lau",2000-01-01,189.0,4.91
2,R01CA076287,10585592,Synergistic cytotoxicity of iodine-131-anti-CD...,"T A Johnson, O W Press",2000-01-01,10.0,0.16
3,P30CA034196,10592196,Mouse tumor biology database (MTB): enhancemen...,"C J Bult, D M Krupke, J P Sundberg, J T Eppig",2000-01-01,10.0,0.28
4,P30CA042014,10594011,Auto-inhibition of Ets-1 is counteracted by DN...,"T L Goetz, T L Gu, N A Speck, B J Graves",2000-01-01,17.0,0.29
...,...,...,...,...,...,...,...
150695,R01CA251555,38036788,Isoform-level transcriptome-wide association u...,"Arjun Bhattacharya, Daniel D Vo, Connor Jops, ...",2023-01-30,,
150696,P30CA008748,38036927,Morbidity and Outcomes of Primary Tumor Manage...,"Aradhya Nigam, Janet W Y Li, Megan Fiasconaro,...",2023-01-30,,
150697,D43TW010540,38037005,The impact of the COVID-19 pandemic on routine...,"Magdiel A Habila, Mavis Obeng-Kusi, Maryam J A...",2023-01-30,,
150698,P30CA168524,38037383,Cancer Prevention Perspective: The University ...,"Roy A Jensen, Christie A Befort",2023-12-01,,


Define the category groups as dictionaries so they can later be defined in the config file.

In [27]:
# Citation-count buckets: label -> [low, high]. categorize_column() treats both
# bounds as inclusive and None as unbounded. The integer bounds leave gaps for
# non-integer values (e.g. 19.5 matches no bucket).
CITATION_CATEGORIES = {
    "0 to 4": [0, 4],
    "5 to 9": [5, 9],
    "10 to 14": [10, 14],
    "15 to 19": [15, 19],
    ">= 20": [20, None]
}

# RCR buckets: label -> [low, high]. Adjacent buckets share a boundary value;
# because categorize_column() uses inclusive bounds and returns the first
# match, a shared value lands in the lower bucket
# (e.g. 0.2 -> "0 to 0.2", 5 -> "2 to 5").
RCR_CATEGORIES = {
    "0 to 0.2": [0, 0.2],
    "0.2 to 0.5": [0.2, 0.5],
    "0.5 to 0.8": [0.5, 0.8],
    "0.8 to 1.25": [0.8, 1.25],
    "1.25 to 2": [1.25, 2],
    "2 to 5": [2, 5],
    "> 5": [5, None]
}

In [40]:
def categorize_column(value, ranges):
    """Return the label of the first range that contains ``value``.

    :param value: The number to classify.
    :param ranges: Mapping of label -> (low, high). Both bounds are inclusive;
        a bound of None means unbounded on that side.
    :return: The first matching label in mapping order, or None when no range
        contains the value.
    """
    for label, bounds in ranges.items():
        lower, upper = bounds
        # Skip this range as soon as either bound rules the value out
        if lower is not None and not value >= lower:
            continue
        if upper is not None and not value <= upper:
            continue
        return label
    return None

def add_category_columns(df, citation_ranges, rcr_ranges):
    """Add range-label columns for citation count and RCR to ``df``.

    Writes 'citation_count_category' (from 'citation_count') and 'rcr_range'
    (from 'relative_citation_ratio') onto the passed DataFrame itself.

    :param df: DataFrame with 'citation_count' and 'relative_citation_ratio'
        columns; modified in place.
    :param citation_ranges: Label -> (low, high) mapping for citation counts.
    :param rcr_ranges: Label -> (low, high) mapping for RCR values.
    :return: The same DataFrame object, with the two columns added.
    """
    def _labeller(bins):
        # Bind one range table to a per-value lookup
        return lambda cell: categorize_column(cell, bins)

    df['citation_count_category'] = df['citation_count'].apply(_labeller(citation_ranges))
    df['rcr_range'] = df['relative_citation_ratio'].apply(_labeller(rcr_ranges))
    return df

In [41]:
# Probe the citation buckets at and around their edges, including a
# non-integer (19.5) that falls in the gap between "15 to 19" and ">= 20"
for value in [0, 1, 5, 10, 11, 15, 19, 19.5, 20, 21]:
    print(f"{value:<10} {categorize_column(value, CITATION_CATEGORIES)}")

0          0 to 4
1          0 to 4
5          5 to 9
10         10 to 14
11         10 to 14
15         15 to 19
19         15 to 19
19.5       None
20         >= 20
21         >= 20


In [43]:
# Probe the RCR buckets at their shared boundaries; each boundary value is
# reported in the lower of the two buckets that contain it.
# (0.1 and .1 are the same value, so that case is checked twice.)
for value in [0, 0.1, .1, 0.2, .5, .8, 1, 1.25, 2, 5, 6]:
    print(f"{value:<10} {categorize_column(value, RCR_CATEGORIES)}")

0          0 to 0.2
0.1        0 to 0.2
0.1        0 to 0.2
0.2        0 to 0.2
0.5        0.2 to 0.5
0.8        0.5 to 0.8
1          0.8 to 1.25
1.25       0.8 to 1.25
2          1.25 to 2
5          2 to 5
6          > 5
