# Compare PubMed API and iCite Publication Data
2023-12-01 ZD  

Comparison of data pulled from the Entrez PubMed API and the iCite bulk download for relvant INS PMIDs. This notebook will be used to determine the best method for gathering publication data for INS. 

Part 1: Comparing data  
Part 2: Combining data

In [1]:
# Method to import from parent directory
import os
import sys
root_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(root_dir)
import config

import requests
import pandas as pd
from tqdm import tqdm
import numpy as np

# Get all existing publication functions
import modules.gather_publication_data as gpub

---TIMESTAMP OVERRIDE IN USE---
---Disable this with comments in config.py for default behavior---


### Load icite and pubmed data from checkpoint files

In [2]:
df_icite = pd.read_csv('pmids_with_icite_20231201.csv')
df_icite

Unnamed: 0,pmid,doi,title,authors,year,citation_count,relative_citation_ratio
0,1279509,10.1203/00006450-199210000-00018,Expression and regulation of L-selectin on eos...,"J B Smith, R D Kunjummen, T K Kishimoto, D C A...",1992.0,25.0,0.67
1,1280555,10.1002/cyto.990130707,Streptavidin-based quantitative staining of in...,"P Srivastava, T L Sladek, M N Goodman, J W Jac...",1992.0,14.0,0.43
2,1281066,10.1002/cyto.990130808,"Reticulocyte quantification by flow cytometry,...","K J Schimenti, K Lacerna, A Wamble, L Maston, ...",1992.0,38.0,1.45
3,1282437,10.1101/gr.2.2.137,Development of a sensitive reverse transcripta...,"S S Tan, J H Weis",1992.0,49.0,1.00
4,1283327,10.1002/gcc.2870050414,Sublocalization of the chromosome 5 breakpoint...,"S W Morris, J T Foust, M B Valentine, W M Robe...",1992.0,14.0,0.29
...,...,...,...,...,...,...,...
144653,37947333,,,,,,
144654,37947334,,,,,,
144655,37947335,,,,,,
144656,37947337,,,,,,


In [3]:
df_pubmed = gpub.load_all_directory_files_to_df('../'+config.TEMP_PUBLICATION_DIR)
df_pubmed

Unnamed: 0,pmid,title,authors,publication_year
0,1279509,Expression and regulation of L-selectin on eos...,"J B Smith, R D Kunjummen, T K Kishimoto, D C A...",1992.0
1,1280555,Streptavidin-based quantitative staining of in...,"P Srivastava, T L Sladek, M N Goodman, J W Jac...",1992.0
2,1281066,"Reticulocyte quantification by flow cytometry,...","K J Schimenti, K Lacerna, A Wamble, L Maston, ...",1992.0
3,1281564,Bleomycin-detectable iron in plasma of bone-ma...,"C A Foerder, A A Tobin, G B McDonald, R A Zager",1992.0
4,1282437,Development of a sensitive reverse transcripta...,"S S Tan, J H Weis",1992.0
...,...,...,...,...
145403,37968277,PAX3-FOXO1 dictates myogenic reprogramming and...,"Madeline B Searcy, Randolph K Larsen, Bradley ...",2023.0
145404,37971219,Cognitive function and ability to complete a w...,"Alexxandra J Hoffmann, Amy L Tin, Andrew J Vic...",2023.0
145405,37971305,"Cistrome Data Browser: integrated search, anal...","Len Taing, Ariaki Dandawate, Sehi L'Yi, Nils G...",2023.0
145406,37973913,SND1 binds to ERG and promotes tumor growth in...,"Sheng-You Liao, Dmytro Rudoy, Sander B Frank, ...",2023.0


In [24]:
# Check for duplicate pmids
df_pubmed.groupby('pmid').size().reset_index().sort_values(by=0,ascending=False)

Unnamed: 0,pmid,0
0,1279509,1
96941,30137196,1
96935,30136444,1
96936,30136582,1
96937,30136838,1
...,...,...
48463,23048078,1
48462,23047847,1
48461,23047803,1
48460,23047649,1


In [33]:
# Get rows where ALL pubmed-specific values are NaN
df_pubmed[df_pubmed[['title', 'authors', 'publication_year']].isna().all(axis=1)]

Unnamed: 0,pmid,title,authors,publication_year


In [34]:
# Get rows where ANY pubmed-specific values are NaN
df_pubmed[df_pubmed[['title', 'authors', 'publication_year']].isna().any(axis=1)]

Unnamed: 0,pmid,title,authors,publication_year
1469,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",
1569,1975507,Effect of corticosteroid creams on descent of ...,,1990.0
1926,2152373,The biological significance of the interaction...,"C S Murphy, V C Jordan",
3412,2743835,Suppression of tumorigenicity by polar compoun...,,1989.0
3745,2865921,Treatment of blastomycosis and histoplasmosis ...,,1985.0
...,...,...,...,...
144644,37735267,Decoding the building blocks of cellular proce...,,2023.0
144716,37748199,"Correction to: Morphologic, immunophenotypic, ...",,2023.0
144785,37766422,Correction to: Ten-year update: NRG Oncology/N...,,2023.0
144917,37796644,Evolution of Response-Based Radiotherapy for H...,"Ameer L Elaimy, Yue Cao, Theodore S Lawrence",


In [35]:
# First filter out pmids from larger pubmed_df, then filter out pmids from smaller icite df
df_pubmed = df_pubmed[df_pubmed['pmid'].isin(df_icite['pmid'].unique())]
df_icite = df_icite[df_icite['pmid'].isin(df_pubmed['pmid'].unique())]

In [36]:
df_pubmed

Unnamed: 0,pmid,title,authors,publication_year
0,1279509,Expression and regulation of L-selectin on eos...,"J B Smith, R D Kunjummen, T K Kishimoto, D C A...",1992.0
1,1280555,Streptavidin-based quantitative staining of in...,"P Srivastava, T L Sladek, M N Goodman, J W Jac...",1992.0
2,1281066,"Reticulocyte quantification by flow cytometry,...","K J Schimenti, K Lacerna, A Wamble, L Maston, ...",1992.0
4,1282437,Development of a sensitive reverse transcripta...,"S S Tan, J H Weis",1992.0
5,1283327,Sublocalization of the chromosome 5 breakpoint...,"S W Morris, J T Foust, M B Valentine, W M Robe...",1992.0
...,...,...,...,...
145364,37947333,Adapting a model of cervical carcinogenesis to...,"Jennifer C Spencer, Emily A Burger, Nicole G C...",2023.0
145365,37947334,"Contribution of smoking, disease history, and ...","Sarah Skolnick, Pianpian Cao, Jihyoun Jeon, Ra...",2023.0
145366,37947335,Data gaps and opportunities for modeling cance...,"Amy Trentham-Dietz, Douglas A Corley, Natalie ...",2023.0
145367,37947337,Population simulation modeling of disparities ...,"Jeanne S Mandelblatt, Clyde B Schechter, Natas...",2023.0


In [37]:
df_icite

Unnamed: 0,pmid,doi,title,authors,year,citation_count,relative_citation_ratio
0,1279509,10.1203/00006450-199210000-00018,Expression and regulation of L-selectin on eos...,"J B Smith, R D Kunjummen, T K Kishimoto, D C A...",1992.0,25.0,0.67
1,1280555,10.1002/cyto.990130707,Streptavidin-based quantitative staining of in...,"P Srivastava, T L Sladek, M N Goodman, J W Jac...",1992.0,14.0,0.43
2,1281066,10.1002/cyto.990130808,"Reticulocyte quantification by flow cytometry,...","K J Schimenti, K Lacerna, A Wamble, L Maston, ...",1992.0,38.0,1.45
3,1282437,10.1101/gr.2.2.137,Development of a sensitive reverse transcripta...,"S S Tan, J H Weis",1992.0,49.0,1.00
4,1283327,10.1002/gcc.2870050414,Sublocalization of the chromosome 5 breakpoint...,"S W Morris, J T Foust, M B Valentine, W M Robe...",1992.0,14.0,0.29
...,...,...,...,...,...,...,...
144653,37947333,,,,,,
144654,37947334,,,,,,
144655,37947335,,,,,,
144656,37947337,,,,,,


In [38]:
len(df_icite) == len(df_pubmed)

True

In [40]:
print(f"PubMed columns: {df_pubmed.columns.tolist()}")
print(f"iCite columns:  {df_icite.columns.tolist()}")

PubMed columns: ['pmid', 'title', 'authors', 'publication_year']
iCite columns:  ['pmid', 'doi', 'title', 'authors', 'year', 'citation_count', 'relative_citation_ratio']


In [42]:
df_pubmed = df_pubmed.rename(columns={'publication_year':'year'})

In [44]:
main_cols = ['pmid', 'title','authors','year']

In [45]:
pd.concat([df_pubmed[main_cols],df_icite[main_cols]]).drop_duplicates(keep=False)

Unnamed: 0,pmid,title,authors,year
1011,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0
1469,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",
1562,1972651,Partial Xq25 deletion in a family with the X-l...,"W G Sanger, H L Grierson, J Skare, H Wyandt, S...",1990.0
1926,2152373,The biological significance of the interaction...,"C S Murphy, V C Jordan",
2237,2276277,Partial amino acid sequence determination of b...,"D L Cooper, E W Baptist, J Enghild, H Lee, N I...",1990.0
...,...,...,...,...
144653,37947333,,,
144654,37947334,,,
144655,37947335,,,
144656,37947337,,,


In [72]:
df_diffs = (df_pubmed[main_cols].merge(df_icite[main_cols], 
                                       indicator = 'source', how='outer')
                                       .loc[lambda x : x['source']!='both']
                                       .sort_values(by='pmid', ignore_index=True))

In [73]:
df_diffs

Unnamed: 0,pmid,title,authors,year,source
0,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0,left_only
1,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0,right_only
2,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",,left_only
3,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",1991.0,right_only
4,1972651,Partial Xq25 deletion in a family with the X-l...,"W G Sanger, H L Grierson, J Skare, H Wyandt, S...",1990.0,left_only
...,...,...,...,...,...
6233,37947335,Data gaps and opportunities for modeling cance...,"Amy Trentham-Dietz, Douglas A Corley, Natalie ...",2023.0,left_only
6234,37947337,,,,right_only
6235,37947337,Population simulation modeling of disparities ...,"Jeanne S Mandelblatt, Clyde B Schechter, Natas...",2023.0,left_only
6236,37947614,Machine Learning Allows for Distinguishing Pre...,"Mikhail Petrov, Igor Sokolov",2023.0,left_only


In [74]:
df_diffs = df_diffs.replace({'left_only':'PubMed','right_only':'iCite'})
df_diffs

Unnamed: 0,pmid,title,authors,year,source
0,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0,PubMed
1,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0,iCite
2,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",,PubMed
3,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",1991.0,iCite
4,1972651,Partial Xq25 deletion in a family with the X-l...,"W G Sanger, H L Grierson, J Skare, H Wyandt, S...",1990.0,PubMed
...,...,...,...,...,...
6233,37947335,Data gaps and opportunities for modeling cance...,"Amy Trentham-Dietz, Douglas A Corley, Natalie ...",2023.0,PubMed
6234,37947337,,,,iCite
6235,37947337,Population simulation modeling of disparities ...,"Jeanne S Mandelblatt, Clyde B Schechter, Natas...",2023.0,PubMed
6236,37947614,Machine Learning Allows for Distinguishing Pre...,"Mikhail Petrov, Igor Sokolov",2023.0,PubMed


In [75]:
df_diff_pubmed = df_diffs[df_diffs['source'] == 'PubMed'].drop(columns='source')
df_diff_pubmed

Unnamed: 0,pmid,title,authors,year
0,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0
2,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",
4,1972651,Partial Xq25 deletion in a family with the X-l...,"W G Sanger, H L Grierson, J Skare, H Wyandt, S...",1990.0
6,2152373,The biological significance of the interaction...,"C S Murphy, V C Jordan",
8,2276277,Partial amino acid sequence determination of b...,"D L Cooper, E W Baptist, J Enghild, H Lee, N I...",1990.0
...,...,...,...,...
6229,37947333,Adapting a model of cervical carcinogenesis to...,"Jennifer C Spencer, Emily A Burger, Nicole G C...",2023.0
6231,37947334,"Contribution of smoking, disease history, and ...","Sarah Skolnick, Pianpian Cao, Jihyoun Jeon, Ra...",2023.0
6233,37947335,Data gaps and opportunities for modeling cance...,"Amy Trentham-Dietz, Douglas A Corley, Natalie ...",2023.0
6235,37947337,Population simulation modeling of disparities ...,"Jeanne S Mandelblatt, Clyde B Schechter, Natas...",2023.0


In [76]:
df_diff_icite = df_diffs[df_diffs['source'] == 'iCite'].drop(columns='source')
df_diff_icite

Unnamed: 0,pmid,title,authors,year
1,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0
3,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",1991.0
5,1972651,Partial Xq25 deletion in a family with the X-l...,"W G Sanger, H L Grierson, J Skare, H Wyandt, S...",1990.0
7,2152373,The biological significance of the interaction...,"C S Murphy, V C Jordan",1990.0
9,2276277,Partial amino acid sequence determination of b...,"D L Cooper, E W Baptist, J Enghild, H Lee, N I...",1990.0
...,...,...,...,...
6228,37947333,,,
6230,37947334,,,
6232,37947335,,,
6234,37947337,,,


In [68]:
df_diff_pubmed.to_csv('data_diff_pubmed_20231201.csv', index=False)
df_diff_icite.to_csv('data_diff_icite_20231201.csv', index=False)
df_diffs.to_csv('data_diff_combined_20231201.csv', index=False)

In [77]:
df_diffs

Unnamed: 0,pmid,title,authors,year,source
0,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0,PubMed
1,1702006,Prognostic importance of immunophenotyping in ...,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",1990.0,iCite
2,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",,PubMed
3,1921996,Termination of transcription of ribosomal RNA ...,"S P Johnson, J R Warner",1991.0,iCite
4,1972651,Partial Xq25 deletion in a family with the X-l...,"W G Sanger, H L Grierson, J Skare, H Wyandt, S...",1990.0,PubMed
...,...,...,...,...,...
6233,37947335,Data gaps and opportunities for modeling cance...,"Amy Trentham-Dietz, Douglas A Corley, Natalie ...",2023.0,PubMed
6234,37947337,,,,iCite
6235,37947337,Population simulation modeling of disparities ...,"Jeanne S Mandelblatt, Clyde B Schechter, Natas...",2023.0,PubMed
6236,37947614,Machine Learning Allows for Distinguishing Pre...,"Mikhail Petrov, Igor Sokolov",2023.0,PubMed


Format for human review and comparison of differences in Excel

In [111]:
# Create a MultiIndex to group by PMID and Source
df_grouped = df_diffs.set_index(['pmid', 'source'])

# Pivot the DataFrame to create separate columns for PubMed and iCite data
df_pivot = df_grouped.pivot_table(index='pmid', columns='source', values=['title', 'authors', 'year'], aggfunc='first')

# Fill NaN with an empty string for better visibility
df_pivot.fillna('', inplace=True)

# Reset the index for easier human readability
df_pivot.reset_index(inplace=True)

# Rename columns
df_pivot.columns = [f'{col}_{source}' if col != 'pmid' else col for col, source in df_pivot.columns]

# Add columns for indicating differences
for col in ['authors', 'title', 'year']:
    col_pubmed = f'{col}_PubMed'
    col_icite = f'{col}_iCite'
    col_diff = f'diff_{col}'

    df_pivot[col_diff] = ''
    
    # Check for missing values
    mask_missing_pubmed = df_pivot[col_pubmed].eq('')
    mask_missing_icite = df_pivot[col_icite].eq('')
    df_pivot.loc[mask_missing_pubmed, col_diff] = 'PubMed_missing_value'
    df_pivot.loc[mask_missing_icite, col_diff] = 'iCite_missing_value'

    # Check for mismatched values
    mask_mismatch = (df_pivot[col_pubmed] != df_pivot[col_icite]) & (~mask_missing_pubmed) & (~mask_missing_icite)
    df_pivot.loc[mask_mismatch, col_diff] = f'mismatch_{col}_values'

    # Add columns for indicating differences
for col in ['authors', 'title', 'year']:
    col_pubmed = f'{col}_PubMed'
    col_icite = f'{col}_iCite'
    col_diff = f'diff_{col}'

    df_pivot[col_diff] = ''
    
    # Check for missing values
    mask_missing_pubmed = df_pivot[col_pubmed].eq('')
    mask_missing_icite = df_pivot[col_icite].eq('')
    df_pivot.loc[mask_missing_pubmed, col_diff] = 'PubMed_missing_value'
    df_pivot.loc[mask_missing_icite, col_diff] = 'iCite_missing_value'

    # Check for mismatched values
    mask_mismatch = (df_pivot[col_pubmed] != df_pivot[col_icite]) & (~mask_missing_pubmed) & (~mask_missing_icite)
    df_pivot.loc[mask_mismatch, col_diff] = f'mismatch_{col}_values'

    # Check for formatting mismatches
    mask_whitespace_pubmed = df_pivot[col_pubmed].str.strip() != df_pivot[col_pubmed]
    mask_whitespace_icite = df_pivot[col_icite].str.strip() != df_pivot[col_icite]
    mask_trailing_comma_pubmed = df_pivot[col_pubmed].str.endswith(',')
    mask_trailing_comma_icite = df_pivot[col_icite].str.endswith(',')
    mask_trailing_period_pubmed = df_pivot[col_pubmed].str.endswith('.')
    mask_trailing_period_icite = df_pivot[col_icite].str.endswith('.')

    # Identify formatting mismatches
    mask_formatting_mismatch = (
        (col != 'year' and (mask_whitespace_pubmed | mask_whitespace_icite | mask_trailing_comma_pubmed | mask_trailing_comma_icite | mask_trailing_period_pubmed | mask_trailing_period_icite))
        & (~mask_missing_pubmed) & (~mask_missing_icite) & (~mask_mismatch)
    )
    df_pivot.loc[mask_formatting_mismatch, col_diff] = f'formatting_mismatch_{col}_values'



  df_pivot.fillna('', inplace=True)


In [108]:
# # Add columns for indicating differences
# for col in ['authors', 'title', 'year']:
#     col_pubmed = f'{col}_PubMed'
#     col_icite = f'{col}_iCite'
#     col_diff = f'diff_{col}'

#     df_pivot[col_diff] = ''
    
#     # Check for missing values
#     mask_missing_pubmed = df_pivot[col_pubmed].eq('')
#     mask_missing_icite = df_pivot[col_icite].eq('')
#     df_pivot.loc[mask_missing_pubmed, col_diff] = 'PubMed_missing_value'
#     df_pivot.loc[mask_missing_icite, col_diff] = 'iCite_missing_value'

#     # Check for mismatched values
#     mask_mismatch = (df_pivot[col_pubmed] != df_pivot[col_icite]) & (~mask_missing_pubmed) & (~mask_missing_icite)
#     df_pivot.loc[mask_mismatch, col_diff] = f'mismatch_{col}_values'

#     # Check for formatting mismatches
#     mask_whitespace_pubmed = df_pivot[col_pubmed].str.strip() != df_pivot[col_pubmed]
#     mask_whitespace_icite = df_pivot[col_icite].str.strip() != df_pivot[col_icite]
#     mask_trailing_comma_pubmed = df_pivot[col_pubmed].str.endswith(',')
#     mask_trailing_comma_icite = df_pivot[col_icite].str.endswith(',')
#     mask_trailing_period_pubmed = df_pivot[col_pubmed].str.endswith('.')
#     mask_trailing_period_icite = df_pivot[col_icite].str.endswith('.')

#     # Identify formatting mismatches
#     mask_formatting_mismatch = (
#         (col != 'year' and (mask_whitespace_pubmed | mask_whitespace_icite | mask_trailing_comma_pubmed | mask_trailing_comma_icite | mask_trailing_period_pubmed | mask_trailing_period_icite))
#         & (~mask_missing_pubmed) & (~mask_missing_icite) & (~mask_mismatch)
#     )
#     df_pivot.loc[mask_formatting_mismatch, col_diff] = f'formatting_mismatch_{col}_values'



In [112]:
df_pivot

Unnamed: 0,pmid,authors_PubMed,authors_iCite,title_PubMed,title_iCite,year_PubMed,year_iCite,diff_authors,diff_title,diff_year
0,1702006,"R B Geller, M Zahurak, C A Hurwitz, P J Burke,...","R B Geller, M Zahurak, C A Hurwitz, P J Burke,...",Prognostic importance of immunophenotyping in ...,Prognostic importance of immunophenotyping in ...,1990.0,1990.0,,mismatch_title_values,
1,1921996,"S P Johnson, J R Warner","S P Johnson, J R Warner",Termination of transcription of ribosomal RNA ...,Termination of transcription of ribosomal RNA ...,,1991.0,,formatting_mismatch_title_values,PubMed_missing_value
2,1972651,"W G Sanger, H L Grierson, J Skare, H Wyandt, S...","W G Sanger, H L Grierson, J Skare, H Wyandt, S...",Partial Xq25 deletion in a family with the X-l...,Partial Xq25 deletion in a family with the X-l...,1990.0,1990.0,,mismatch_title_values,
3,2152373,"C S Murphy, V C Jordan","C S Murphy, V C Jordan",The biological significance of the interaction...,The biological significance of the interaction...,,1990.0,,formatting_mismatch_title_values,PubMed_missing_value
4,2276277,"D L Cooper, E W Baptist, J Enghild, H Lee, N I...","D L Cooper, E W Baptist, J Enghild, H Lee, N I...",Partial amino acid sequence determination of b...,Partial amino acid sequence determination of b...,1990.0,1990.0,,mismatch_title_values,
...,...,...,...,...,...,...,...,...,...,...
3114,37947333,"Jennifer C Spencer, Emily A Burger, Nicole G C...",,Adapting a model of cervical carcinogenesis to...,,2023.0,,iCite_missing_value,iCite_missing_value,iCite_missing_value
3115,37947334,"Sarah Skolnick, Pianpian Cao, Jihyoun Jeon, Ra...",,"Contribution of smoking, disease history, and ...",,2023.0,,iCite_missing_value,iCite_missing_value,iCite_missing_value
3116,37947335,"Amy Trentham-Dietz, Douglas A Corley, Natalie ...",,Data gaps and opportunities for modeling cance...,,2023.0,,iCite_missing_value,iCite_missing_value,iCite_missing_value
3117,37947337,"Jeanne S Mandelblatt, Clyde B Schechter, Natas...",,Population simulation modeling of disparities ...,,2023.0,,iCite_missing_value,iCite_missing_value,iCite_missing_value


In [113]:
# Save the DataFrame to an Excel file for ad-hoc review
df_pivot.to_csv('publication_data_diffs_20231201.csv', index=False)

# Part 2: Use PubMed as the base and fill missing values with iCite

In [17]:
df_icite = pd.read_csv('pmids_with_icite_20231201.csv')

# Drop doi, it's not actually part of the requirements
df_icite.drop(columns='doi', inplace=True)

df_icite

Unnamed: 0,pmid,title,authors,year,citation_count,relative_citation_ratio
0,1279509,Expression and regulation of L-selectin on eos...,"J B Smith, R D Kunjummen, T K Kishimoto, D C A...",1992.0,25.0,0.67
1,1280555,Streptavidin-based quantitative staining of in...,"P Srivastava, T L Sladek, M N Goodman, J W Jac...",1992.0,14.0,0.43
2,1281066,"Reticulocyte quantification by flow cytometry,...","K J Schimenti, K Lacerna, A Wamble, L Maston, ...",1992.0,38.0,1.45
3,1282437,Development of a sensitive reverse transcripta...,"S S Tan, J H Weis",1992.0,49.0,1.00
4,1283327,Sublocalization of the chromosome 5 breakpoint...,"S W Morris, J T Foust, M B Valentine, W M Robe...",1992.0,14.0,0.29
...,...,...,...,...,...,...
144653,37947333,,,,,
144654,37947334,,,,,
144655,37947335,,,,,
144656,37947337,,,,,


In [18]:
df_pubmed = gpub.load_all_directory_files_to_df('../'+config.TEMP_PUBLICATION_DIR)
df_pubmed

Unnamed: 0,pmid,title,authors,publication_year
0,1279509,Expression and regulation of L-selectin on eos...,"J B Smith, R D Kunjummen, T K Kishimoto, D C A...",1992.0
1,1280555,Streptavidin-based quantitative staining of in...,"P Srivastava, T L Sladek, M N Goodman, J W Jac...",1992.0
2,1281066,"Reticulocyte quantification by flow cytometry,...","K J Schimenti, K Lacerna, A Wamble, L Maston, ...",1992.0
3,1281564,Bleomycin-detectable iron in plasma of bone-ma...,"C A Foerder, A A Tobin, G B McDonald, R A Zager",1992.0
4,1282437,Development of a sensitive reverse transcripta...,"S S Tan, J H Weis",1992.0
...,...,...,...,...
145403,37968277,PAX3-FOXO1 dictates myogenic reprogramming and...,"Madeline B Searcy, Randolph K Larsen, Bradley ...",2023.0
145404,37971219,Cognitive function and ability to complete a w...,"Alexxandra J Hoffmann, Amy L Tin, Andrew J Vic...",2023.0
145405,37971305,"Cistrome Data Browser: integrated search, anal...","Len Taing, Ariaki Dandawate, Sehi L'Yi, Nils G...",2023.0
145406,37973913,SND1 binds to ERG and promotes tumor growth in...,"Sheng-You Liao, Dmytro Rudoy, Sander B Frank, ...",2023.0


In [19]:
# Compare columns and types
print(df_pubmed.dtypes)
print(df_icite.dtypes)

pmid                  int64
title                object
authors              object
publication_year    float64
dtype: object
pmid                         int64
title                       object
authors                     object
year                       float64
citation_count             float64
relative_citation_ratio    float64
dtype: object


In [20]:
# Rename column name for consistency
df_icite.rename(columns={'year':'publication_year'}, inplace=True)

In [23]:
# Use the built-in combine_first to fill NaN pubmed with iCite where possible
df_combined = df_pubmed.combine_first(df_icite)
df_combined

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
0,"J B Smith, R D Kunjummen, T K Kishimoto, D C A...",25.0,1279509,1992.0,0.67,Expression and regulation of L-selectin on eos...
1,"P Srivastava, T L Sladek, M N Goodman, J W Jac...",14.0,1280555,1992.0,0.43,Streptavidin-based quantitative staining of in...
2,"K J Schimenti, K Lacerna, A Wamble, L Maston, ...",38.0,1281066,1992.0,1.45,"Reticulocyte quantification by flow cytometry,..."
3,"C A Foerder, A A Tobin, G B McDonald, R A Zager",49.0,1281564,1992.0,1.00,Bleomycin-detectable iron in plasma of bone-ma...
4,"S S Tan, J H Weis",14.0,1282437,1992.0,0.29,Development of a sensitive reverse transcripta...
...,...,...,...,...,...,...
145403,"Madeline B Searcy, Randolph K Larsen, Bradley ...",,37968277,2023.0,,PAX3-FOXO1 dictates myogenic reprogramming and...
145404,"Alexxandra J Hoffmann, Amy L Tin, Andrew J Vic...",,37971219,2023.0,,Cognitive function and ability to complete a w...
145405,"Len Taing, Ariaki Dandawate, Sehi L'Yi, Nils G...",,37971305,2023.0,,"Cistrome Data Browser: integrated search, anal..."
145406,"Sheng-You Liao, Dmytro Rudoy, Sander B Frank, ...",,37973913,2023.0,,SND1 binds to ERG and promotes tumor growth in...


### Validate combined results. Which NaN values remain? 

In [26]:
df_combined[df_combined['authors'].isna()]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
144644,,,37735267,2023.0,,Decoding the building blocks of cellular proce...
144716,,,37748199,2023.0,,"Correction to: Morphologic, immunophenotypic, ..."
144785,,,37766422,2023.0,,Correction to: Ten-year update: NRG Oncology/N...


In [45]:
# Check for short string values
df_combined[df_combined['authors'].str.len() <5]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
15994,C Ip,87.0,9808633,1998.0,2.13,Lessons from basic research in selenium and ca...
87034,",",17.0,28622513,2017.0,0.86,Comprehensive and Integrative Genomic Characte...
88287,",",42.0,28810144,2017.0,1.44,Integrated Genomic Characterization of Pancrea...
90128,",",385.0,29100075,2017.0,14.41,Comprehensive and Integrated Genomic Character...
117143,",",5.0,33176160,2020.0,0.83,A Systematic Framework to Rapidly Obtain Data ...


4 publications show a comma "," as the author. These all have a group or consortium instead of a name listed as the author when searched on PubMed.

In [27]:
df_combined[df_combined['title'].isna()]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title


In [46]:
# Check for short string values
df_combined[df_combined['title'].str.len() <5]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title


In [28]:
df_combined[df_combined['publication_year'].isna()]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
144917,"Ameer L Elaimy, Yue Cao, Theodore S Lawrence",,37796644,,,Evolution of Response-Based Radiotherapy for H...
144918,"Jessica J Waninger, Vincent T Ma, Zoey Chopra,...",,37796646,,,Evaluation of the Prognostic Role of Liver Met...


In [54]:
# Check for short string values
df_combined[df_combined['publication_year'] <1975]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title


In [29]:
df_combined[df_combined['relative_citation_ratio'].isna()]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
20583,"J Yao, S Xiong, K Klos, N Nguyen, R Grijalva, ...",,11781819,2001.0,,Multiple signaling pathways involved in activa...
24700,"Yago Nieto, Roy B Jones, Elizabeth J Shpall",,15368078,2004.0,,Stem-cell transplantation for the treatment of...
39047,"Jiang Gui, Angeline S Andrew, Peter Andrews, H...",,20924193,2010.0,,A simple and computationally efficient samplin...
52993,"Xiao-Jian Sun, Zhanxin Wang, Lan Wang, Yanwen ...",,23812588,2013.0,,A stable transcription factor complex nucleate...
53902,"Jasmine H Francis, Christopher A Barker, Suzan...",,23953635,2013.0,,Salvage/adjuvant brachytherapy after ophthalmi...
...,...,...,...,...,...,...
145403,"Madeline B Searcy, Randolph K Larsen, Bradley ...",,37968277,2023.0,,PAX3-FOXO1 dictates myogenic reprogramming and...
145404,"Alexxandra J Hoffmann, Amy L Tin, Andrew J Vic...",,37971219,2023.0,,Cognitive function and ability to complete a w...
145405,"Len Taing, Ariaki Dandawate, Sehi L'Yi, Nils G...",,37971305,2023.0,,"Cistrome Data Browser: integrated search, anal..."
145406,"Sheng-You Liao, Dmytro Rudoy, Sander B Frank, ...",,37973913,2023.0,,SND1 binds to ERG and promotes tumor growth in...


In [31]:
df_combined[df_combined['citation_count'].isna()]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
20583,"J Yao, S Xiong, K Klos, N Nguyen, R Grijalva, ...",,11781819,2001.0,,Multiple signaling pathways involved in activa...
24700,"Yago Nieto, Roy B Jones, Elizabeth J Shpall",,15368078,2004.0,,Stem-cell transplantation for the treatment of...
39047,"Jiang Gui, Angeline S Andrew, Peter Andrews, H...",,20924193,2010.0,,A simple and computationally efficient samplin...
52993,"Xiao-Jian Sun, Zhanxin Wang, Lan Wang, Yanwen ...",,23812588,2013.0,,A stable transcription factor complex nucleate...
53902,"Jasmine H Francis, Christopher A Barker, Suzan...",,23953635,2013.0,,Salvage/adjuvant brachytherapy after ophthalmi...
...,...,...,...,...,...,...
145403,"Madeline B Searcy, Randolph K Larsen, Bradley ...",,37968277,2023.0,,PAX3-FOXO1 dictates myogenic reprogramming and...
145404,"Alexxandra J Hoffmann, Amy L Tin, Andrew J Vic...",,37971219,2023.0,,Cognitive function and ability to complete a w...
145405,"Len Taing, Ariaki Dandawate, Sehi L'Yi, Nils G...",,37971305,2023.0,,"Cistrome Data Browser: integrated search, anal..."
145406,"Sheng-You Liao, Dmytro Rudoy, Sander B Frank, ...",,37973913,2023.0,,SND1 binds to ERG and promotes tumor growth in...


In [34]:
# Get rows in combined publication data where ANY values are NaN
df_combined[df_combined[['title', 'authors', 'publication_year', 'citation_count','relative_citation_ratio']].isna().any(axis=1)]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
20583,"J Yao, S Xiong, K Klos, N Nguyen, R Grijalva, ...",,11781819,2001.0,,Multiple signaling pathways involved in activa...
24700,"Yago Nieto, Roy B Jones, Elizabeth J Shpall",,15368078,2004.0,,Stem-cell transplantation for the treatment of...
39047,"Jiang Gui, Angeline S Andrew, Peter Andrews, H...",,20924193,2010.0,,A simple and computationally efficient samplin...
52993,"Xiao-Jian Sun, Zhanxin Wang, Lan Wang, Yanwen ...",,23812588,2013.0,,A stable transcription factor complex nucleate...
53902,"Jasmine H Francis, Christopher A Barker, Suzan...",,23953635,2013.0,,Salvage/adjuvant brachytherapy after ophthalmi...
...,...,...,...,...,...,...
145403,"Madeline B Searcy, Randolph K Larsen, Bradley ...",,37968277,2023.0,,PAX3-FOXO1 dictates myogenic reprogramming and...
145404,"Alexxandra J Hoffmann, Amy L Tin, Andrew J Vic...",,37971219,2023.0,,Cognitive function and ability to complete a w...
145405,"Len Taing, Ariaki Dandawate, Sehi L'Yi, Nils G...",,37971305,2023.0,,"Cistrome Data Browser: integrated search, anal..."
145406,"Sheng-You Liao, Dmytro Rudoy, Sander B Frank, ...",,37973913,2023.0,,SND1 binds to ERG and promotes tumor growth in...


In [37]:
# Get rows in combined publication data where ALL values are NaN
df_combined[df_combined[['title', 'authors', 'publication_year', 'citation_count','relative_citation_ratio']].isna().all(axis=1)]

Unnamed: 0,authors,citation_count,pmid,publication_year,relative_citation_ratio,title
