In [14]:
import inspect
import json
import os
import sys
import time

import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils import get_column_letter

# Directory that contains the local pub_worm checkout. It can be overridden
# with the PUB_WORM_DIR environment variable; the default is the original
# hard-coded location, so existing setups keep working unchanged.
pub_worm_dir = os.environ.get("PUB_WORM_DIR", "/Users/dan/Code/Python/pub_worm")

# Add the directory to the Python path only once, so re-running this cell
# does not keep appending duplicate entries to sys.path.
if pub_worm_dir not in sys.path:
    sys.path.append(pub_worm_dir)

from pub_worm.ncbi.entreze_api import EntrezAPI

# Find where EntrezAPI is being loaded from
module = inspect.getmodule(EntrezAPI)
if hasattr(module, "__file__"):
    file_path = module.__file__
    print("EntrezAPI imported from:", file_path)
else:
    print("Could not determine the file path.")

EntrezAPI imported from: /Users/dan/Code/Python/pub_worm/pub_worm/ncbi/entreze_api.py


In [2]:
# Publication-date window applied to every query below. Change it here to
# re-run all searches over a different period.
PUB_DATE_RANGE = "(2019/01/01:2024/04/16[pdat])"

# Maps a speaker label to the search term used for that speaker.
# The labels are also used as worksheet names when the results are exported.
# NOTE: apart from the shared date range, each query's text (including the
# irregular spacing around AND) is kept exactly as originally written; only
# the unbalanced extra ")" in the Chris_Burd query has been removed.
speakers = {
    "Charlie_Serhan"     : f"Charles N Serhan[au] AND {PUB_DATE_RANGE} AND (harvard[affil])",
    "Lawrence_Marnett"   : f"Marnett L[au] AND {PUB_DATE_RANGE}AND (vanderbilt[affil])",
    "Sarah_Fendt"        : f"Fendt S[au] AND {PUB_DATE_RANGE}",
    "Meng_Wang"          : f"Meng C Wang[au] AND {PUB_DATE_RANGE} AND (janelia[affil] OR Baylor[affil])",
    "Jenny_Watts"        : f"Jennifer L Watts [au] AND {PUB_DATE_RANGE}",
    "Valerie_Kagan"      : f"Kagan VE[au] AND {PUB_DATE_RANGE}AND (pitt[affil])",
    "Shirin_Bahmanyar"   : f"Shirin Bahmanyar[au] AND {PUB_DATE_RANGE}",
    "Arun_Radhakrishnan" : f"Arun Radhakrishnan[au] AND University of Texas Southwestern[affil] AND{PUB_DATE_RANGE}",
    "Todd_Graham"        : f"Graham TR[au] AND {PUB_DATE_RANGE} AND (vanderbilt[affil])",
    "Chris_Burd"         : f"Burd CG[au] AND {PUB_DATE_RANGE} AND (yale[affil])",
    "Jeeyun_Chung"       : f"Jeeyun Chung[au] AND {PUB_DATE_RANGE} AND (harvard[affil])",
    "Hanaa_Hariri"       : f"Hanaa Hariri[au] AND {PUB_DATE_RANGE}",
    "Fikadu_Tafesse"     : f"Tafesse FG[au] AND {PUB_DATE_RANGE}",
    "Roberto_Zoncu"      : f"Zoncu R[au] AND {PUB_DATE_RANGE}",
    "Kuang_Shen"         : f"Kuang Shen[au] AND {PUB_DATE_RANGE}",
    "Alison_Ondrus"      : f"Alison E Ondrus[au] AND {PUB_DATE_RANGE}",
    "Squire_Booker"      : f"Squire Booker[au] AND {PUB_DATE_RANGE}",
    "Randolph_Hampton"   : f"Randolph Hampton[au] AND {PUB_DATE_RANGE}",
    "Anne_Spang"         : f"Anne Spang[au] AND {PUB_DATE_RANGE}",
    "Yasunori_Saheki"    : f"Yasunori saheki[au] AND {PUB_DATE_RANGE}",
    "Michael_Schlame"    : f"Michael Schlame[au] AND {PUB_DATE_RANGE}",
    "Brittany_White"     : f"Brittany M White[au] AND {PUB_DATE_RANGE}",
    "Scott_Hansen"       : f"Scott D Hansen[au] AND {PUB_DATE_RANGE}",
    "Andre_Nadler"       : f"Andre Nadler[au] AND {PUB_DATE_RANGE}",
    "Jeffrey_Spraggins"  : f"Jeffrey Spraggins[au] AND {PUB_DATE_RANGE}",
    "Theodore_Alexandrov": f"Theodore Alexandrov[au] AND {PUB_DATE_RANGE}AND (EMBL[affil])",
    "Paula_Welander"     : f"Paula Welander[au] AND {PUB_DATE_RANGE}",
    "Han_Remaut"         : f"Han Remaut[au] AND {PUB_DATE_RANGE}",
    "Alessio_Accardi"    : f"Alessio Accardi[au] AND {PUB_DATE_RANGE}",
}


In [3]:
def get_papers_for_speaker(search_term):
    """Run one search through EntrezAPI and return the fetched records.

    Parameters
    ----------
    search_term : value sent under the 'term' key of the esearch parameters.

    Returns
    -------
    The result of ``entreze_efetch`` when the esearch response contains a
    'WebEnv' key; otherwise an empty list (after printing an error line).
    """
    api_client = EntrezAPI()
    search_response = api_client.entreze_esearch({'term': search_term})

    # Guard clause: without a 'WebEnv' entry there is nothing to fetch.
    if 'WebEnv' not in search_response:
        print("ERROR: entreze_esearch failed")
        return []

    return api_client.entreze_efetch(search_response)


In [17]:
# Pause between speakers so consecutive requests are spaced out; a previous
# run of this cell logged a "429 Too Many Requests" response from the server.
REQUEST_DELAY_SECONDS = 1.0

# Build one DataFrame of papers per speaker, keyed by the speaker label.
papers_for_speakers = {}
for speaker_index, (speaker, search_term) in enumerate(speakers.items()):
    if speaker_index > 0:
        time.sleep(REQUEST_DELAY_SECONDS)

    papers_for_speaker = get_papers_for_speaker(search_term)
    papers_for_speaker_df = pd.DataFrame(papers_for_speaker)

    if papers_for_speaker_df.empty:
        # A failed or empty search produces a frame with no columns, so the
        # 'pmid' / sort steps below would raise KeyError. Keep the empty
        # frame so the speaker still gets an (empty) entry, and say so.
        print(f"WARNING: no papers returned for {speaker}")
    else:
        # Turn the bare PubMed id into a clickable article URL.
        papers_for_speaker_df['pmid'] = (
            'https://pubmed.ncbi.nlm.nih.gov/' + papers_for_speaker_df['pmid'].astype(str)
        )
        # Highest impact factor first, newest publication first within ties.
        papers_for_speaker_df = papers_for_speaker_df.sort_values(
            by=['impact_factor', 'pub_year'], ascending=[False, False]
        )

    papers_for_speakers[speaker] = papers_for_speaker_df
    print(speaker)

Charlie_Serhan
Lawrence_Marnett
Sarah_Fendt
Meng_Wang
Jenny_Watts
Valerie_Kagan
Shirin_Bahmanyar
Arun_Radhakrishnan
Todd_Graham
Chris_Burd
Jeeyun_Chung
Hanaa_Hariri
Fikadu_Tafesse
Roberto_Zoncu
Kuang_Shen
Alison_Ondrus
Squire_Booker
Randolph_Hampton
Anne_Spang
Yasunori_Saheki
Michael_Schlame
Brittany_White
Check the format of the http request [Retry: 1] code: 429 Client Error: Too Many Requests for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_66245a3ddd5f115f9b628af1&retmode=xml
Scott_Hansen
Andre_Nadler
Jeffrey_Spraggins
Theodore_Alexandrov
Paula_Welander
Han_Remaut
Alessio_Accardi


In [5]:
# TEST ONE
# search_term = "Charles N Serhan[au] AND (2019/01/01:2024/04/16[pdat]) AND (harvard[affil])"
# papers_for_speaker = get_papers_for_speaker(search_term)
# papers_for_speaker

In [18]:
def autofit_columns(worksheet, max_width=150, padding=2):
    """Set each column's width from the longest cell value it contains.

    Parameters
    ----------
    worksheet : object exposing ``columns`` (an iterable of cell sequences)
        and ``column_dimensions`` (indexable by column letter), as the
        worksheets passed in from the Excel writer do.
    max_width : int, default 150
        Upper bound applied to the measured content length before padding.
    padding : int, default 2
        Extra width added after the cap so text does not touch the border.

    Notes
    -----
    Every value is measured through ``str()`` so numbers and other
    non-string values contribute to the width; empty cells (``None``)
    count as zero length.
    """
    for column_cells in worksheet.columns:
        column_cells = list(column_cells)
        if not column_cells:
            continue

        longest = max(
            0 if cell.value is None else len(str(cell.value))
            for cell in column_cells
        )
        column_letter = get_column_letter(column_cells[0].column)
        worksheet.column_dimensions[column_letter].width = min(longest, max_width) + padding


In [19]:
output_dir = "./output"
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, replacing the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)

excel_file_path = os.path.join(output_dir, "articles.xlsx")

# Delete any workbook left over from a previous run before writing a new one.
if os.path.exists(excel_file_path):
    os.remove(excel_file_path)

# Create a new, standalone workbook and look up its default "Sheet".
# NOTE(review): this workbook is not passed to the Excel writer and the
# default sheet is not removed, so neither affects the exported file; both
# names are kept in case later cells reference them.
workbook = Workbook()
default_sheet = workbook['Sheet']

# Write one worksheet per speaker, ordered alphabetically by speaker label.
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    for speaker, df in sorted(papers_for_speakers.items()):
        df.to_excel(writer, sheet_name=speaker, index=False)

    # Autofit the column widths of every sheet held by the writer
    for sheet in writer.sheets.values():
        autofit_columns(sheet)