In [None]:
import os
import pickle
import time
import xml
import xml.etree.ElementTree
from time import sleep

import eutils
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

### Data

In [None]:
def check_ortho_terms(word, terms):
    """Return True if any entry of `terms` is a case-insensitive substring of `word`.

    Returns False when `terms` is empty or nothing matches.
    """
    return any(term.lower() in word.lower() for term in terms)

# Substrings that mark a journal name as orthopedic when matched case-insensitively
# by check_ortho_terms. 'musculo' (previously misspelled 'musclulo') matches
# names such as "musculoskeletal".
ortho_terms = ['spine', 'ortho', 'knee', 'foot', 'musculo', 'bone']

In [None]:
# Build a lookup from the lowercased first tab-separated column of medical.txt
# to the list of remaining columns on that line.
abrv_dict = {}
# The context manager closes the file even if parsing raises.
with open("medical.txt", "r", encoding="utf8") as f:
    for line in f:
        line = line.strip().split('\t')
        abrv_dict[line[0].lower()] = line[1:]

In [None]:
# Load the provider roster and the pickled journal lists, then collect every
# abbreviation that belongs to an orthopedic journal into `ortho_journals`.
names = pd.read_excel('ortho_providers.xlsx')
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('bad_ortho.pkl', 'rb') as fh:
    bad_ortho = pickle.load(fh)
with open('ortho_journals_full.pkl', 'rb') as fh:
    ortho_journals_full = pickle.load(fh)

ortho_journals = set()
bad_count = 0   # names in bad_ortho with no entry in abrv_dict
good_count = 0  # names in bad_ortho resolved through abrv_dict
for x in bad_ortho:
    key = x.lower()
    if key in abrv_dict:
        ortho_journals.update(abrv_dict[key])
        good_count += 1
    else:
        bad_count += 1

# Also include every abrv_dict entry whose key contains an orthopedic term.
for x in abrv_dict:
    if check_ortho_terms(x, ortho_terms):
        ortho_journals.update(abrv_dict[x])

print(bad_count)
print(good_count)

print(len(ortho_journals))

### Eutils

In [None]:
# The eutils API key is read from the NCBI_API_KEY environment variable so it is
# never committed with the notebook. When the variable is unset, None is passed.
# NOTE(review): presumably the client then runs unauthenticated — confirm against the eutils docs.
ec = eutils.Client(api_key=os.environ.get('NCBI_API_KEY'))

In [None]:
def esearch_query(payload, retmax = 100000, sleep=0.34):
    """
    Return identifiers using the ESearch E-utility.

    Parameters:
        payload: dict of ESearch query parameters (e.g. 'db', 'term'). It is
            copied, so the caller's dict is never modified.
        retmax: page size requested per call; 'retstart' advances by this amount.
        sleep: seconds to pause after each request.

    Returns:
        list of identifier strings gathered across all pages.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the response carries no <Count> element.

    NOTE(review): NCBI documents a 10,000-record cap for PubMed ESearch; confirm
    that the default retmax pages correctly for very common names.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # Work on a copy so paging state does not leak into the caller's dict.
    params = dict(payload)
    params['rettype'] = 'xml'
    params['retmax'] = retmax
    params['retstart'] = 0
    ids = []
    count = 1  # placeholder so the first request is always made
    while params['retstart'] < count:
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        tree = xml.etree.ElementTree.fromstring(response.text)
        count_text = tree.findtext('Count')
        if count_text is None:
            # Without this check int(None) would raise an opaque TypeError.
            raise ValueError(
                'ESearch response has no <Count> element: ' + response.text[:200]
            )
        count = int(count_text)
        ids.extend(id_.text for id_ in tree.findall('IdList/Id'))
        params['retstart'] += retmax
        time.sleep(sleep)
    return ids

In [None]:
# Breaks up a list l into sublists of size n
def divide_chunks(l, n):
    """Yield consecutive slices of `l`, each holding at most `n` items.

    The final slice is shorter when len(l) is not a multiple of n.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]

In [None]:
# Given a list of PMIDs and ortho journals, cross-references which of these PMIDs were published in orthopedic journals.
def get_ortho_pubs(pubmed_ids, ortho_journals):
    """Count the publications in `pubmed_ids` that appeared in orthopedic journals.

    A publication counts as orthopedic when its journal abbreviation is in
    `ortho_journals` (compared case-insensitively) or contains one of the
    module-level `ortho_terms`.

    Parameters:
        pubmed_ids: sized iterable of PubMed identifiers.
        ortho_journals: iterable of journal abbreviation strings.

    Returns:
        (count, authors): `count` is the number of orthopedic publications.
        `authors` holds one author list per orthopedic publication. When no
        publication is orthopedic, `authors` instead holds one author list per
        fetched publication, each exactly once.
    """
    ortho_count = 0
    ortho_authors = []
    all_authors = []
    if len(pubmed_ids) == 0:
        return ortho_count, ortho_authors

    # The fetched journal name is lowercased, so lowercase the reference set too.
    known_journals = {journal.lower() for journal in ortho_journals}
    # Records are fetched in chunks of 250 ids per efetch call.
    for id_chunk in divide_chunks(list(pubmed_ids), 250):
        for pa in ec.efetch(db='pubmed', id=id_chunk):
            # Keep every author list so the fallback below needs no second fetch.
            all_authors.append(pa.authors)
            journal = (pa.jrnl or '').lower()  # tolerate records without a journal
            if journal in known_journals or check_ortho_terms(journal, ortho_terms):
                ortho_count += 1
                ortho_authors.append(pa.authors)

    if not ortho_authors:
        return ortho_count, all_authors
    return ortho_count, ortho_authors

In [None]:
# For an author search name, get the matching PMIDs, keep the orthopedic ones, and tally which author name spellings could match
def author_check(author_last, ortho_journals):
    """Tally the author-name spellings on matching publications that contain `author_last`.

    Searches PubMed for `author_last`, passes the resulting PMIDs through
    get_ortho_pubs, and counts every listed author whose lowercased name
    contains `author_last` as a substring.

    Returns:
        dict mapping each lowercased matching author name to its occurrence count.
    """
    query = {'db': 'pubmed', 'term': author_last}
    matching_ids = set(esearch_query(query))
    _, author_groups = get_ortho_pubs(matching_ids, ortho_journals)

    tally = {}
    for group in author_groups:
        for listed in group:
            lowered = listed.lower()
            if author_last in lowered:
                tally[lowered] = tally.get(lowered, 0) + 1
    return tally
        


In [None]:
# Finds the best fitting name in PubMed for a potential orthopedic author
def get_author_name(author):
    """Return the most frequent matching author spelling for `author`.

    Candidates come from author_check against the module-level
    `ortho_journals`. Ties go to the candidate encountered first; an empty
    string is returned when there are no candidates.
    """
    best_name = ''
    best_count = 0
    for candidate, hits in author_check(author, ortho_journals).items():
        if hits > best_count:
            best_name, best_count = candidate, hits
    return best_name
        

In [None]:
# Runs through the list of all authors under investigation and finds their best matching name.
author_best_names = {}  # search name -> best matching PubMed author name
names_list = []         # search names, in roster order
bad_authors = []        # search names whose lookup raised
start_name = {}         # lowercased roster name (text before ", ") -> search name
for _, row in names.iterrows():
    # Search name is "<last token> <first letter of first token>" of the text
    # before the first comma, lowercased.
    tokens = row.Name.split(",")[0].split(" ")
    name = tokens[-1].lower() + ' ' + tokens[0][0].lower()
    names_list.append(name)
    start_name[row.Name.split(", ")[0].lower()] = name

for name in tqdm(names_list):
    if name in author_best_names:
        continue
    try:
        search_name = get_author_name(name)
        sleep(1)
        author_best_names[name] = search_name
    except Exception:
        # Best effort: record the failure and carry on. Catching Exception rather
        # than using a bare except lets Ctrl-C still interrupt the run.
        bad_authors.append(name)
    # Checkpoint after every lookup so an interrupted run keeps its progress.
    with open('author_best_names.pkl', 'wb') as fh:
        pickle.dump(author_best_names, fh)
    with open('bad_authors.pkl', 'wb') as fh:
        pickle.dump(bad_authors, fh)

In [None]:
# For a list of full names of Journals in the field of Orthopedics, change the format to match
# the PubMed abbreviation.
ortho_abrvs = set()  # journal abbreviations as reported by PubMed
ortho_done = set()   # query names already attempted
# NOTE: this rebinds `bad_ortho`, replacing whatever was loaded from bad_ortho.pkl earlier.
bad_ortho = set()    # query names for which PubMed returned no ids

for x in tqdm(ortho_journals_full):
    # PubMed is queried without a trailing ', the'. The same stripped name is the
    # key in both bookkeeping sets, so the skip check below actually matches.
    query = x[:-5] if x[-5:] == ', the' else x
    if query in ortho_done and query not in bad_ortho:
        continue
    results = ec.esearch(db='pubmed', term=f'{query}[journal]')
    if len(results.ids) != 0:
        paset = ec.efetch(db='pubmed', id=results.ids)
        # Only the first returned article is used to read the abbreviation.
        for pa in paset:
            ortho_abrvs.add(pa.jrnl)
            ortho_done.add(query)
            bad_ortho.discard(query)
            break
    else:
        bad_ortho.add(query)
        ortho_done.add(query)

In [None]:
# Persist the intermediate results. Each file is opened in a context manager so
# the handle is flushed and closed right after the dump.
with open('ortho_abrvs.pkl', 'wb') as fh:
    pickle.dump(ortho_abrvs, fh)
with open('bad_ortho.pkl', 'wb') as fh:
    pickle.dump(bad_ortho, fh)
with open('author_best_names.pkl', 'wb') as fh:
    pickle.dump(author_best_names, fh)
# Inverse lookup: best PubMed name -> search name. When several search names
# share one best name, the last one iterated wins.
author_best_names_reverse = {author_best_names[x]:x for x in author_best_names}

In [38]:
# Inverse lookup: best PubMed name -> search name (the last search name wins on collisions).
author_best_names_reverse = {best: searched for searched, best in author_best_names.items()}

### iCite

In [None]:
# A basic URL request function to be used for iCite
def parse2(url):
    """Start a headless Chrome session, load `url`, and return the live driver.

    The driver binary is resolved through ChromeDriverManager on every call.
    The caller owns the returned driver.
    """
    chrome_flags = (
        '--ignore-certificate-errors',
        '--incognito',
        '--headless',
        '--log-level=3',
    )
    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )
    browser.get(url)
    return browser

In [None]:
# Journal abbreviations added by hand, one per line with blank separator lines.
manually_added_journals = '''Eur Spine J

Bone Res

Clin Spine Surg

J Orthop Surg

J Pediatr Orthop B

Bone Res

J Bone Joint Surg Am

J Spinal Disord Tech

J Orthop Res

Skeletal Radiol

J Orthop Trauma

Iowa Orthop J

Scoliosis Spinal Disorders

Int J Sports Phys Ther

J Shoulder Elbow Surg

Arthroscopy

J Bone Joint Surg Am

J Foot Ankle Surg Am

J Hand Surg Am

Orthop Nurs'''

# Keep the non-blank lines; this does not depend on strict line/blank alternation.
test = [line.strip() for line in manually_added_journals.splitlines() if line.strip()]
# Add the parsed journal names. Iterating the raw string would add single characters.
ortho_abrvs.update(test)

# Journals approved during manual review: drop rows with missing values, keep rows
# whose 'Include Journal (Y/N)' cell is exactly 'yes', and add their names.
manual_review_added_journals = list(
    pd.read_excel('manual_review_added_journals.xlsx')
    .dropna()
    .loc[lambda frame: frame['Include Journal (Y/N)'] == 'yes']
    .Journals.values
)
ortho_abrvs.update(manual_review_added_journals)

In [30]:
# Convert all names to a standardized format:
# "<last token> <first letter of first token>" of the text before the first comma, lowercased.
name_dic = {}
for _, row in names.iterrows():
    before_comma = row.Name.split(",")[0]
    tokens = before_comma.split(" ")
    standardized = tokens[-1].lower() + ' ' + tokens[0][0].lower()
    # Keyed by the lowercased roster name up to the first ", ".
    name_dic[row.Name.split(", ")[0].lower()] = standardized

In [31]:
def reverse_check(name):
    """Map a best-match PubMed name back to "<first token> <last token>" of its roster entry.

    Looks `name` up in author_best_names_reverse, uses the result as a key into
    name_dic, then lowercases the first and last space-separated tokens of the
    text before the first comma.

    NOTE(review): author_best_names_reverse appears to yield search names while
    name_dic is keyed by lowercased roster names — confirm the lookup cannot KeyError.
    """
    standardized = name_dic[author_best_names_reverse[name]]
    tokens = standardized.split(",")[0].split(" ")
    return f"{tokens[0].lower()} {tokens[-1].lower()}"


In [None]:
# From a query on iCite, returns which found results were in orthopedic journals.
def find_pmids(soup, ortho_abrvs):
    """Return the PMIDs embedded in the page's <script> tags whose journal is in `ortho_abrvs`.

    For every non-empty script, the second line of its text is taken, the first
    19 characters and the last character are dropped, and the remainder is split
    on ', ' into `"key": value` fields. 'pmid' and 'journalNameIso' values are
    collected in order of appearance and paired positionally.
    NOTE(review): presumably the dropped prefix/suffix wrap a JS assignment — confirm against the live page.

    Parameters:
        soup: parsed page exposing find_all('script'); each result needs `.text`.
        ortho_abrvs: container of accepted journal abbreviations.

    Returns:
        list of PMID strings whose paired journal name is in `ortho_abrvs`.
    """
    pmids = []
    journal_names = []
    for script in soup.find_all('script'):
        if script.text == '':
            continue
        lines = script.text.split('\n')
        if len(lines) < 2:
            # Single-line scripts have no second line to parse.
            continue
        for field in lines[1].strip()[19:-1].split(', '):
            parts = field.split(': ')
            if len(parts) < 2:
                continue
            key = parts[0].strip('"')
            if key == 'journalNameIso':
                journal_names.append(parts[1].strip('"'))
            elif key == 'pmid':
                pmids.append(parts[1].strip('"'))
    # zip pairs positionally and stops at the shorter list instead of raising.
    return [pmid for pmid, journal in zip(pmids, journal_names) if journal in ortho_abrvs]

In [None]:
# For each author, query them on iCite. Then cross-reference all publications returned with ortho journals.
# For those in ortho journals, run another iCite query with those PMIDs and store the citation values.

# CSS class names of the summary-table cells read from the results page.
data_points = ['total-pubs', 'pubs-per-year', 'cites-per-year-max', 'cites-per-year-mean', 'cites-per-year-sem', 'cites-per-year-med', 'rcr-max', 'rcr-mean', 'rcr-sem', 'rcr-med', 'wrcr']
author_name_to_values = {}

url = 'https://icite.od.nih.gov/analysis'
# Link used to navigate back to the empty analysis form.
analysis_link_xpath = f'//a[@href="{url}"]'
# The final driver is left open after the loop; call driver.quit() when finished.
driver = parse2(url)

for x in tqdm(name_dic):
    if x not in author_name_to_values:
        driver.find_element('id', 'pmid_query').send_keys(x)
        driver.find_element("id", 'process-btn').click()
        sleep(1)  # fixed pause before reading the page source
        # An explicit parser avoids bs4's guessed-parser warning and keeps
        # parsing independent of which optional parsers are installed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        wanted_pmids = find_pmids(soup, ortho_abrvs)

        if len(wanted_pmids) > 0:
            driver.find_element('xpath', analysis_link_xpath).click()
            pmids_string = ", ".join(wanted_pmids)
            driver.find_element('id', 'pmid_text').send_keys(pmids_string)
            driver.find_element("id", 'process-btn').click()
            sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            author_name_to_values[x] = {data: soup.find_all('td', {'class':data})[0].text for data in data_points}
            driver.find_element('xpath', analysis_link_xpath).click()
        else:
            # Start over with a fresh browser, quitting the old one first so
            # headless Chrome processes do not pile up.
            driver.quit()
            driver = parse2(url)
            author_name_to_values[x] = {data: 'N/A' for data in data_points}
        # Checkpoint after every author so an interrupted run keeps its progress.
        with open('author_name_to_values_5.pkl', 'wb') as fh:
            pickle.dump(author_name_to_values, fh)
