In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [7]:
# Example: count the first-authored journal papers of Alexander Jung in the period 2015-2019

# Year range used to filter the publications (both bounds are compared inclusively below).
lower = 2015
upper = 2019

# DBLP person page of the example author.
url = "https://dblp.uni-trier.de/pers/hd/j/Jung:Alexander"

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()
response = page.text  # raw HTML of the person page
soup = BeautifulSoup(response, 'lxml')

In [9]:
# Find HTML elements: journal, first_author, year

publications = soup.find_all('li', class_='entry article toc')

count = 0
for pub in publications:
    # The 'nr' box holds the entry label; only labels containing "[j" are kept.
    div = pub.find('div', class_='nr')
    if div is None or '[j' not in div.text:
        continue
    # find() returns None when the element is missing, so guard before reading .text.
    year_tag = pub.find('span', itemprop='datePublished')
    if year_tag is None:
        continue
    year = int(year_tag.text)
    if not (lower <= year <= upper):
        continue
    # find() yields the first author span; it must wrap the "this-person" marker
    # for the paper to count as first-authored by the page owner.
    first_author = pub.find('span', itemprop="author")
    if first_author is None or first_author.find('span', class_="this-person") is None:
        continue
    print(div.text, year)
    count += 1
print(count)

[j12] 2019
[j10] 2019
[j8] 2016
[j7] 2015
[j6] 2015
5


In [11]:
# Get names stored in a text file
# This is an optional function in case you want to store the researchers' names in a text file
def get_name(file_name):
    """
    Get researchers' names from a text file containing one name per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so a trailing newline or an empty separator line never yields an
    empty name.

    :param file_name: name of the file storing researchers' names
    :return: list of names, in file order
    """

    # Decode as UTF-8 explicitly so names with non-ASCII characters are read the
    # same way on every platform instead of depending on the locale default.
    with open(file_name, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [name for name in stripped_lines if name]

In [13]:
# Get content from website: "https://dblp.uni-trier.de"
def get_website_content(full_name, timeout=10):
    """
    Fetch and parse the DBLP person page of a researcher.

    :param full_name: full name of the applicant in the form
        "<First name> <Surname/Lastname>" (e.g "Alexander Jung"); every word
        before the last one is treated as part of the first name and joined
        with "_" in the URL
    :param timeout: seconds to wait for the server before giving up
    :return: BeautifulSoup of the page, or None when the name is blank or the
        person cannot be found (HTTP 404)
    :raises requests.HTTPError: for any HTTP error status other than 404
    """

    # split() without an argument tolerates repeated, leading and trailing whitespace
    names = full_name.split()
    if not names:
        return None

    last = names[-1]
    # A single-word name is used as both the first and the last name
    first = '_'.join(names[:-1]) or last

    url = f"https://dblp.uni-trier.de/pers/hd/{last[0].lower()}/{last}:{first}"

    # Fetch once and reuse the response for both the status check and the body
    response = requests.get(url, timeout=timeout)

    # Person cannot be found
    if response.status_code == 404:
        return None

    # Any other HTTP error must be visible to the caller; parsing the error page
    # would otherwise be indistinguishable from "no matching papers"
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

In [14]:
def count_journal(soup, lower, upper):
    """
    Return the number of first-authored journal papers within a time period.

    An entry is counted when its 'nr' label contains "[j", its publication year
    lies in [lower, upper], and its first listed author span contains the
    "this-person" marker. Entries missing any of these elements are skipped.

    :param soup: BeautifulSoup built from the requested person page
    :param lower: start year (inclusive)
    :param upper: end year (inclusive)
    :return: number of matching journal papers
    """

    count = 0
    for pub in soup.find_all('li', class_='entry article toc'):
        # find() returns None for a missing element, so guard before reading .text
        label = pub.find('div', class_='nr')
        if label is None or '[j' not in label.text:
            continue

        year_tag = pub.find('span', itemprop='datePublished')
        if year_tag is None:
            continue
        year = int(year_tag.text)
        if not (lower <= year <= upper):
            continue

        # find() yields the first author span of the entry
        first_author = pub.find('span', itemprop="author")
        if first_author is None or first_author.find('span', class_="this-person") is None:
            continue
        count += 1

    return count

In [15]:
# name_list = get_name("applicant_list.txt")  # Use this line instead when the names are stored in a text file
name_list = ["Johan Waara", "Terje Aaberge", "Gabor Hannak", "Alexander Jung", "Minh Thanh Vu", "Duy Vu"]

# Period (in years) over which the papers are counted
start_period = 2001
end_period = 2020

# One count per researcher, in the same order as name_list;
# a researcher whose page could not be fetched gets a count of 0
num_journal = []
for name in name_list:
    soup = get_website_content(name)
    papers = count_journal(soup, start_period, end_period) if soup else 0
    num_journal.append(papers)

columns = {
    "Researcher name": name_list,
    "Number of first-authored journal papers": num_journal,
}
df = pd.DataFrame(columns)
df

Unnamed: 0,Researcher name,Number of first-authored journal papers
0,Johan Waara,0
1,Terje Aaberge,2
2,Gabor Hannak,1
3,Alexander Jung,9
4,Minh Thanh Vu,1
5,Duy Vu,2


In [16]:
# Align the dataframe content (cells and header cells) to the left
header_left = {'selector': 'th', 'props': [('text-align', 'left')]}
df_left_align = (
    df.style
    .set_properties(**{'text-align': 'left'})
    .set_table_styles([header_left])
)
df_left_align

Unnamed: 0,Researcher name,Number of first-authored journal papers
0,Johan Waara,0
1,Terje Aaberge,2
2,Gabor Hannak,1
3,Alexander Jung,9
4,Minh Thanh Vu,1
5,Duy Vu,2
