#### DBLP Publication Key Data Spreadsheet Conversion

We will be parsing the DBLP XML file for research publication records and extracting the title, conference, year published, and author tags
With each publication, the script will determine the following:
1. The author(s) affiliations
2. The country in which the affiliated institutions are located (abbreviated)
3. The geographic region in which the affiliated institutions are located (ie. asia, europe, north america)

A final csv including the author, conference, date, school and location details of each article will be created.

In [2]:
# preprocessing
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm
import ast

In [3]:
# Parse DBLP XML file
dblp_data = 'C:/Eric/Projects/datasets/uniranking_data/dblp2.xml'
tree = ET.parse(dblp_data)
root = tree.getroot()

# Load reference data
locations = pd.read_csv('C:/Eric/Projects/AI_Rankings/data/country-info.csv')
csr = pd.read_csv('C:/Eric/Projects/AI_Rankings/data/csrankings.csv')

#### 

In [12]:
# Unpack tile, conference, year, and authors from data
def str_diff(page_num:str):
    try:
        n = page_num.split('-')
        try:
            n[0] = n[0].split(':')[0]
            n[1] = n[1].split(':')[0]
        except:
            pass
        pg = int(n[1]) - int(n[0])
    except:
        pg = 1

    return pg

def unpack(root):
    cols = ['Title','Conference','Year','Authors']
    df = pd.DataFrame(columns=cols)
    rows = []
    pages = []

    for p in root:
        author_list = []
        for author in p.findall('.//author'):
            author_list.append(author.text)

        try:
            page_num = p.find('pages').text
            pages.append(str_diff(page_num))
        except AttributeError:
            pages.append(0)


        row = {'Title': p.find('title').text, 
            'Conference': p.attrib['key'].split('/')[1], 
            'Year': int(p.find('year').text),
            'Authors': author_list,}
        rows.append(row)

    rows_df = pd.DataFrame(rows)
    df = pd.concat([df, rows_df], ignore_index=True)
    return df, pages

publications,pg = unpack(root)

# gather affiliation and location info from author data
inst_list = []
country_list = []
region_list = []

In [14]:
print(len([num for num in pg if num < 6]))

132704


In [5]:
# find country of list of institutions   
def find_inst_country(country_info:pd.DataFrame,inst_list:list):
    countries = []
    for inst in inst_list:
        country = country_info[country_info['institution'] == inst]['countryabbrv']
        if not country.empty:
            countries.append(country.iloc[0])
        else:
            countries.append('us')
    return set(countries)

# find multiple author affiliation
def find_author_affil(authors:list,csr:pd.DataFrame):
    affils = []
    for author in authors:
        try:
            inst = csr[csr['name'] == author]['affiliation']
            affils.append(inst.iloc[0])
        except:
            continue
    return set(affils)

# find region of list of institutions   
def find_country_region(country_info:pd.DataFrame,country_list:list):
    regs = []
    for c in country_list:
        reg = country_info[country_info['countryabbrv'] == c]['region']
        if not reg.empty:
            regs.append(reg.iloc[0])
        else:
            regs.append('us')
    return set(regs)

# map location data of institutions
for _, row in tqdm(publications.iterrows(), total=publications.shape[0]):
    authors = row['Authors']

    affils = find_author_affil(authors,csr)
    inst_list.append(affils)

    countries = find_inst_country(locations,affils)
    country_list.append(countries)

    region = find_country_region(locations,countries)
    region_list.append(region)

publications['Affiliations'] = inst_list
publications['Countries'] = country_list
publications['Region'] = region_list

# Since we are only interested in publications authored by faculty from academic institutions,
# we will filter out publications by authors with no affiliated institutions at the moment (corporate researchers, etc.)
result = publications[publications['Affiliations'].apply(lambda x: len(x) > 0)]

print('Remaining publications:', len(result))

100%|██████████| 371832/371832 [39:59<00:00, 154.95it/s] 


In [None]:
# set data editor
def replace_set(s, country, region):
    if country in s:
        s.remove(country)
        s.add(region)
    return s

def join_authors(authors):
    return ', '.join(authors) if isinstance(authors, list) else authors

# the default region labling by csranking lists us and canada individually
# we will group them together into northamerica for consistency
if any('us' in s for s in result['Region']):
    #result['Region'] = result['Region'].apply(ast.literal_eval)
    result['Region'] = result['Region'].apply(lambda s: replace_set(s, 'us', 'northamerica'))
    result['Region'] = result['Region'].apply(lambda s: replace_set(s, 'canada', 'northamerica'))

# only for formatting purposes we will remove the [] and {} from the sets
# by joining these sets as one concatenated string
result['Affiliations'] = result['Affiliations'].apply(', '.join)
result['Countries'] = result['Countries'].apply(', '.join)
result['Region'] = result['Region'].apply(', '.join)
result['Authors'] = result['Authors'].apply(join_authors)

#### Save as Excel Spreadsheet

In [75]:
# save as csv
filename = "DBLP_publications.csv"
result.to_csv(filename,index=False)

# check memory usage
total_memory_usage = result.memory_usage(deep=True).sum()
print("Total memory usage:", total_memory_usage/1000000, "MB")

Total memory usage: 111.91538 MB
