#### DBLP Publication Key Data Spreadsheet Conversion

We will be parsing the DBLP XML file for research publication records and extracting the title, conference, year published, and author tags
With each publication, the script will determine the following:
1. The author(s) affiliations
2. The country in which the affiliated institutions are located (abbreviated)
3. The geographic region in which the affiliated institutions are located (ie. asia, europe, north america)

A final csv including the author, conference, date, school and location details of each article will be created.

In [1]:
# Libraries and file paths

# Import libs
from lxml import etree
import pandas as pd
from tqdm import tqdm

# Input XML file path, change to your own path
dblp_data = 'C:/Eric/Projects/datasets/uniranking_data/dblp-2024-05-02.xml'

# DTD file path
dtd_path = 'C:/Eric/Projects/datasets/uniranking_data/dblp.dtd'

# Reference data
locations = pd.read_csv('C:/Eric/Projects/AI_Rankings/data/country-info.csv')
csr = pd.read_csv('C:/Eric/Projects/AI_Rankings/data/csrankings.csv')


In [2]:
# This is the set of conferences used by CSRankings
conf_set = {'aaai','acl','asplos','ase',
 'bioinformatics',
 'cav','ccs','ccs2','cgf','chi','corr','crypto','cscw','csl','cvpr',
 'dac',
 'ec','eccv','emnlp','emsoft','entcs','eurocrypt','eurosys',
 'fast','focs',
 'hpca','hpdc','huc',
 'iccad','iccad2','iccv','icde','icfp','iclr','icml','icra','ics','icse','ijcai','ieeessp',
 'imc','imc2','imwut','innovations','intcompsymp','ipps','iros','isca','ismb','issta',
 'kbse','kdd',
 'lics',
 'micro','mobicom','mobisys',
 'naacl','ndss','nips','nsdi','neurips',
 'oopsla','osdi',
 'pacmmod','pacmpl','pervasive','pet','pldi','podc','pods','pomacs','popets','popl','pvldb',
 'recomb','rss','rtas','rtss',
 'sc','sensys','sigcomm','sigcse','sigecom','siggraph','sigir',
 'sigmetrics','sigmod','sigsoft','soda','sosp','sp','spaa','stoc',
 'tcad','tecs','tog','tvcg',
 'uist','usenix','uss',
 'visualization','vldb','vr',
 'wdag','wine','wsdm','www'}

In [5]:
# Parse the XML file

def str_diff(page_num:str):
    try:
        p = page_num.split('-') #if pages in the format 'xxx-xxx'
        try:                    #if pages in the format 'xxx:xxx'
            p[0] = p[0].split(':')[0]
            p[1] = p[1].split(':')[0]
        except:
            pass
        n = int(p[1]) - int(p[0])
    except: #if pages in the format 'xxx'
        n = 1
    return n

# Initialize a DataFrame
cols = ['Title','Conference','Year','Authors']
df = pd.DataFrame(columns=cols)
rows = []

num_papers = 0
# Parse the XML file elementwise
context = etree.iterparse(dblp_data, events=('end',), tag='inproceedings', dtd_validation=True, load_dtd=True, huge_tree=True)
for event, paper in context:
    try:
        conf = paper.attrib['key'].split('/')[1]
        page_num = paper.find("pages").text
        pages = str_diff(page_num) if '-' in page_num else 1

        if conf not in conf_set:
        #if conf not in conf_set or pages < 6: # if we want to specify a minimum number of pages
            continue
        
        num_papers += 1
        title = paper.find("title").text
        year = paper.find("year").text
        authors = [author.text for author in paper.findall("author")]
        row = {'Title': title, 
            'Conference': conf, 
            'Year': year,
            'Authors': authors}
        rows.append(row)

    except:
        pass

    # Free up memory
    paper.clear()
    while paper.getprevious() is not None:
        del paper.getparent()[0]

del context

print("Finished parsing.")
print(f"Found {num_papers} papers.")

# Concatenate the rows to the DataFrame
rows_df = pd.DataFrame(rows)
publications = pd.concat([df, rows_df], ignore_index=True)

Finished parsing.
Found 411568 papers.


In [4]:
# gather affiliation and location info from author data
inst_list = []
country_list = []
region_list = []
# find country of list of institutions   
def find_inst_country(country_info:pd.DataFrame,inst_list:list):
    countries = []
    for inst in inst_list:
        country = country_info[country_info['institution'] == inst]['countryabbrv']
        if not country.empty:
            countries.append(country.iloc[0])
        else:
            countries.append('us')
    return set(countries)

# find multiple author affiliation
def find_author_affil(authors:list,csr:pd.DataFrame):
    affils = []
    for author in authors:
        try:
            inst = csr[csr['name'] == author]['affiliation']
            affils.append(inst.iloc[0])
        except:
            continue
    return set(affils)

# find region of list of institutions   
def find_country_region(country_info:pd.DataFrame,country_list:list):
    regs = []
    for c in country_list:
        reg = country_info[country_info['countryabbrv'] == c]['region']
        if not reg.empty:
            regs.append(reg.iloc[0])
        else:
            regs.append('us')
    return set(regs)

# map location data of institutions
for _, row in tqdm(publications.iterrows(), total=publications.shape[0]):
    authors = row['Authors']

    affils = find_author_affil(authors,csr)
    inst_list.append(affils)

    countries = find_inst_country(locations,affils)
    country_list.append(countries)

    region = find_country_region(locations,countries)
    region_list.append(region)

# Add the affiliation and location info to the DataFrame
publications['Affiliations'] = inst_list
publications['Countries'] = country_list
publications['Region'] = region_list

# Since we are only interested in publications authored by faculty from academic institutions,
# we will filter out publications by authors with no affiliated institutions at the moment (corporate researchers, etc.)
result = publications[publications['Affiliations'].apply(lambda x: len(x) > 0)]

print('Remaining publications:', len(result))

100%|██████████| 411035/411035 [2:20:46<00:00, 48.66it/s]     


Remaining publications: 226718


In [5]:
# spreadsheet formatting
def replace_set(s, country, region):
    if country in s:
        s.remove(country)
        s.add(region)
    return s

def join_authors(authors):
    return ', '.join(authors) if isinstance(authors, list) else authors

# the default region labling by csranking lists us and canada individually
# we will group them together into northamerica for consistency
if any('us' in s for s in result['Region']):
    #result['Region'] = result['Region'].apply(ast.literal_eval)
    result['Region'] = result['Region'].apply(lambda s: replace_set(s, 'us', 'northamerica'))
    result['Region'] = result['Region'].apply(lambda s: replace_set(s, 'canada', 'northamerica'))

# only for formatting purposes we will remove the [] and {} from the sets
# by joining these sets as one concatenated string
result['Affiliations'] = result['Affiliations'].apply(', '.join)
result['Countries'] = result['Countries'].apply(', '.join)
result['Region'] = result['Region'].apply(', '.join)
result['Authors'] = result['Authors'].apply(join_authors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Region'] = result['Region'].apply(lambda s: replace_set(s, 'us', 'northamerica'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Region'] = result['Region'].apply(lambda s: replace_set(s, 'canada', 'northamerica'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Affiliations'

In [6]:
# save as csv
filename = "DBLP_publications.csv"
result.to_csv(filename,index=False)

# check memory usage
total_memory_usage = result.memory_usage(deep=True).sum()
print("Total memory usage:", total_memory_usage/1000000, "MB")

Total memory usage: 121.13115 MB
