## Retrieve and filter papers from DBLP
The purpose of this notebook is to use DBLP to contain a list of refereed papers from the venues included on the CSmetrics website. Also included are some comparisons of several of different filtering methods.

In [51]:
import os, json, requests, csv, re
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

from paper_filtering import filter_by_header
from paper_filtering import filter_by_page_number_keep_missing
from paper_filtering import filter_by_page_number_remove_missing
from paper_filtering import filter_by_header_and_page_number_keep_missing
from paper_filtering import filter_by_header_and_page_number_remove_missing
from paper_filtering import filter_journals
from paper_filtering import apply_filter_to_papers

In [52]:
# filename templates for the data
dblp_raw_filename = lambda name,year: os.path.join( "DBLP_raw_data" , 
                                              "{}_{}_raw_dblp_papers.json".format(name,year) )

# filepath to plot comparing different filtering methods
filtering_count_comparison_fiename = os.path.join(os.pardir, "docs","filtering_count_comparison.png")

# filepath to the file containing venue names and their categories (conf or journal)
venue_category_filename = os.path.join(os.pardir,"app","data","venue_list.csv")

# filepath to the file containing a list of venues to be scraped using different keys
venues_with_different_keys_filename = "venues_with_different_dblp_keys.csv"

# filepath to the file containing a list of venues to be scraped using a different base url
venues_with_different_baseurls = "venues_with_different_dblp_baseurls.csv"

# filepath to the file containing a list of journals to be scraped using a different method
journals_to_be_scraped_using_alternative_method = "journals_to_use_alternative_scraping_method.csv"

# filepath to the file containing the publication and filtering count for the data set
pub_filtering_counts_filename = "PublicationFilteringCounts.csv"

#### Function for scraping papers for a given conference and year

In [37]:
# Simple function to check whether a beautiful soup element has a given attribute
def has_attribute(element, attr):
    try:
        element[attr]
        return True
    except:
        return False

def get_papers_from_page(url, year, params = {}):
    # DBLP url template for the website scraped

    content = requests.get(url).text

    soup = BeautifulSoup(content, 'html.parser')
    classnames = ["entry article", "entry inproceedings"]
    
    papers = {"noheader":[]}
    header = None

    # Iterate through the children of the main content area
    for element in soup.find(id="main").children:
        
        # Get the name of the upcoming section of the proceedings
        if element.name == "header":

            # based on observations this is the correct level of heading
            h2 = element.find("h2")
            if h2:
                header = h2.text.strip()  # Strip any extra spaces
                if header not in papers:
                    papers[header] = []
            
            continue

        # Check if the element is a publication list
        if element.name == "ul" and "publ-list" in element.get("class", []):
            # Observed that all paper list items have the class (that arent't the proceedings)
            #  have the class "entry inproceedings" as opposed to "entry editor"
            for data in element.find_all("li", class_=["entry article", "entry inproceedings"]):
#                 print("data:", data)
                # Extract the title of the paper
                title = data.find("span", attrs={"class": "title"}).text
#                 print("title:", title)

                # do not include front matter
                if "front matter" in title.lower():
                    continue 

                # do not include key notes
                if "keynote" in title.lower():
                    continue 

                pagination = data.find("span", attrs={"itemprop": "pagination"})
                if pagination is not None:
                    pagination = pagination.text

                doi_a_tag = data.select_one('li.drop-down div.head a')
                doi = doi_a_tag['href']

                year = year

                authors = []
                for authorspan in data.find_all("span", attrs={"itemprop": "author"}):
                    authors.append(authorspan.find("span", attrs={"itemprop": "name"}).text)

                paper = {
                    "title": title,
                    "doi": doi,
                    "pagination": pagination,
                    "year": year,
                    "authors": authors}

                # if a section header has not been identified for this list of papers
                if header == None: 
                    papers["noheader"].append(paper)

                else:
                    papers[header].append(paper)

    return papers

            
            
def get_papers_from_dblp(name, year, force=False, save=True, groupname=None):
    
    # file name for saved data
    filename = dblp_raw_filename(name,year)

    # check whether the file already exists
    exists = os.path.exists(filename)
    
    # scrape the data if it doesnt exist or instructed to force/overwrite file
    if force or not exists:
        
        if groupname == None: groupname = name
        
        url = "https://dblp.org/db/conf/{0}/{1}{2}.html".format(groupname, name,year)
        
        papers = get_papers_from_page(url,year)

        
        try_next_volume = True
        next_volume = 1

        while try_next_volume:

            url = "https://dblp.org/db/conf/{0}/{1}{2}-{3}.html".format(groupname,name,year,next_volume)

            papers_volume = get_papers_from_page(url, year)

            for header, paperlist in papers_volume.items():
                if header in papers:
                    papers[header].extend(paperlist)
                else:
                    papers[header] = paperlist

            # if found no papers from this volume, and tried up to volume 2
            if sum([len(v) for v in papers_volume.values()]) == 0 and next_volume >= 2:
                try_next_volume = False

            # otherwise look for a next volume
            else:
                next_volume += 1
                    
            
        if save:
            with open(filename, "w") as fh:
                json.dump(papers, fh)

    # otherwise load the existing file
    else:
        with open(filename, "r") as fh:
            papers = json.load(fh)


    return papers

In [38]:
def get_links_to_journal_volumes(name,year, alternative_method = False):
    # DBLP url template for the journal homesite
    url = "https://dblp.org/db/journals/{0}/".format(name)

    content = requests.get(url).text
    soup = BeautifulSoup(content)

    output = []
    
    for element in soup.find(id="main").children:
        if element.name=="ul":
            for listitem in element.children:
                if listitem.find("a") != -1:
                    
                    if alternative_method:
                        if str(year) in listitem.contents[0]:
                            for link in listitem.find_all('a'):
                                output.append({"url": link["href"], "text": listitem.contents[0] + link.text})

                    link = listitem.find("a")
                    if link is None:
                        continue
                    if str(year) in link.text:
                        output.append({"url": link["href"], "text": link.text})
                        
                        

    return output


def get_journal_papers_from_dblp(name, year, force=False, save=True, links=None, alt=False):
    
    # file name for saved data
    filename = dblp_raw_filename(name,year)

    # check whether the file already exists
    exists = os.path.exists(filename)
    
    # scrape the data if it doesnt exist or instructed to force/overwrite file
    if force or not exists:

        papers = {}
        
        if links == None:
            links = get_links_to_journal_volumes(name,year, alternative_method=alt)
        
        for volume in links:
            
            url = volume["url"]

            content = requests.get(url).text
            soup = BeautifulSoup(content)

            header = volume["text"]

            for element in soup.find(id="main").children:

                # Get the name of the upcoming section of the proceedings
                if element.name=="header":

                    # based on observations this is the correct level of heading
                    h2 = element.find("h2")
                    if h2:
                        header = h2.text
                        if header not in papers and str(year) in header:
                            papers[header] = []

                    continue

                # Get papers if the element is a publication list
                if element.name == "ul" and "publ-list" in element.get("class", []):
                    
                    # if header doesnt indicate it is in the desired year, continue
                    # if str(year) not in header:
                    #    continue
                    
                    # Observed that all paper list items have the class (that arent't the proceedings)
                    #  have the class "entry inproceedings" as opposed to "entry editor"
                    for data in element.find_all("li", attrs={"entry article"}):
                        title = data.find("span", attrs={"class": "title"}).text

                        # do not include front matter
                        if "front matter" in title.lower():
                            continue 

                        # do not include key notes
                        if "keynote" in title.lower():
                            continue 

                        pagination = data.find("span", attrs={"itemprop": "pagination"})
                        if pagination is not None:
                            pagination = pagination.text
                            
                        doi_a_tag = data.select_one('li.drop-down div.head a')
                        doi = doi_a_tag['href']

                        year = year

                        authors = []
                        for authorspan in data.find_all("span", attrs={"itemprop": "author"}):
                            authors.append(authorspan.find("span", attrs={"itemprop": "name"}).text)

                        paper = {
                            "title": title,
                            "doi": doi,
                            "pagination": pagination,
                            "year": year,
                            "authors": authors}

                        papers[header].append(paper)


        if save:
            with open(filename, "w") as fh:
                json.dump(papers, fh)

    # otherwise load the existing file
    else:
        with open(filename, "r") as fh:
            papers = json.load(fh)


    return papers

## Get raw paper files for all venues

In [47]:
with open(pub_filtering_counts_filename, "w") as fh:
    writer = csv.writer(fh, delimiter=",")
    header = ["name", "year", "total publications", "filtered publications"]
    writer.writerow(header)

with open(venue_category_filename, "r") as fh:
    
    reader = csv.reader(fh, delimiter=",")
    
    # skip header row
    next(reader)
    
    for row in reader:
        
        venue_type = row[4]   
        name = row[0]
    
        # set which paper getting / filtering functions to use
        if venue_type == "conference":
            get_papers_f = get_papers_from_dblp
            filter_f = filter_by_header_and_page_number_keep_missing
        else: # is journal
            get_papers_f = get_journal_papers_from_dblp
            filter_f = filter_journals

        # 2007-2024
        for year in range(2007,2024):
            print(name, year)
            # Scrape and save the papers if not already saved
            papers = get_papers_f(name,year)
                
            included, excluded = apply_filter_to_papers(filter_f, papers, name, year)
            totalpubs = len(included) + len(excluded)
            filteredpubs = len(included)
            with open(pub_filtering_counts_filename, "a") as fh:
                writer = csv.writer(fh, delimiter=",")
                outrow = [name, year, totalpubs, filteredpubs]
                writer.writerow(outrow)
            #print(outrow)

3dim 2007
3dim 2008
3dim 2009
3dim 2010
3dim 2011
3dim 2012
3dim 2013
3dim 2014
3dim 2015
3dim 2016
3dim 2017
3dim 2018
3dim 2019
3dim 2020
3dim 2021
3dim 2022
3dim 2023
aaai 2007
aaai 2008
aaai 2009
aaai 2010
aaai 2011
aaai 2012
aaai 2013
aaai 2014
aaai 2015
aaai 2016
aaai 2017
aaai 2018
aaai 2019
aaai 2020
aaai 2021
aaai 2022
aaai 2023
aamas 2007
aamas 2008
aamas 2009
aamas 2010
aamas 2011
aamas 2012
aamas 2013
aamas 2014
aamas 2015
aamas 2016
aamas 2017
aamas 2018
aamas 2019
aamas 2020
aamas 2021
aamas 2022
aamas 2023
acl 2007
acl 2008
acl 2009
acl 2010
acl 2011
acl 2012
acl 2013
acl 2014
acl 2015
acl 2016
acl 2017
acl 2018
acl 2019
acl 2020
acl 2021
acl 2022
acl 2023
ai 2007
ai 2008
ai 2009
ai 2010
ai 2011
ai 2012
ai 2013
ai 2014
ai 2015
ai 2016
ai 2017
ai 2018
ai 2019
ai 2020
ai 2021
ai 2022
ai 2023
aiccsa 2007
aiccsa 2008
aiccsa 2009
aiccsa 2010
aiccsa 2011
aiccsa 2012
aiccsa 2013
aiccsa 2014
aiccsa 2015
aiccsa 2016
aiccsa 2017
aiccsa 2018
aiccsa 2019
aiccsa 2020
aiccsa 2021
aicc

cse 2018
cse 2019
cse 2020
cse 2021
cse 2022
cse 2023
csfw 2007
csfw 2008
csfw 2009
csfw 2010
csfw 2011
csfw 2012
csfw 2013
csfw 2014
csfw 2015
csfw 2016
csfw 2017
csfw 2018
csfw 2019
csfw 2020
csfw 2021
csfw 2022
csfw 2023
csur 2007
csur 2008
csur 2009
csur 2010
csur 2011
csur 2012
csur 2013
csur 2014
csur 2015
csur 2016
csur 2017
csur 2018
csur 2019
csur 2020
csur 2021
csur 2022
csur 2023
cvpr 2007
cvpr 2008
cvpr 2009
cvpr 2010
cvpr 2011
cvpr 2012
cvpr 2013
cvpr 2014
cvpr 2015
cvpr 2016
cvpr 2017
cvpr 2018
cvpr 2019
cvpr 2020
cvpr 2021
cvpr 2022
cvpr 2023
dac 2007
dac 2008
dac 2009
dac 2010
dac 2011
dac 2012
dac 2013
dac 2014
dac 2015
dac 2016
dac 2017
dac 2018
dac 2019
dac 2020
dac 2021
dac 2022
dac 2023
datamine 2007
datamine 2008
datamine 2009
datamine 2010
datamine 2011
datamine 2012
datamine 2013
datamine 2014
datamine 2015
datamine 2016
datamine 2017
datamine 2018
datamine 2019
datamine 2020
datamine 2021
datamine 2022
datamine 2023
date 2007
date 2008
date 2009
date 2010
date 

icassp 2022
icassp 2023
icc 2007
icc 2008
icc 2009
icc 2010
icc 2011
icc 2012
icc 2013
icc 2014
icc 2015
icc 2016
icc 2017
icc 2018
icc 2019
icc 2020
icc 2021
icc 2022
icc 2023
iccad 2007
iccad 2008
iccad 2009
iccad 2010
iccad 2011
iccad 2012
iccad 2013
iccad 2014
iccad 2015
iccad 2016
iccad 2017
iccad 2018
iccad 2019
iccad 2020
iccad 2021
iccad 2022
iccad 2023
icccn 2007
icccn 2008
icccn 2009
icccn 2010
icccn 2011
icccn 2012
icccn 2013
icccn 2014
icccn 2015
icccn 2016
icccn 2017
icccn 2018
icccn 2019
icccn 2020
icccn 2021
icccn 2022
icccn 2023
iccv 2007
iccv 2008
iccv 2009
iccv 2010
iccv 2011
iccv 2012
iccv 2013
iccv 2014
iccv 2015
iccv 2016
iccv 2017
iccv 2018
iccv 2019
iccv 2020
iccv 2021
iccv 2022
iccv 2023
icdcs 2007
icdcs 2008
icdcs 2009
icdcs 2010
icdcs 2011
icdcs 2012
icdcs 2013
icdcs 2014
icdcs 2015
icdcs 2016
icdcs 2017
icdcs 2018
icdcs 2019
icdcs 2020
icdcs 2021
icdcs 2022
icdcs 2023
icde 2007
icde 2008
icde 2009
icde 2010
icde 2011
icde 2012
icde 2013
icde 2014
icde 2015
ic

itc 2016
itc 2017
itc 2018
itc 2019
itc 2020
itc 2021
itc 2022
itc 2023
iticse 2007
iticse 2008
iticse 2009
iticse 2010
iticse 2011
iticse 2012
iticse 2013
iticse 2014
iticse 2015
iticse 2016
iticse 2017
iticse 2018
iticse 2019
iticse 2020
iticse 2021
iticse 2022
iticse 2023
itng 2007
itng 2008
itng 2009
itng 2010
itng 2011
itng 2012
itng 2013
itng 2014
itng 2015
itng 2016
itng 2017
itng 2018
itng 2019
itng 2020
itng 2021
itng 2022
itng 2023
itpro 2007
itpro 2008
itpro 2009
itpro 2010
itpro 2011
itpro 2012
itpro 2013
itpro 2014
itpro 2015
itpro 2016
itpro 2017
itpro 2018
itpro 2019
itpro 2020
itpro 2021
itpro 2022
itpro 2023
iui 2007
iui 2008
iui 2009
iui 2010
iui 2011
iui 2012
iui 2013
iui 2014
iui 2015
iui 2016
iui 2017
iui 2018
iui 2019
iui 2020
iui 2021
iui 2022
iui 2023
jacm 2007
jacm 2008
jacm 2009
jacm 2010
jacm 2011
jacm 2012
jacm 2013
jacm 2014
jacm 2015
jacm 2016
jacm 2017
jacm 2018
jacm 2019
jacm 2020
jacm 2021
jacm 2022
jacm 2023
jair 2007
jair 2008
jair 2009
jair 2010
jair

pvldb 2021
pvldb 2022
pvldb 2023
raid 2007
raid 2008
raid 2009
raid 2010
raid 2011
raid 2012
raid 2013
raid 2014
raid 2015
raid 2016
raid 2017
raid 2018
raid 2019
raid 2020
raid 2021
raid 2022
raid 2023
re 2007
re 2008
re 2009
re 2010
re 2011
re 2012
re 2013
re 2014
re 2015
re 2016
re 2017
re 2018
re 2019
re 2020
re 2021
re 2022
re 2023
recomb 2007
recomb 2008
recomb 2009
recomb 2010
recomb 2011
recomb 2012
recomb 2013
recomb 2014
recomb 2015
recomb 2016
recomb 2017
recomb 2018
recomb 2019
recomb 2020
recomb 2021
recomb 2022
recomb 2023
rss 2007
rss 2008
rss 2009
rss 2010
rss 2011
rss 2012
rss 2013
rss 2014
rss 2015
rss 2016
rss 2017
rss 2018
rss 2019
rss 2020
rss 2021
rss 2022
rss 2023
rtas 2007
rtas 2008
rtas 2009
rtas 2010
rtas 2011
rtas 2012
rtas 2013
rtas 2014
rtas 2015
rtas 2016
rtas 2017
rtas 2018
rtas 2019
rtas 2020
rtas 2021
rtas 2022
rtas 2023
rtss 2007
rtss 2008
rtss 2009
rtss 2010
rtss 2011
rtss 2012
rtss 2013
rtss 2014
rtss 2015
rtss 2016
rtss 2017
rtss 2018
rtss 2019
rtss

tmc 2019
tmc 2020
tmc 2021
tmc 2022
tmc 2023
tmm 2007
tmm 2008
tmm 2009
tmm 2010
tmm 2011
tmm 2012
tmm 2013
tmm 2014
tmm 2015
tmm 2016
tmm 2017
tmm 2018
tmm 2019
tmm 2020
tmm 2021
tmm 2022
tmm 2023
tochi 2007
tochi 2008
tochi 2009
tochi 2010
tochi 2011
tochi 2012
tochi 2013
tochi 2014
tochi 2015
tochi 2016
tochi 2017
tochi 2018
tochi 2019
tochi 2020
tochi 2021
tochi 2022
tochi 2023
tocl 2007
tocl 2008
tocl 2009
tocl 2010
tocl 2011
tocl 2012
tocl 2013
tocl 2014
tocl 2015
tocl 2016
tocl 2017
tocl 2018
tocl 2019
tocl 2020
tocl 2021
tocl 2022
tocl 2023
tocs 2007
tocs 2008
tocs 2009
tocs 2010
tocs 2011
tocs 2012
tocs 2013
tocs 2014
tocs 2015
tocs 2016
tocs 2017
tocs 2018
tocs 2019
tocs 2020
tocs 2021
tocs 2022
tocs 2023
toct 2007
toct 2008
toct 2009
toct 2010
toct 2011
toct 2012
toct 2013
toct 2014
toct 2015
toct 2016
toct 2017
toct 2018
toct 2019
toct 2020
toct 2021
toct 2022
toct 2023
todaes 2007
todaes 2008
todaes 2009
todaes 2010
todaes 2011
todaes 2012
todaes 2013
todaes 2014
todaes 20

iclr 2007
iclr 2008
iclr 2009
iclr 2010
iclr 2011
iclr 2012
iclr 2013
iclr 2014
iclr 2015
iclr 2016
iclr 2017
iclr 2018
iclr 2019
iclr 2020
iclr 2021
iclr 2022
iclr 2023
tmi 2007
tmi 2008
tmi 2009
tmi 2010
tmi 2011
tmi 2012
tmi 2013
tmi 2014
tmi 2015
tmi 2016
tmi 2017
tmi 2018
tmi 2019
tmi 2020
tmi 2021
tmi 2022
tmi 2023
pacmhci 2007
pacmhci 2008
pacmhci 2009
pacmhci 2010
pacmhci 2011
pacmhci 2012
pacmhci 2013
pacmhci 2014
pacmhci 2015
pacmhci 2016
pacmhci 2017
pacmhci 2018
pacmhci 2019
pacmhci 2020
pacmhci 2021
pacmhci 2022
pacmhci 2023
ucc 2007
ucc 2008
ucc 2009
ucc 2010
ucc 2011
ucc 2012
ucc 2013
ucc 2014
ucc 2015
ucc 2016
ucc 2017
ucc 2018
ucc 2019
ucc 2020
ucc 2021
ucc 2022
ucc 2023
bdc 2007
bdc 2008
bdc 2009
bdc 2010
bdc 2011
bdc 2012
bdc 2013
bdc 2014
bdc 2015
bdc 2016
bdc 2017
bdc 2018
bdc 2019
bdc 2020
bdc 2021
bdc 2022
bdc 2023
icsa 2007
icsa 2008
icsa 2009
icsa 2010
icsa 2011
icsa 2012
icsa 2013
icsa 2014
icsa 2015
icsa 2016
icsa 2017
icsa 2018
icsa 2019
icsa 2020
icsa 2021


In [17]:
data = pd.read_csv(pub_filtering_counts_filename)

In [18]:
data["iszero"] = data["filtered publications"] ==0
data.groupby("name").sum().sort_values("iszero", ascending=False)[:50]

Unnamed: 0_level_0,year,total publications,filtered publications,iszero
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
wcci,34255,31,31,15
ismb,34255,118,118,15
digitel,34255,199,112,13
mobisec,34255,66,63,13
civr,34255,300,259,13
hipeac,34255,119,115,12
grid,34255,230,182,11
saint,34255,442,222,11
iaai,34255,138,137,11
tslp,34255,69,65,10


### Extra scraping for missed papers
Papers for some venues were not accurately retrieved in the scraping process for a number of different reasons. The following few code blocks were used to address these shortfalls after the problems were identified through checking venues with many years of zero paper counts.

#### Venues with different keys used in our database compared to the key used by DBLP
A list of venues where the keys contained in `/app/data/venue_list.csv` are not the same as the key used by DBLP to represent the same venue is contained in `/data/venues_with_different_dblp_keys.csv`. The following code block retrieves the papers from DBLP (by scraping the html) using the DBLP key and saves the file using the relevant key listed in `venue_list.csv` in the name of file that the papers are saved in.

In [19]:
run_different_key_scraping = True

if run_different_key_scraping:
    
    with open(venues_with_different_keys_filename, "r") as fh:
        reader = csv.reader(fh, delimiter = ",")

        next(reader)

        for key, dblp_key, venue_type, period_start, period_end in reader:

            get_papers_function = get_journal_papers_from_dblp if venue_type == "journal" else get_papers_from_dblp

            for year in range(int(period_start), int(period_end) + 1):

                filename = dblp_raw_filename(key, year)
                print(key, year)

                # check whether the file already exists
                exists = os.path.exists(filename)
                if exists:
                    continue
                
                papers = get_papers_function(dblp_key, year, force=True, save=False)

                with open(filename,"w") as papers_fh:
                    json.dump(papers, papers_fh)

jmicro 2007
jmicro 2008
jmicro 2009
jmicro 2010
jmicro 2011
jmicro 2012
jmicro 2013
jmicro 2014
jmicro 2015
jmicro 2016
jmicro 2017
jmicro 2018
jmicro 2019
jmicro 2020
jmicro 2021
jmicro 2022
jmicro 2023
ccc 2007
ccc 2008
ccc 2009
ccc 2010
ccc 2011
ccc 2012
ccc 2013
ccc 2014
ccc 2015
ccc 2016
ccc 2017
ccc 2018
ccc 2019
ccc 2020
ccc 2021
ccc 2022
ccc 2023
ispa 2013
ispa 2016
lics 2014


#### Venues that did did not fit the general DBLP url template
A list of conferences* where the url for the DBLP data did not fit the simple format matched by most conferences, that is https://dblp.org/db/conf/{key}/{key}{year}.html, can be found at `/data/venues_with_different_dblp_baseurls.csv`. Generally this occurs where DBLP has group multiple conferences/workshops together under one group key and then use the conference key to specify after that, i.e. https://dblp.org/db/conf/{groupkey}/{conferencekey}{year}.html. The following code block retrieves the papers from DBLP using the correct specified url base and then saves them using the given key for each year in the year period given.


_*This list only contains conferences because the method used to scrape the journals makes this problem unique to the conferences. More specifically, the method for journals does not use a set template for each (venue, year) tuple because the journals are stored by volume number rather than year._

In [54]:
run_custom_url_scraping = True

if run_custom_url_scraping:
    
    with open(venues_with_different_baseurls, "r") as fh:
        reader = csv.reader(fh, delimiter = ",")
        next(reader)

        for key, urlbase, period_start, period_end in reader:
            for year in range(int(period_start), int(period_end) + 1):
                filename = dblp_raw_filename(key, year)
#                 print(key, year)

                # check whether the file already exists
                exists = os.path.exists(filename)
#                 if exists:
#                     continue
                    
                if "----" in urlbase:
                    urls = urlbase.split("----")
                else:
                    urls = [urlbase]
    
                papers = {}
                for u1 in urls:
                    url = u1.format(year)
                    new_papers = get_papers_from_page(url, year)
                    
                    print(key, year, url)
                    # merge papers
                    for k, v in new_papers.items():
                        if k in papers and isinstance(v, list):
                            papers[k].extend(v)
                        else:
                            papers[k] = v

                with open(filename,"w") as papers_fh:
                    json.dump(papers, papers_fh)

ase 2007 https://dblp.org/db/conf/kbse/ase2007.html
ase 2008 https://dblp.org/db/conf/kbse/ase2008.html
ase 2009 https://dblp.org/db/conf/kbse/ase2009.html
ase 2010 https://dblp.org/db/conf/kbse/ase2010.html
ase 2011 https://dblp.org/db/conf/kbse/ase2011.html
ase 2012 https://dblp.org/db/conf/kbse/ase2012.html
ase 2013 https://dblp.org/db/conf/kbse/ase2013.html
ase 2014 https://dblp.org/db/conf/kbse/ase2014.html
ase 2015 https://dblp.org/db/conf/kbse/ase2015.html
ase 2016 https://dblp.org/db/conf/kbse/ase2016.html
ase 2017 https://dblp.org/db/conf/kbse/ase2017.html
ase 2018 https://dblp.org/db/conf/kbse/ase2018.html
ase 2019 https://dblp.org/db/conf/kbse/ase2019.html
ase 2020 https://dblp.org/db/conf/kbse/ase2020.html
ase 2021 https://dblp.org/db/conf/kbse/ase2021.html
ase 2022 https://dblp.org/db/conf/kbse/ase2022.html
ase 2023 https://dblp.org/db/conf/kbse/ase2023.html
disc 2007 https://dblp.org/db/conf/wdag/disc2007.html
disc 2008 https://dblp.org/db/conf/wdag/disc2008.html
disc 200

ssd 2010 https://dblp.org/db/conf/ssd/sstd2010.html
ssd 2011 https://dblp.org/db/conf/ssd/sstd2011.html
ssd 2012 https://dblp.org/db/conf/ssd/sstd2012.html
ssd 2013 https://dblp.org/db/conf/ssd/sstd2013.html
ssd 2014 https://dblp.org/db/conf/ssd/sstd2014.html
ssd 2015 https://dblp.org/db/conf/ssd/sstd2015.html
ssd 2016 https://dblp.org/db/conf/ssd/sstd2016.html
ssd 2017 https://dblp.org/db/conf/ssd/sstd2017.html
ssd 2018 https://dblp.org/db/conf/ssd/sstd2018.html
ssd 2019 https://dblp.org/db/conf/ssd/sstd2019.html
ssd 2020 https://dblp.org/db/conf/ssd/sstd2020.html
ssd 2021 https://dblp.org/db/conf/ssd/sstd2021.html
ssd 2022 https://dblp.org/db/conf/ssd/sstd2022.html
ssd 2023 https://dblp.org/db/conf/ssd/sstd2023.html
csfw 2007 https://dblp.org/db/conf/csfw/csf2007.html
csfw 2008 https://dblp.org/db/conf/csfw/csf2008.html
csfw 2009 https://dblp.org/db/conf/csfw/csf2009.html
csfw 2010 https://dblp.org/db/conf/csfw/csf2010.html
csfw 2011 https://dblp.org/db/conf/csfw/csf2011.html
csfw 20

jeric 2011 https://dblp.uni-trier.de/db/journals/jeric/toce11.html
jeric 2012 https://dblp.uni-trier.de/db/journals/jeric/toce12.html
jeric 2013 https://dblp.uni-trier.de/db/journals/jeric/toce13.html
jeric 2014 https://dblp.uni-trier.de/db/journals/jeric/toce14.html
jeric 2015 https://dblp.uni-trier.de/db/journals/jeric/toce15.html
jeric 2016 https://dblp.uni-trier.de/db/journals/jeric/toce16.html
jeric 2017 https://dblp.uni-trier.de/db/journals/jeric/toce17.html
jeric 2018 https://dblp.uni-trier.de/db/journals/jeric/toce18.html
jeric 2019 https://dblp.uni-trier.de/db/journals/jeric/toce19.html
jeric 2020 https://dblp.uni-trier.de/db/journals/jeric/toce20.html
jeric 2021 https://dblp.uni-trier.de/db/journals/jeric/toce21.html
jeric 2022 https://dblp.uni-trier.de/db/journals/jeric/toce22.html
jeric 2023 https://dblp.uni-trier.de/db/journals/toce/toce23.html
jeric 2024 https://dblp.uni-trier.de/db/journals/toce/toce24.html
conext 2023 https://dblp.uni-trier.de/db/conf/conext/conext2023c

#### Journals that needed to be retrieved through an alternative scraping method
A list of journals where that needed to be scraped using an alternative method is at `/data/journals_to_use_alternative_scraping_method.csv`. The webpages for these journals listed the volumes in a different way and therefore were not picked up by the standard scraping method used for most journals. The code block below retrieves and saves the papers for these journals for each of the respective journals in each year between (inclusive) the period defined.

In [17]:
run_alt_journal_scraping = True

if run_alt_journal_scraping:
    
    with open(journals_to_be_scraped_using_alternative_method, "r") as fh:
        reader = csv.reader(fh, delimiter = ",")

        next(reader)

        for key, period_start, period_end in reader:

            for year in range(int(period_start), int(period_end) + 1):
                filename = dblp_raw_filename(key, year)
                print(key, year)

                # check whether the file already exists
                exists = os.path.exists(filename)
#                 if exists:
#                     continue
                    
                papers = get_journal_papers_from_dblp(key, year, force=True,save=False,alt=True)

                with open(filename,"w") as papers_fh:
                    json.dump(papers, papers_fh)

ml 2007
ml 2008
ml 2009
ml 2010
ml 2011
ml 2012
ml 2013
ml 2014
ml 2015
ml 2016
ml 2017
ml 2018
ml 2019
ml 2020
ml 2021
ml 2022
ml 2023
jair 2007
jair 2008
jair 2009
jair 2010
jair 2011
jair 2012
jair 2013
jair 2014
jair 2015
jair 2016
jair 2017
jair 2018
jair 2019
jair 2020
jair 2021
jair 2022
jair 2023
ai 2007
ai 2008
ai 2009
ai 2010
ai 2011
ai 2012
ai 2013
ai 2014
ai 2015
ai 2016
ai 2017
ai 2018
ai 2019
ai 2020
ai 2021
ai 2022
ai 2023
amai 2007
amai 2008
amai 2009
amai 2010
amai 2011
amai 2012
amai 2013
amai 2014
amai 2015
amai 2016
amai 2017
amai 2018
amai 2019
amai 2020
amai 2021
amai 2022
amai 2023


#### Retrieve missed papers from SIAMNUM 2009

In [18]:
get_siamnum_2009_papers = False

if get_siamnum_2009_papers:
    links = [{"url":"https://dblp.org/db/journals/siamnum/siamnum47.html",
             "text":"Volume 47: 2008-2010"}]
    get_journal_papers_from_dblp("siamnum", 2009, force=True, save=True, links=links, alt=False)

## Explore the paper counts for the venues

In [19]:
data["filterediszero"] = data["filtered publications"] ==0
data_agg = data.groupby("name").sum()
data_agg[data_agg["iszero"]!=data_agg["filterediszero"]]

Unnamed: 0_level_0,year,total publications,filtered publications,iszero,filterediszero
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [24]:
data_agg["removed"] = data_agg["total publications"] - data_agg["filtered publications"]
data_agg["removed pc"] = data_agg["removed"] / data_agg["total publications"]

In [25]:
data_agg.sort_values("removed pc", ascending=False)

Unnamed: 0_level_0,year,total publications,filtered publications,iszero,filterediszero,removed,removed pc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
isscc,34255,4028,285,0,0,3743,0.929245
sensys,34255,1471,474,0,0,997,0.677770
sigmetrics,34255,895,333,6,6,562,0.627933
vr,34255,2137,828,0,0,1309,0.612541
sigcse,34255,4829,2031,0,0,2798,0.579416
...,...,...,...,...,...,...,...
lctes,34255,0,0,17,17,0,
sea,34255,0,0,17,17,0,
sigsoft-fse,34255,0,0,17,17,0,
ssd,34255,0,0,17,17,0,


In [21]:
data.sum()

name                     3dim3dim3dim3dim3dim3dim3dim3dim3dim3dim3dim3d...
year                                                               7955376
total publications                                                  389797
filtered publications                                               339809
iszero                                                                 430
filterediszero                                                         430
dtype: object