#### DBLP Publication Data Filtering for Analysis

Using the DBLP spreadsheet created by dblp_mapping.ipynb, this notebook filters the spreadsheet to obtain a table of publications matching specific criteria, depending on the use case. Visualizations can be produced with Python libraries, but external BI software may be able to produce better dashboards and visualizations.

In [None]:
# libs and functions for data processing
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter, defaultdict
import ast
from tqdm import tqdm
import heapq

def year_list(start_year: int, end_year: int) -> list:
    """Return every year from start_year to end_year, both ends included.

    The result is empty when end_year is earlier than start_year.
    """
    return list(range(start_year, end_year + 1))

def contains_target(s, targets):
    """Return True when s and targets have at least one element in common.

    s is the collection held in a single table cell (institutions, countries
    or regions of one publication); targets is the iterable of values being
    looked up.

    Cells parsed as a list or tuple are converted to a set first, so they are
    handled the same way as cells parsed as a set. Any other type is used
    as-is and must provide isdisjoint().
    """
    if isinstance(s, (list, tuple)):
        # list/tuple cells have no isdisjoint(); build a set for the check
        s = set(s)
    return not s.isdisjoint(targets)

def count_by_conf(data: pd.DataFrame) -> pd.Series:
    """Count the rows of data per value of the 'Conference' column.

    Returns the Series produced by value_counts(): indexed by conference,
    most frequent first.
    """
    conferences = data['Conference']
    return conferences.value_counts()

def count_by_year(data: pd.DataFrame) -> pd.Series:
    """Count the rows of data per value of the 'Year' column.

    Returns the Series produced by value_counts(): indexed by year,
    most frequent first.
    """
    years = data['Year']
    return years.value_counts()

def count_total_publ(data: pd.DataFrame) -> int:
    """Return the number of rows (publications) in data."""
    row_count = data.shape[0]
    return row_count

# filter functions
def filter_by_year(data: pd.DataFrame, target: list) -> pd.DataFrame:
    """Keep only the rows of data whose 'Year' value appears in target."""
    in_target_years = data['Year'].isin(target)
    return data[in_target_years]

def filter_by_conf(data: pd.DataFrame, target: list) -> pd.DataFrame:
    """Keep only the rows of data whose 'Conference' value appears in target."""
    in_target_confs = data['Conference'].isin(target)
    return data[in_target_confs]

def filter_by_inst(data: pd.DataFrame, target: list) -> pd.DataFrame:
    """Keep the rows whose 'Affiliations' cell shares an element with target."""
    has_target = data['Affiliations'].apply(
        lambda affiliations: contains_target(affiliations, target)
    )
    return data[has_target]

def filter_by_country(data: pd.DataFrame, target: list) -> pd.DataFrame:
    """Keep the rows whose 'Countries' cell shares an element with target."""
    has_target = data['Countries'].apply(
        lambda countries: contains_target(countries, target)
    )
    return data[has_target]

def filter_by_region(data: pd.DataFrame, target: list) -> pd.DataFrame:
    """Keep the rows whose 'Region' cell shares an element with target."""
    has_target = data['Region'].apply(
        lambda regions: contains_target(regions, target)
    )
    return data[has_target]

def search(data: pd.DataFrame,
           lookup_years: list,
           lookup_conf: list,
           lookup_country: list,
           lookup_region: list,
           lookup_inst: list) -> pd.DataFrame:
    """Apply the filter functions selected by the non-empty lookup lists.

    The year and conference filters are each applied whenever their list is
    non-empty. Of country, region and institution only ONE filter is applied:
    the first non-empty list, checked in that order; the remaining lists are
    ignored. An empty list means "do not filter on this field".

    Returns the filtered table (data itself when every list is empty).
    """
    if lookup_years:
        data = filter_by_year(data, lookup_years)

    if lookup_conf:
        data = filter_by_conf(data, lookup_conf)

    # 3 choose 1: return as soon as one of the three filters has been applied
    if lookup_country:
        return filter_by_country(data, lookup_country)
    if lookup_region:
        return filter_by_region(data, lookup_region)
    if lookup_inst:
        return filter_by_inst(data, lookup_inst)

    return data

def get_topN(top_n: int, data: pd.DataFrame) -> list:
    """Return the top_n institutions that appear most often in data.

    Each row of the 'Affiliations' column is an iterable of institution
    names; every name in a row adds one to that institution's count.

    Returns a list of institution names ordered from most to least frequent.
    Institutions with equal counts keep the order in which they were first
    encountered. Fewer than top_n names are returned when the data holds
    fewer distinct institutions.
    """
    publ_counts = Counter()
    for affiliations in data['Affiliations']:
        publ_counts.update(affiliations)

    # most_common() yields (name, count) pairs; only the names are returned
    return [name for name, _ in publ_counts.most_common(top_n)]

def get_ranking(data: pd.DataFrame, inst: str):
    """Return the 1-based rank of inst by number of rows listing it.

    The 'Affiliations' column is exploded to one institution per row and
    counted with value_counts(), which orders institutions from most to
    least frequent; the rank is the position of inst in that ordering.

    Raises KeyError when inst does not appear in the column.
    """
    per_institution = data['Affiliations'].explode().value_counts()
    position = per_institution.index.get_loc(inst)
    return position + 1

In [None]:
# read the csv, then turn the literal-encoded columns back into Python objects
publications = pd.read_csv('DBLP_publications.csv')
for literal_column in ('Affiliations', 'Countries', 'Region'):
    # each cell is stored as the text of a Python literal
    publications[literal_column] = publications[literal_column].apply(ast.literal_eval)

#### Filters defined below

In [None]:
# filter parameters (an empty list disables the corresponding filter)

# filter by year: start and end year are both included
lookup_years = year_list(2014, 2024)

# filter by conference, these represent the top conferences in AI
lookup_conf = [
    'aaai', 'acl', 'cvpr', 'eccv', 'emnlp', 'iccv', 'iclr',
    'icml', 'icra', 'ijcai', 'iros', 'naacl', 'nips', 'rss',
]

# search() applies only the first non-empty list of the three below,
# checked in the order country, region, institution
lookup_country = []  # filter by country
lookup_region = []  # filter by region
lookup_inst = ['MBZUAI']  # filter by institution

#### Top N Ranking (Global)

In [None]:
# NOTE(review): the top 10 is taken from the table filtered with lookup_inst
# (when lookup_country and lookup_region are empty), so a non-empty
# lookup_inst restricts this "global" ranking to publications listing those
# institutions - confirm this is intended.
ranking_pool = search(publications, lookup_years, lookup_conf,
                      lookup_country, lookup_region, lookup_inst)
world_top = get_topN(10, ranking_pool)

# second pass: same year/conference filters, institution filter = the top list
global_result = search(publications, lookup_years, lookup_conf,
                       lookup_country, lookup_region, world_top)

# counts[year][institution] -> number of rows of that year listing the institution
counts = defaultdict(Counter)

for _, row in global_result.iterrows():
    listed = row['Affiliations']
    hits = [value for value in world_top if value in listed]
    if hits:
        # only touch counts[year] on a match so no empty year entry is created
        counts[row['Year']].update(hits)

# one entry per top institution for every year, 0 when it has no publication
filtered_counts = {
    year: {value: per_year[value] for value in world_top}
    for year, per_year in counts.items()
}
dict(sorted(filtered_counts.items()))

#### Ranking of Particular Institution

#### Institution Publications Sorted by Conference in Defined Year Range

#### Institution Publications Sorted by Year