In [1]:
 import pandas as pd
import numpy as np

from utils import read_csv
from config import DATA_PATH
from temp import aff2country, aff2city
# from pipeline import dblp_paper_df, asn_paper_df, author_df, coauthor_df

In [2]:
def overview(paper_df, by='country', freq=1, is_a=False):
    """ Count publications per year and per region (country or city).

    Args:
        - paper_df: pd.Dataframe
            paper data; columns 'id', 'year' & 'affiliation' are read,
            plus 'n_cites' when freq > 1 and 'is_a' when is_a is set
        - by: string, 'country' or 'city'
            region scale; the affiliation is mapped through aff2country
            or aff2city accordingly. Any other value is not mapped and is
            used directly as a grouping column, so it must be one of the
            selected columns
        - freq: int
            minimum 'n_cites' a paper needs to be kept; the filter is
            only applied when freq > 1
        - is_a: Boolean
            when true, keep only rows whose 'is_a' column equals 1
            (presumably the "A" journals -- confirm against the data)

    Returns:
        Dataframe with columns 'year', <by> and 'publication count',
        one row per (year, region) pair; papers whose region is missing
        are left out

    """
    selected = paper_df
    if freq > 1:
        selected = selected[selected['n_cites'] >= freq]
    if is_a:
        selected = selected[selected['is_a'] == 1]

    # Work on a private copy so the caller's frame never gains a column.
    papers = selected[['id', 'year', 'affiliation']].copy()
    if by == 'country':
        papers[by] = papers['affiliation'].apply(aff2country)
    elif by == 'city':
        papers[by] = papers['affiliation'].apply(aff2city)
    papers = papers.dropna(subset=[by])

    # Series indexed by (year, region) holding the number of rows per group.
    per_group = papers.groupby(["year", by])["id"].apply(len)
    group_keys = list(per_group.index)

    # Flatten the group index back into ordinary columns.
    return pd.DataFrame({
        "year": [key[0] for key in group_keys],
        by: [key[1] for key in group_keys],
        "publication count": per_group.values,
    })

In [12]:
# Load the DBLP papers and keep only the rows that have an 'authors_org' value.
dblp_paper_df = read_csv(DATA_PATH, 'dblp_paper.csv')
paper_df = (
    dblp_paper_df
    .dropna(subset=['authors_org'])
    .reset_index(drop=True)
    .assign(
        # The text before the first ';' in authors_org becomes the affiliation.
        affiliation=lambda frame: frame['authors_org'].apply(
            lambda org: org.split(';')[0] if org else None
        )
    )
)
# Annual publication counts with the overview() defaults (per country).
global_overview_df = overview(paper_df)

In [10]:
# Number of papers whose 'authors_org' value is missing.
dblp_paper_df['authors_org'].isnull().sum()

5107

In [23]:
def annual_growth_rate(pub_df, start=2000, by='country'):
    """ Calculate the year-over-year growth rate of the publication count.

    To restrict the result to frequently-cited or A papers, build pub_df
    with the matching freq / is_a arguments of overview().

    Args:
        - pub_df: pd.Dataframe
            publication data, generated by overview(); requires the
            columns 'year', <by> and 'publication count'
        - start: int, default 2000
            first year for which a growth rate is reported
        - by: string, 'country' or 'city'
            dimension where we calculate the annual publication growth rate

    Returns:
        Dataframe with columns 'year', <by> and 'growth rate' (in percent),
        one row per (year, region) for every year from start up to the
        latest year in pub_df, ordered by year and then by first
        appearance of the region. The growth rate is
            - np.nan if the region had no publication last year (no row,
              or a count of 0) but has a row this year
            - -100. if the region published last year but has no row
              this year
            - 0. if the region has no row in either year
        An empty frame (same columns) is returned when pub_df is empty or
        start is later than the latest year.

    """
    columns = ['year', by, 'growth rate']
    records = []

    if len(pub_df) > 0:
        # (region, year) -> count. setdefault keeps the first row when a
        # pair occurs more than once; rows with a missing region are skipped
        # so that such a region never matches any count.
        first_count = {}
        for region, year, n_pub in zip(pub_df[by], pub_df['year'],
                                       pub_df['publication count']):
            if not pd.isna(region):
                first_count.setdefault((region, year), n_pub)

        regions = pd.unique(pub_df[by])
        last_year = int(pub_df['year'].max())

        for year in range(start, last_year + 1):
            for region in regions:
                prev = first_count.get((region, year - 1))
                cur = first_count.get((region, year))
                if prev is None:
                    gr = np.nan if cur is not None else 0.
                elif cur is None:
                    gr = -100.
                elif prev == 0:
                    # growth from zero is undefined; avoid dividing by zero
                    gr = np.nan
                else:
                    gr = 100. * (cur - prev) / prev
                records.append((year, region, gr))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    gr_df = pd.DataFrame(records, columns=columns)
    gr_df['year'] = gr_df['year'].astype('int')
    return gr_df

In [24]:
# Year-over-year growth of the per-country publication counts, from 2000 on.
growth_rate_df = annual_growth_rate(global_overview_df, start=2000, by='country')