In [1]:
#Author Alex J. Yang, alexjieyang@outlook.com
import numpy as np
import pandas as pd
import itertools
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from tqdm import tqdm

def bootstrapped(data, n_resamples=50, confidence_level=0.95, random_state=1):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : array-like
        One-dimensional sample of observations.
    n_resamples : int, default 50
        Number of bootstrap resamples drawn by ``scipy.stats.bootstrap``.
    confidence_level : float, default 0.95
        Confidence level of the returned interval.
    random_state : int, numpy Generator/RandomState or None, default 1
        Seed or generator forwarded to ``scipy.stats.bootstrap``. The fixed
        default keeps the interval reproducible between runs.

    Returns
    -------
    ConfidenceInterval
        The ``confidence_interval`` attribute of the SciPy bootstrap result,
        with fields ``low`` and ``high``.
    """
    # scipy.stats.bootstrap expects a sequence of samples, hence the 1-tuple.
    result = scipy.stats.bootstrap(
        (data,),
        np.mean,
        confidence_level=confidence_level,
        n_resamples=n_resamples,
        random_state=random_state,
        method='percentile',
    )
    return result.confidence_interval

In [2]:
import gc

# Switch score: data loading and computation

In [4]:
# Load the author-paper table and give its columns the names used in the rest of the notebook.
# The relabelling is keyed on the header names rather than on position: read_csv keeps the
# file's own column order regardless of the order in which `usecols` lists the columns, so a
# positional `columns = [...]` assignment could silently swap the author and paper ids.
pub2author = pd.read_csv(r'E:\NETDATA\SciSciNet\1950-2020\Author_Paper_1stlast.csv.zip' , usecols=['AuthorID','PaperID','Year'])
pub2author.rename(columns={'PaperID': 'PublicationId', 'AuthorID': 'AuthorId'}, inplace=True)

In [7]:
# Read the reference table (one row per citing/cited pair of papers); all columns are loaded.
pub2ref = pd.read_csv(r'E:\Data\SciSciNet\SciSciNet_Paper_References.csv.zip')
# Positional relabel: assumes the file has exactly two columns with the citing paper
# first and the cited paper second - TODO confirm against the file header.
pub2ref.columns = ['CitingPublicationId', 'CitedPublicationId']

In [8]:
# Keep only the references made by papers present in the author table,
# then discard any row that still has a missing value.
pub2ref = pub2ref.loc[
    pub2ref['CitingPublicationId'].isin(pub2author['PublicationId'].values)
].dropna()

In [9]:
# Paper -> journal lookup; only the two columns needed for the mapping are read.
pub2j = pd.read_csv(r'E:\Data\SciSciNet\SciSciNet_Papers.zip' , sep='\t', usecols=['PaperID','JournalID'])
# Attach the journal of every cited paper; cited papers that are missing from the lookup,
# or that have no JournalID there, end up as NaN.
pub2ref['CitedJournalId'] = pub2ref['CitedPublicationId'].map(pub2j.set_index('PaperID')['JournalID'])
# The lookup table is no longer needed; drop the reference before the next step.
del pub2j
# Remove rows with any missing value, which includes references whose journal was not resolved.
pub2ref.dropna(inplace = True)
# Last expression of the cell: its return value is what the notebook displays as cell output.
gc.collect()

583

In [14]:
# Settings read as module-level globals by author_switch:
#   previous_k  - when set, the history is limited to that many preceding publications
#                 (None: every earlier publication is eligible)
#   year_window - when set, history rows older than `year_window` years before the
#                 focal publication are ignored
previous_k = None
year_window = 3

# One row per (citing paper, cited journal) holding the number of distinct cited papers
# from that journal.
pub2refjournalcounts = groupby_count(
    pub2ref,
    ['CitingPublicationId', 'CitedJournalId'],
    'CitedPublicationId',
    count_unique=True,
)

# Final column names, assigned by position (citing paper, cited journal, count).
pub2refjournalcounts.columns = ['PublicationId', 'CitedJournalId', 'CitedJournalCount']

In [16]:
# Left join: every author-paper row is kept; papers without any counted reference
# get NaN in the journal columns.
pa_refs = pub2author.merge(pub2refjournalcounts, on='PublicationId', how='left')

In [18]:
# The dropna / integer cast below are disabled, so rows without reference data stay in the frame.
# pa_refs.dropna(inplace=True)
# pa_refs['CitedJournalId'] = pa_refs['CitedJournalId'].astype(int)

# Order rows by author, then year, then paper, then cited journal, and renumber the
# index from 0 over the whole frame.
pa_refs = (
    pa_refs
    .sort_values(by=['AuthorId', 'Year', 'PublicationId', 'CitedJournalId'])
    .reset_index(drop=True)
)

In [20]:
# Run author_switch on each author's rows, then flatten the (AuthorId, row number)
# index into columns and discard the per-author row number.
pscore = (
    pa_refs
    .groupby('AuthorId')
    .apply(author_switch)
    .reset_index()
    .drop(columns='level_1')
)

## Switch functions (these cells must be run before the computation cells above)

In [10]:
def author_switch(authordf):
    """Compute a switch score for each publication of a single author.

    For every publication, the cited-journal counts of the author's earlier rows
    (the "history") are summed per journal and compared with the publication's own
    cited-journal counts using ``pandas_cosine_similarity``, which returns
    1 - cosine similarity.

    The history is controlled by two module-level globals:

    * ``previous_k`` - if not None, the history starts at most ``previous_k``
      publications before the current one.
    * ``year_window`` - if not None, history rows with
      ``Year < current Year - year_window`` are ignored.

    Parameters
    ----------
    authordf : pandas.DataFrame
        Rows of one author with columns ``PublicationId``, ``Year``,
        ``CitedJournalId`` and ``CitedJournalCount``. The rows of each publication
        must be contiguous, in career order, and sorted by ``CitedJournalId``
        within a publication (the cosine helper relies on sorted keys).

    Returns
    -------
    pandas.DataFrame
        Columns ``PublicationId`` and ``switchScore``, one row per publication in
        order of first appearance. ``switchScore`` is None for the author's first
        publication and for publications whose history is empty.
    """
    # groupby(...).apply passes a slice that keeps the parent frame's index labels.
    # The code below uses the first label of each publication as an iloc position,
    # so renumber the rows to make labels and positions coincide. Without this, the
    # "history" of every author but the first one would include the current and
    # later publications.
    authordf = authordf.reset_index(drop=True)

    allpubidx = None
    if previous_k is not None:
        # Row positions at which each publication starts (plus a final end marker).
        allpubidx = changepoint(authordf['PublicationId'].values)

    switchresults = []
    for pid, pubgroup in authordf.groupby('PublicationId', sort=False):
        pubidx = pubgroup.index[0]
        if pubidx == 0:
            # First publication of the author: nothing to compare against.
            switchresults.append([pid, None])
            continue

        # Number of publications handled so far == ordinal of the current one.
        i = len(switchresults)
        if previous_k is not None and i > previous_k:
            history = authordf.iloc[allpubidx[i - previous_k]:pubidx]
        else:
            history = authordf.iloc[:pubidx]

        if year_window is not None:
            history = history[history['Year'] >= pubgroup['Year'].values[0] - year_window]

        if history.shape[0] > 0:
            history = history.groupby('CitedJournalId', sort=False, as_index=False)['CitedJournalCount'].sum()
            cosine = pandas_cosine_similarity(history, pubgroup, col_keys='CitedJournalId', col_values='CitedJournalCount')
            switchresults.append([pid, cosine])
        else:
            switchresults.append([pid, None])

    return pd.DataFrame(switchresults, columns=['PublicationId', 'switchScore'])

In [11]:
def groupby_count(df, colgroupby, colcountby, count_unique=True):
    """Count the values of one column within groups of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    colgroupby : str or list of str
        Column(s) to group by; groups keep their order of first appearance.
    colcountby : str
        Column whose values are counted inside each group.
    count_unique : bool, default True
        If True count distinct values (``nunique``), otherwise count non-null
        values (``count``).

    Returns
    -------
    pandas.DataFrame
        The aggregated frame, with a column named ``colcountby`` (or ``'0'``)
        renamed to ``colcountby + 'Count'``.
    """
    counted_name = str(colcountby) + 'Count'
    selected = df.groupby(colgroupby, sort=False, as_index=False)[colcountby]
    tallies = selected.nunique() if count_unique else selected.count()
    relabel = zip2dict([str(colcountby), '0'], [counted_name, counted_name])
    return tallies.rename(columns=relabel)

In [12]:
def zip2dict(keys, values):
    """Pair ``keys`` with ``values`` position by position and return them as a dict.

    Pairing stops at the shorter of the two inputs; when a key occurs more than
    once, the value paired with its last occurrence is kept.
    """
    return {key: value for key, value in zip(keys, values)}

def changepoint(a):
    """Return the boundaries of the runs of equal consecutive values in a 1-D array.

    The result starts with 0, then lists every position whose value differs from
    the previous element, and ends with ``a.shape[0]``.
    """
    # Position i + 1 starts a new run whenever a[i + 1] differs from a[i].
    run_starts = np.flatnonzero(a[1:] != a[:-1]) + 1
    return np.concatenate(([0], run_starts, [a.shape[0]]))

In [13]:
def pandas_cosine_similarity(df1, df2, col_keys, col_values):
    """Cosine distance (1 - cosine similarity) between two sparse key/value vectors.

    Each frame describes a sparse vector: ``col_keys`` holds the coordinate ids and
    ``col_values`` the value at each coordinate. Keys are expected to be unique
    within each frame, and the keys of ``df2`` must be sorted in ascending order
    because they are searched with ``np.searchsorted``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames containing the columns ``col_keys`` and ``col_values``.
    col_keys : str
        Name of the key column.
    col_values : str
        Name of the value column.

    Returns
    -------
    float
        ``1 - cos(df1, df2)``. NaN is returned when the distance is undefined,
        i.e. when either frame is empty or either vector has zero norm.
    """
    keys1 = df1[col_keys].values
    keys2 = df2[col_keys].values
    vals1 = df1[col_values].values
    vals2 = df2[col_values].values

    # An empty vector has no direction; bail out before indexing into an empty array.
    if keys1.shape[0] == 0 or keys2.shape[0] == 0:
        return np.nan

    # For every key of df1, find the slot where it would sit among df2's sorted keys.
    search_index = np.searchsorted(keys2, keys1, side='left')
    # Keys larger than every key of df2 point one past the end; clamp them so the
    # lookup stays in bounds (the equality check below rejects them anyway).
    search_index[search_index >= keys2.shape[0]] = keys2.shape[0] - 1
    isin = keys1 == keys2[search_index]

    # Only the shared keys contribute to the dot product.
    cosine_num = np.inner(vals1[isin], vals2[search_index[isin]])
    cosine_denom = np.linalg.norm(vals1) * np.linalg.norm(vals2)
    if cosine_denom == 0:
        # Zero-norm vector: avoid the 0/0 division and report the result as undefined.
        return np.nan
    return 1.0 - cosine_num / cosine_denom