In [None]:
import math
import pickle
import numpy as np
from scipy.stats import beta
from collections import defaultdict

from Author import *
from Venue import *

In [None]:
# Suffixes of the per-group author pickles ('author/authors_<name>.pkl').
# Presumably <year><shard index> for the six-digit names -- TODO confirm.
group_names = [
    2000,
    2013,
    *range(201700, 201702),
    *range(201800, 201802),
    *range(201900, 201915),
]

## Prepare Info

In [None]:
## Citation edges (author -> cited author), Year 2000 ~ 2018 (inclusive)
## PA * Latest * Field * Self = 2*2*2*2 = 16 candidate models per edge
## S_{1,i} -> PA: Normal PA (p=0.1, p/#nodes + (1-p)indegree/sum_of_indegree), Uniform (1/#nodes)
## S_{4,i} -> Latest: Normal Latest (beta(10,1), x=1-(outyear-inyear)/(outyear-oldest_year)), Uniform (1/(outyear-oldest_year))
## S_{2,i} -> Field: Similar (1-(1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized), Different ((1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized)
## S_{3,i} -> Self: Prefer (coauthors: 0.9/#coauthors, non-coauthors: 0.1/#non-coauthors), Not Prefer (coauthors: 0.1/#coauthors, non-coauthors: 0.9/#non-coauthors)

## 1st: Normal_PA * Normal_Latest * Similar_Field * Prefer_Self
## 2nd: Normal_PA * Normal_Latest * Similar_Field * NotPrefer_Self
## 3rd: Normal_PA * Normal_Latest * Different_Field * Prefer_Self
## 4th: Normal_PA * Normal_Latest * Different_Field * NotPrefer_Self
## 5th: Normal_PA * Uniform_Latest * Similar_Field * Prefer_Self
## 6th: Normal_PA * Uniform_Latest * Similar_Field * NotPrefer_Self
## 7th: Normal_PA * Uniform_Latest * Different_Field * Prefer_Self
## 8th: Normal_PA * Uniform_Latest * Different_Field * NotPrefer_Self
## 9th: Uniform_PA * Normal_Latest * Similar_Field * Prefer_Self
## 10th: Uniform_PA * Normal_Latest * Similar_Field * NotPrefer_Self
## 11th: Uniform_PA * Normal_Latest * Different_Field * Prefer_Self
## 12th: Uniform_PA * Normal_Latest * Different_Field * NotPrefer_Self
## 13th: Uniform_PA * Uniform_Latest * Similar_Field * Prefer_Self
## 14th: Uniform_PA * Uniform_Latest * Similar_Field * NotPrefer_Self
## 15th: Uniform_PA * Uniform_Latest * Different_Field * Prefer_Self
## 16th: Uniform_PA * Uniform_Latest * Different_Field * NotPrefer_Self

In [None]:
## Publication edges (author -> venue), Year 2000 ~ 2018 (inclusive)
## PA * Field * Self = 2*2*2 = 8 candidate models per edge
## S_{1,i} -> PA: Normal PA (p=0.1, p/#nodes + (1-p)indegree/sum_of_indegree), Uniform (1/#nodes)
## S_{2,i} -> Field: Similar (1-(1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized), Different ((1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized)
## S_{3,i} -> Self: Prefer (covenues: 0.9/#covenues, non-covenues: 0.1/#non-covenues), Not Prefer (covenues: 0.1/#covenues, non-covenues: 0.9/#non-covenues); covenues = venues listed for the author in years before the out-year

## 1st: Normal_PA * Similar_Field * Prefer_Self
## 2nd: Normal_PA * Similar_Field * NotPrefer_Self
## 3rd: Normal_PA * Different_Field * Prefer_Self
## 4th: Normal_PA * Different_Field * NotPrefer_Self
## 5th: Uniform_PA * Similar_Field * Prefer_Self
## 6th: Uniform_PA * Similar_Field * NotPrefer_Self
## 7th: Uniform_PA * Different_Field * Prefer_Self
## 8th: Uniform_PA * Different_Field * NotPrefer_Self

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this pipeline.
with open('content/content_authors.pkl', 'rb') as fin:
    content_authors = pickle.load(fin)

# author_fields[author_id][year]    -> field vector scaled to unit L2 norm
# author_coauthors[author_id][year] -> set of everyone listed on that year's contents
author_fields, author_coauthors = defaultdict(dict), defaultdict(dict)
# author_sumdgs[year]             -> cumulative in-citation count summed over all authors
# author_eachdgs[author_id][year] -> cumulative in-citation count of a single author
# author_cumcounts[year]          -> number of authors whose earliest content year is <= year
author_sumdgs, author_eachdgs, author_cumcounts = defaultdict(np.int64), defaultdict(dict), defaultdict(np.int64)

for group_name in group_names:
    # Close the group file as soon as it has been unpickled.
    with open('author/authors_{}.pkl'.format(group_name), 'rb') as fin:
        authors = pickle.load(fin)
    for author in authors.values():

        for year, field in author.fields.items():
            author_fields[author.id][year] = field/np.linalg.norm(field, 2)

        for year, contents in author.contents.items():
            author_coauthors[author.id][year] = {
                coauthor
                for content in contents
                for coauthor in content_authors[content]
            }

        # Running total of in-citations since 1980; only years >= 1999 are stored.
        incitation_count = 0
        for in_year in range(1980, 2019):
            if in_year in author.incitations:
                incitation_count += len(author.incitations[in_year])
            if in_year >= 1999:
                author_sumdgs[in_year] += incitation_count
                author_eachdgs[author.id][in_year] = incitation_count

        # min() raises ValueError for an author without contents --
        # assumes every author has at least one; TODO confirm.
        for year in range(min(author.contents), 2019):
            author_cumcounts[year] += 1

    print(f"Prepare info done for group {group_name}")

# Protocol -1 selects the highest pickle protocol available; the with-blocks
# guarantee each output is flushed and closed before the next step runs.
with open('author/author_fields.pkl', 'wb') as fout:
    pickle.dump(author_fields, fout, -1)
with open('author/author_coauthors.pkl', 'wb') as fout:
    pickle.dump(author_coauthors, fout, -1)
with open('author/author_prob_inputs.pkl', 'wb') as fout:
    pickle.dump((author_sumdgs, author_eachdgs, author_cumcounts), fout, -1)

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this pipeline.
with open('venue/venues.pkl', 'rb') as fin:
    venues = pickle.load(fin)

# venue_sumdgs[year]            -> cumulative count summed over all venues
# venue_eachdgs[venue_id][year] -> cumulative count of a single venue
venue_sumdgs, venue_eachdgs = defaultdict(np.int64), defaultdict(dict)

for venue in venues.values():
    # Running total since 1980; only years >= 1999 are stored.
    incitation_count = 0
    for in_year in range(1980, 2019):
        # NOTE(review): membership is tested on venue.contents but the counts
        # are read from venue.authors -- presumably both share the same year
        # keys; confirm.
        if in_year in venue.contents:
            incitation_count += sum(venue.authors[in_year].values())
        if in_year >= 1999:
            venue_sumdgs[in_year] += incitation_count
            venue_eachdgs[venue.id][in_year] = incitation_count

# The with-block guarantees the output is flushed and closed.
with open('venue/venue_author_prob_inputs.pkl', 'wb') as fout:
    pickle.dump((venue_sumdgs, venue_eachdgs), fout, -1)

## Edge Likelihood

In [None]:
# Reload the tables written by the info-preparation step. Each file is opened
# in a with-block so its handle is closed right after unpickling.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('author/author_fields.pkl', 'rb') as fin:
    author_fields = pickle.load(fin)
with open('author/author_coauthors.pkl', 'rb') as fin:
    author_coauthors = pickle.load(fin)
with open('author/author_prob_inputs.pkl', 'rb') as fin:
    author_sumdgs, author_eachdgs, author_cumcounts = pickle.load(fin)

# Reference year placed one before the first cited year (1980).
oldest = 1979
# Frozen Beta(10, 1) distribution used for the recency ("Latest") factor.
superbeta = beta(a=10, b=1)
# Lookup table of densities: row = citing year - 2000 (2000..2018),
# column = cited year - 1980 (1980..2017). Each entry is the Beta pdf at
# (cited year - oldest) / (citing year - oldest).
superbeta_dist = np.array([
    [
        superbeta.pdf((cited_year-oldest)/(citing_year-oldest))
        for cited_year in range(1980, 2018)
    ]
    for citing_year in range(2000, 2019)
])

def cal_cite_edgeprobs(outyear, author):
    """Score every citation made by ``author`` in ``outyear`` under the 16 edge models.

    Each model multiplies one choice from each of four factors: preferential
    attachment (normal / uniform), recency (beta table / uniform), field
    (similar / different) and self (prefer / not prefer).

    Reads the module-level tables ``author_fields``, ``author_coauthors``,
    ``author_cumcounts``, ``author_eachdgs``, ``author_sumdgs``,
    ``superbeta_dist`` and the constant ``oldest``.

    Args:
        outyear: Year of the citing side. Population statistics are taken from
            ``outyear-1``; the recency table is indexed with ``outyear-2000``.
        author: Object exposing ``id`` and ``outcitations[outyear]``, an
            iterable of ``(inyear, inauthor)`` pairs.

    Returns:
        ``np.float32`` array of shape ``(number of citations, 16)``. Column
        ``8*pa + 4*latest + 2*field + self`` holds the product for that
        combination, where index 0 means normal / normal / similar / prefer
        and index 1 means uniform / uniform / different / not-prefer. An empty
        citation list yields shape ``(0, 16)``.
    """
    outfield, outcitations = author_fields[author.id][outyear], author.outcitations[outyear]

    # Everyone who appeared on a content with this author strictly before outyear.
    coauthors = set()
    for year, year_coauthors in author_coauthors[author.id].items():
        if year < outyear:
            coauthors.update(year_coauthors)

    # Terms that do not depend on the cited author are computed once.
    prev_year = outyear-1
    n_authors = author_cumcounts[prev_year]
    sum_degree = author_sumdgs[prev_year]
    n_coauthors = len(coauthors)
    n_others = n_authors-n_coauthors
    pa_smoothing = 0.1/n_authors
    puniform_pa = 1/n_authors
    puniform_latest = 1/(outyear-oldest)
    # Largest value of 1-exp(-d): unit vectors are at most distance 2 apart.
    field_scale = 1-math.exp(-2)

    edgeprobs = []
    for inyear, inauthor in outcitations:

        pnormal_pa = pa_smoothing+0.9*author_eachdgs[inauthor][prev_year]/sum_degree

        # Assumes 1980 <= inyear <= 2017; an earlier year would silently wrap
        # around through negative indexing.
        pnormal_latest = superbeta_dist[outyear-2000, inyear-1980]

        field_distance = np.linalg.norm(outfield-author_fields[inauthor][inyear], 2)
        psim_field = 1-(1-math.exp(-field_distance))/field_scale
        pdif_field = 1-psim_field

        # Divisions stay inside the branches: n_coauthors is 0 whenever the
        # author has no earlier contents, and that branch is then unreachable.
        if inauthor in coauthors:
            ppre_self, pnot_self = 0.9/n_coauthors, 0.1/n_coauthors
        else:
            ppre_self, pnot_self = 0.1/n_others, 0.9/n_others

        pa_latest = np.outer([pnormal_pa, puniform_pa], [pnormal_latest, puniform_latest]).flatten()
        field_self = np.outer([psim_field, pdif_field], [ppre_self, pnot_self]).flatten()
        edgeprobs.append(np.outer(pa_latest, field_self).flatten())

    # reshape keeps the (n, 16) contract even when there are no citations.
    return np.array(edgeprobs, dtype=np.float32).reshape(-1, 16)

In [None]:
# Load the tables used by the publication-edge model. Each file is opened in a
# with-block so its handle is closed right after unpickling.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('author/author_fields.pkl', 'rb') as fin:
    author_fields = pickle.load(fin)
with open('venue/venue_fields.pkl', 'rb') as fin:
    venue_fields = pickle.load(fin)
with open('venue/venue_cumcounts.pkl', 'rb') as fin:
    venue_cumcounts = pickle.load(fin)
with open('venue/venue_author_prob_inputs.pkl', 'rb') as fin:
    venue_sumdgs, venue_eachdgs = pickle.load(fin)

def cal_pub_edgeprobs(outyear, author):
    """Score every venue used by ``author`` in ``outyear`` under the 8 edge models.

    Each model multiplies one choice from each of three factors: preferential
    attachment (normal / uniform), field (similar / different) and self
    (prefer / not prefer previously used venues).

    Reads the module-level tables ``author_fields``, ``venue_fields``,
    ``venue_cumcounts``, ``venue_eachdgs`` and ``venue_sumdgs``.

    Args:
        outyear: Publication year. Population statistics are taken from
            ``outyear-1``; field vectors are taken from ``outyear``.
        author: Object exposing ``id`` and ``venues``, a mapping from year to
            an iterable of venue identifiers.

    Returns:
        ``np.float32`` array of shape ``(number of venues, 8)``. Column
        ``4*pa + 2*field + self`` holds the product for that combination,
        where index 0 means normal / similar / prefer and index 1 means
        uniform / different / not-prefer. An empty venue list yields shape
        ``(0, 8)``.
    """
    outfield = author_fields[author.id][outyear]

    # Venues listed for this author in any year strictly before outyear.
    covenues = set()
    for year, year_venues in author.venues.items():
        if year < outyear:
            covenues.update(year_venues)

    outvenues = author.venues[outyear]

    # Terms that do not depend on the target venue are computed once.
    prev_year = outyear-1
    n_venues = venue_cumcounts[prev_year]
    sum_degree = venue_sumdgs[prev_year]
    n_covenues = len(covenues)
    n_others = n_venues-n_covenues
    pa_smoothing = 0.1/n_venues
    puniform_pa = 1/n_venues
    # Largest value of 1-exp(-d) when both vectors have unit L2 norm
    # (assumes venue_fields is normalised like author_fields -- TODO confirm).
    field_scale = 1-math.exp(-2)

    edgeprobs = []
    for invenue in outvenues:

        pnormal_pa = pa_smoothing+0.9*venue_eachdgs[invenue][prev_year]/sum_degree

        field_distance = np.linalg.norm(outfield-venue_fields[invenue][outyear], 2)
        psim_field = 1-(1-math.exp(-field_distance))/field_scale
        pdif_field = 1-psim_field

        # Divisions stay inside the branches: n_covenues is 0 for an author
        # without earlier venues, and that branch is then unreachable.
        if invenue in covenues:
            ppre_self, pnot_self = 0.9/n_covenues, 0.1/n_covenues
        else:
            ppre_self, pnot_self = 0.1/n_others, 0.9/n_others

        pa_terms = np.array([pnormal_pa, puniform_pa])
        field_self = np.outer([psim_field, pdif_field], [ppre_self, pnot_self]).flatten()
        edgeprobs.append(np.outer(pa_terms, field_self).flatten())

    # reshape keeps the (n, 8) contract even when there are no venues.
    return np.array(edgeprobs, dtype=np.float32).reshape(-1, 8)

In [None]:
# Compute citation and publication edge likelihoods for every author-year
# from 2000 onwards and write each group back to its own pickle file.
# NOTE(review): the first group is skipped here although the info-preparation
# loop iterates over all of group_names -- confirm this is intentional.
for group_name in group_names[1:]:
    print('Reading group {}'.format(group_name))
    with open('author/authors_{}.pkl'.format(group_name), 'rb') as fin:
        authors = pickle.load(fin)
    print('Edgeprob-calculation start for group {}, size {}'.format(group_name, len(authors)))
    for i, author in enumerate(authors.values()):
        for outyear in author.outcitations.keys():
            if outyear >= 2000:
                author.update_cite_edgellhs(outyear, cal_cite_edgeprobs(outyear, author))
                author.update_pub_edgellhs(outyear, cal_pub_edgeprobs(outyear, author))
        if (i+1)%10000==0: print('Finish', i+1)
    print('Writing group {}'.format(group_name))
    # The input file is overwritten in place; the with-block guarantees the
    # data is flushed and the handle closed before the next group is read.
    with open('author/authors_{}.pkl'.format(group_name), 'wb') as fout:
        pickle.dump(authors, fout, -1)
    # Release the group before loading the next one.
    del authors
    print('Edgeprob-calculation done for group {}'.format(group_name))
    print()