In [None]:
import math
import pickle
import numpy as np
from scipy.stats import beta
from collections import defaultdict

from Content import *
from Venue import *

## Citation Edge Likelihood

In [None]:
## Year 2000 ~ 2018 (inclusive)
## PA * Latest * Field * Self = 2*2*2*2 = 16
## PA: Normal PA (p=0.1, p/#nodes + (1-p)indegree/sum_of_indegree), Uniform (1/#nodes)
## Latest: Normal Latest (beta(10,1), x=1-(outyear-inyear)/(outyear-oldest_year)), Uniform (1/(outyear-oldest_year))
## Field: Similar (1-(1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized), Different ((1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized)
## Self: Prefer (coauthors: 0.9/#coauthors, non-coauthors: 0.1/#non-coauthors), Not Prefer (coauthors: 0.1/#coauthors, non-coauthors: 0.9/#non-coauthors)

## 1st: Normal_PA * Normal_Latest * Similar_Field * Prefer_Self
## 2nd: Normal_PA * Normal_Latest * Similar_Field * NotPrefer_Self
## 3rd: Normal_PA * Normal_Latest * Different_Field * Prefer_Self
## 4th: Normal_PA * Normal_Latest * Different_Field * NotPrefer_Self
## 5th: Normal_PA * Uniform_Latest * Similar_Field * Prefer_Self
## 6th: Normal_PA * Uniform_Latest * Similar_Field * NotPrefer_Self
## 7th: Normal_PA * Uniform_Latest * Different_Field * Prefer_Self
## 8th: Normal_PA * Uniform_Latest * Different_Field * NotPrefer_Self
## 9th: Uniform_PA * Normal_Latest * Similar_Field * Prefer_Self
## 10th: Uniform_PA * Normal_Latest * Similar_Field * NotPrefer_Self
## 11th: Uniform_PA * Normal_Latest * Different_Field * Prefer_Self
## 12th: Uniform_PA * Normal_Latest * Different_Field * NotPrefer_Self
## 13th: Uniform_PA * Uniform_Latest * Similar_Field * Prefer_Self
## 14th: Uniform_PA * Uniform_Latest * Similar_Field * NotPrefer_Self
## 15th: Uniform_PA * Uniform_Latest * Different_Field * Prefer_Self
## 16th: Uniform_PA * Uniform_Latest * Different_Field * NotPrefer_Self

In [None]:
# Build the per-year lookup tables used by the citation edge likelihoods:
#   content_sumdgs[y]      -- sum over all contents of the cumulative in-citation count up to year y
#   content_eachdgs[id][y] -- cumulative in-citation count of content `id` up to year y
#   content_fields[id]     -- L2-normalised field vector of content `id`
#   content_cumcounts[y]   -- number of contents whose own year is <= y
# The per-year tables are only filled for y >= 1999.
content_sumdgs, content_eachdgs, content_fields, content_cumcounts = defaultdict(np.int64), defaultdict(dict), {}, defaultdict(np.int64)

for year in range(1980, 2019):
    # NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
    with open('content/contents_{}.pkl'.format(year), 'rb') as fin:
        contents = pickle.load(fin)
    for content in contents.values():
        content_fields[content.id] = content.field/np.linalg.norm(content.field,2)
        # Running total of citations received by this content, accumulated year by year.
        incitation_count = 0
        for in_year in range(1980, 2019):
            if in_year in content.incitations.keys():
                incitation_count += len(content.incitations[in_year])
            if in_year >= 1999:
                content_sumdgs[in_year] += incitation_count
                content_eachdgs[content.id][in_year] = incitation_count
                # The content only counts as an existing node from its own year onwards.
                if in_year >= content.year:
                    content_cumcounts[in_year] += 1
    print(f"Prepare info done for year {year}")

# Use a context manager so the output file is flushed and closed deterministically.
with open('content/content_prob_inputs.pkl', 'wb') as fout:
    pickle.dump((content_sumdgs, content_eachdgs, content_fields, content_cumcounts), fout, pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the lookup tables needed by cal_cite_edgeprobs.
# NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
with open('content/content_year.pkl', 'rb') as fin:
    content_year = pickle.load(fin)
with open('content/content_authors.pkl', 'rb') as fin:
    content_authors = pickle.load(fin)
with open('author/author_contents.pkl', 'rb') as fin:
    author_contents = pickle.load(fin)
with open('content/content_prob_inputs.pkl', 'rb') as fin:
    content_sumdgs, content_eachdgs, content_fields, content_cumcounts = pickle.load(fin)

oldest = 1979
superbeta = beta(a=10,b=1)
# Recency table: row = citing year (2000..2018), column = cited year (1980..2017).
# Each cell is the Beta(10, 1) density at (cited year - oldest) / (citing year - oldest).
superbeta_dist = np.array([
    [superbeta.pdf((intime-oldest)/(outtime-oldest)) for intime in range(1980,2018)]
    for outtime in range(2000,2019)
]).reshape(2019-2000,2018-1980)

def cal_cite_edgeprobs(outcontent):
    """Compute the 16 citation-edge likelihoods for every outgoing citation of a content.

    For each cited content the four factor pairs
    (normal/uniform PA) x (normal/uniform latest) x (similar/different field)
    x (prefer/not-prefer self) are multiplied, in that nesting order, giving
    16 values per cited content.

    Reads the module-level tables ``content_fields``, ``author_contents``,
    ``content_cumcounts``, ``content_eachdgs``, ``content_sumdgs``,
    ``content_year``, ``superbeta_dist`` and ``oldest``.

    Parameters
    ----------
    outcontent : object with ``year``, ``id``, ``outcitations`` and ``authors``
        attributes; ``authors`` yields 3-tuples whose first item is the author key.

    Returns
    -------
    numpy.ndarray of dtype float32 and shape ``(len(outcitations), 16)``.
    An empty ``outcitations`` yields shape ``(0, 16)`` so the result is always 2-D.
    """
    outyear, outcitations = outcontent.year, outcontent.outcitations
    if len(outcitations) == 0:
        # Keep the column dimension even when there is nothing to score.
        return np.empty((0, 16), dtype=np.float32)

    outfield = content_fields[outcontent.id]

    # Contents written by any of the authors strictly before the citing year.
    outcocontents = set()
    for author,_,_ in outcontent.authors:
        for year, authored in author_contents[author].items():
            if year < outyear:
                outcocontents.update(authored)

    # Values that do not depend on the cited content are looked up once.
    prev_year = outyear-1
    node_count = content_cumcounts[prev_year]
    degree_sum = content_sumdgs[prev_year]
    self_count = len(outcocontents)
    other_count = node_count-self_count
    field_scale = 1-math.exp(-2)
    latest_row = superbeta_dist[outyear-2000]
    puniform_pa = 1/node_count
    puniform_latest = 1/(outyear-oldest)

    edgeprobs = []
    for incontent in outcitations:

        pnormal_pa = 0.1/node_count+0.9*content_eachdgs[incontent][prev_year]/degree_sum

        pnormal_latest = latest_row[content_year[incontent]-1980]

        psim_field = 1-(1-math.exp(-np.linalg.norm(outfield-content_fields[incontent],2)))/field_scale
        pdif_field = 1-psim_field

        # The divisions stay inside the branches: self_count may be 0 when the
        # authors have no earlier contents, and that branch is then never taken.
        if incontent in outcocontents:
            ppre_self, pnot_self = 0.9/self_count, 0.1/self_count
        else:
            ppre_self, pnot_self = 0.1/other_count, 0.9/other_count

        temp1 = np.outer([pnormal_pa,puniform_pa],[pnormal_latest,puniform_latest]).flatten()
        temp2 = np.outer([psim_field,pdif_field],[ppre_self,pnot_self]).flatten()
        edgeprobs.append(np.outer(temp1,temp2).flatten())

    return np.array(edgeprobs, dtype=np.float32)

In [None]:
# For every year 2000..2018: load that year's contents, attach the citation edge
# likelihoods to each content that has outgoing citations, and write the file back.
for year in range(2000, 2019):
    contents_path = 'content/contents_{}.pkl'.format(year)
    print('Reading year {}'.format(year))
    # NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
    with open(contents_path, 'rb') as fin:
        contents = pickle.load(fin)
    print('Edgeprob-calculation start for year {}'.format(year))
    for content in contents.values():
        if len(content.outcitations)>0:
            content.update_cite_edgellhs(cal_cite_edgeprobs(content))
    print('Writing year {}'.format(year))
    # The input file is overwritten in place; the context manager guarantees the
    # handle is flushed and closed before the next year is processed.
    with open(contents_path, 'wb') as fout:
        pickle.dump(contents, fout, pickle.HIGHEST_PROTOCOL)
    # Release this year's contents before loading the next file.
    del contents
    print('Edgeprob-calculation done for year {}'.format(year))

## Location Edge Likelihood

In [None]:
## Year 2000 ~ 2018 (inclusive)
## PA * Field * Self = 2*2*2 = 8
## PA: Normal PA (p=0.1, p/#nodes + (1-p)indegree/sum_of_indegree), Uniform (1/#nodes)
## Field: Similar (1-(1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized), Different ((1-e^(-||x-y||_2))/(1-e^(-2)), x&y L2-normalized)
## Self: Prefer (venues the authors published in before: 0.9/#such venues, other venues: 0.1/#other venues), Not Prefer (venues the authors published in before: 0.1/#such venues, other venues: 0.9/#other venues)

## 1st: Normal_PA * Similar_Field * Prefer_Self
## 2nd: Normal_PA * Similar_Field * NotPrefer_Self
## 3rd: Normal_PA * Different_Field * Prefer_Self
## 4th: Normal_PA * Different_Field * NotPrefer_Self
## 5th: Uniform_PA * Similar_Field * Prefer_Self
## 6th: Uniform_PA * Similar_Field * NotPrefer_Self
## 7th: Uniform_PA * Different_Field * Prefer_Self
## 8th: Uniform_PA * Different_Field * NotPrefer_Self

In [None]:
# Build the per-year venue lookup tables used by the publication edge likelihoods:
#   venue_sumdgs[y]      -- sum over all venues of the cumulative content count up to year y
#   venue_eachdgs[id][y] -- cumulative content count of venue `id` up to year y
# Both tables are only filled for y >= 1999.
# NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
with open('venue/venues.pkl', 'rb') as fin:
    venues = pickle.load(fin)
venue_sumdgs, venue_eachdgs = defaultdict(np.int64), defaultdict(dict)

for venue in venues.values():
    # Running total of contents held by this venue (the venue's "in-degree").
    content_count = 0
    for in_year in range(1980, 2019):
        if in_year in venue.contents.keys():
            content_count += len(venue.contents[in_year])
        if in_year >= 1999:
            venue_sumdgs[in_year] += content_count
            venue_eachdgs[venue.id][in_year] = content_count

# Use a context manager so the output file is flushed and closed deterministically.
with open('venue/venue_content_prob_inputs.pkl', 'wb') as fout:
    pickle.dump((venue_sumdgs, venue_eachdgs), fout, pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the lookup tables needed by cal_pub_edgeprobs.
# NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
with open('content/content_authors.pkl', 'rb') as fin:
    content_authors = pickle.load(fin)
with open('author/author_venues.pkl', 'rb') as fin:
    author_venues = pickle.load(fin)
with open('venue/venue_fields.pkl', 'rb') as fin:
    venue_fields = pickle.load(fin)
with open('venue/venue_cumcounts.pkl', 'rb') as fin:
    venue_cumcounts = pickle.load(fin)
with open('venue/venue_content_prob_inputs.pkl', 'rb') as fin:
    venue_sumdgs, venue_eachdgs = pickle.load(fin)

def cal_pub_edgeprobs(outcontent):
    """Compute the 8 publication-edge likelihoods of a content for its venue.

    The factor pairs (normal/uniform PA) x (similar/different field)
    x (prefer/not-prefer self) are multiplied, in that nesting order.

    Reads the module-level tables ``author_venues``, ``venue_cumcounts``,
    ``venue_eachdgs``, ``venue_sumdgs`` and ``venue_fields``.

    Parameters
    ----------
    outcontent : object with ``year``, ``field``, ``venue`` and ``authors``
        attributes; ``authors`` yields 3-tuples whose first item is the author key.

    Returns
    -------
    numpy.ndarray of dtype float32 and shape ``(1, 8)``.
    """
    # The field vector must come from the argument itself, not from whatever
    # module-level ``content`` variable a calling loop happens to define.
    outyear, outfield, invenue = outcontent.year, outcontent.field/np.linalg.norm(outcontent.field,2), outcontent.venue

    # Venues in which any of the authors published strictly before the content's year.
    outcovenues = set()
    for author,_,_ in outcontent.authors:
        for year, published in author_venues[author].items():
            if year < outyear:
                outcovenues.update(published)

    prev_year = outyear-1
    venue_count = venue_cumcounts[prev_year]

    pnormal_pa = 0.1/venue_count+0.9*venue_eachdgs[invenue][prev_year]/venue_sumdgs[prev_year]
    puniform_pa = 1/venue_count

    psim_field = 1-(1-math.exp(-np.linalg.norm(outfield-venue_fields[invenue][outyear],2)))/(1-math.exp(-2))
    pdif_field = 1-psim_field

    # The divisions stay inside the branches: len(outcovenues) may be 0 when the
    # authors have no earlier venues, and that branch is then never taken.
    if invenue in outcovenues:
        ppre_self, pnot_self = 0.9/len(outcovenues), 0.1/len(outcovenues)
    else:
        other_count = venue_count-len(outcovenues)
        ppre_self, pnot_self = 0.1/other_count, 0.9/other_count

    temp1 = np.array([pnormal_pa,puniform_pa])
    temp2 = np.outer([psim_field,pdif_field],[ppre_self,pnot_self]).flatten()
    edgeprob = np.outer(temp1,temp2).flatten()

    return np.array(edgeprob, dtype=np.float32).reshape(1,-1)

In [None]:
# For every year 2000..2018: load that year's contents, attach the publication edge
# likelihoods to each selected content, and write the file back.
for year in range(2000, 2019):
    contents_path = 'content/contents_{}.pkl'.format(year)
    print('Reading year {}'.format(year))
    # NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
    with open(contents_path, 'rb') as fin:
        contents = pickle.load(fin)
    print('Edgeprob-calculation start for year {}'.format(year))
    for content in contents.values():
        # NOTE(review): publication edges are only computed for contents that have at
        # least one outgoing citation -- confirm this filter is intended here.
        if len(content.outcitations)>0:
            content.update_pub_edgellhs(cal_pub_edgeprobs(content))
    print('Writing year {}'.format(year))
    # The input file is overwritten in place; the context manager guarantees the
    # handle is flushed and closed before the next year is processed.
    with open(contents_path, 'wb') as fout:
        pickle.dump(contents, fout, protocol=pickle.HIGHEST_PROTOCOL)
    # Release this year's contents before loading the next file.
    del contents
    print('Edgeprob-calculation done for year {}'.format(year))