In [None]:
import pickle
import numpy as np
from collections import defaultdict

from Author import *
from Content import *

## Content Input

In [None]:
# Per-year accumulators for content nodes, keyed first by year.
# c_actives collects a list of ids per year; every other container holds a
# per-year mapping that is filled in by the collection loop.
c_actives = defaultdict(list)
(c_positions,
 ca_adjs,
 c_embeddings,
 c_cite_edgellhs,
 c_pub_edgellhs) = (defaultdict(dict) for _ in range(5))

In [None]:
# Collect, for each year, the data of every content item that has at least one
# outgoing citation into the per-year accumulators defined above.
for year in range(2000, 2019):

    print('Read year {}'.format(year))
    # NOTE: pickle.load can execute arbitrary code while unpickling; only read
    # files that were produced by this project's own pipeline.
    # The context manager closes the file handle as soon as loading finishes
    # instead of leaving it open until garbage collection.
    with open('content/contents_{}.pkl'.format(year), 'rb') as contents_file:
        contents = pickle.load(contents_file)

    print('Start collection on year {}'.format(year))
    for content in contents.values():
        # Items without outgoing citations are not recorded at all.
        if len(content.outcitations) > 0:
            c_actives[year].append(content.id)
            # content.authors yields 3-tuples; only the first element (the author id) is kept.
            ca_adjs[year][content.id] = [author_id for author_id, _, _ in content.authors]
            c_embeddings[year][content.id] = content.embedding
            c_cite_edgellhs[year][content.id] = content.cite_edgellhs
            c_pub_edgellhs[year][content.id] = content.pub_edgellhs

    # Drop the year's objects before the next file is loaded.
    del contents
    print('Finish collection on year {}'.format(year))
    print()

In [None]:
# Turn the per-year accumulators into sorted, position-aligned arrays/lists and
# write one "cite" and one "pub" input file per year.
# NOTE: this cell overwrites the accumulators in place (dict -> array/list), so
# it cannot be re-run without first re-running the collection cell.
for year in range(2000, 2019):

    # Sorted ids define the row order of every per-year structure below.
    c_actives[year] = np.sort(c_actives[year])
    c_positions[year] = dict((c, i) for i, c in enumerate(c_actives[year]))

    # One (content id, author id) pair per edge, authors sorted within each content.
    content_author_pairs = [[k, v] for k in c_actives[year] for v in np.sort(ca_adjs[year][k])]
    # reshape(-1, 2) keeps the result 2-row even when a year has no edges, so
    # ca_adjs[year][0] below (and in later cells) stays indexable.
    ca_adjs[year] = np.array(content_author_pairs).reshape(-1, 2).T

    c_embeddings[year] = np.array([c_embeddings[year][c_id] for c_id in c_actives[year]])
    c_cite_edgellhs[year] = [c_cite_edgellhs[year][c_id] for c_id in c_actives[year]]
    c_pub_edgellhs[year] = [c_pub_edgellhs[year][c_id] for c_id in c_actives[year]]

    # Context managers guarantee each file is flushed and closed after the dump.
    with open(f'cite_input/c_cite_inputs_{year}', 'wb') as cite_file:
        pickle.dump((c_actives[year], c_positions[year], ca_adjs[year], c_embeddings[year], c_cite_edgellhs[year]), cite_file, -1)
    with open(f'pub_input/c_pub_inputs_{year}', 'wb') as pub_file:
        pickle.dump((c_actives[year], c_positions[year], ca_adjs[year], c_embeddings[year], c_pub_edgellhs[year]), pub_file, -1)

    print(f'Finish year {year} node size {len(c_actives[year])} edge size {len(ca_adjs[year][0])}')

# Persist the sorted ids and the edge arrays for the later cells that reload them.
with open('content/c_actives.pkl', 'wb') as actives_file:
    pickle.dump(c_actives, actives_file, -1)
with open('content/active_ca_adjs.pkl', 'wb') as adjs_file:
    pickle.dump(ca_adjs, adjs_file, -1)

## Author Input

In [None]:
# Suffixes of the author pickle files ('author/authors_<group>.pkl') that the
# collection cell iterates over.
group_names = [
    2000, 2013,
    201700, 201701,
    201800, 201801,
    *range(201900, 201915),  # 201900 .. 201914 inclusive
]

In [None]:
# Name <-> id lookups for authors: a name may map to several ids.
name_id = defaultdict(list)
id_name = dict()

# Per-year accumulators for author nodes, keyed first by year.
a_actives = defaultdict(list)
(a_positions,
 ac_adjs,
 a_embeddings,
 a_cite_edgellhs,
 a_pub_edgellhs) = (defaultdict(dict) for _ in range(5))

# Per-author values keyed by author id, plus the per-year "latest embedding" map.
a_latest_cite_dists, a_latest_pub_dists = dict(), dict()
a_latest_embeddings = defaultdict(dict)

In [None]:
# Collect per-year author data from every author group file.
# Seed the global NumPy RNG so the Dirichlet draws below are repeatable.
np.random.seed(1)
# NOTE: pickle.load can execute arbitrary code while unpickling; only read
# files that were produced by this project's own pipeline.
with open('content/c_actives.pkl', 'rb') as actives_file:
    c_actives = pickle.load(actives_file)

# Per-year sets of active content ids, built lazily. Testing membership against
# the stored arrays directly would rescan the whole array for every
# (author, content) pair.
c_active_sets = {}

# NOTE(review): the first entry of group_names (2000) is skipped here — confirm this is intended.
for group_name in group_names[1:]:
    print('Reading group {}'.format(group_name))
    with open('author/authors_{}.pkl'.format(group_name), 'rb') as authors_file:
        authors = pickle.load(authors_file)

    print('Start collection for group {}'.format(group_name))
    for author in authors.values():

        name_id[author.name].append(author.id)
        id_name[author.id] = author.name
        # Random initial distributions per author (sizes 16 and 8).
        a_latest_cite_dists[author.id] = np.random.dirichlet([1]*16)
        a_latest_pub_dists[author.id] = np.random.dirichlet([1]*8)

        # Embedding from the author's previous recorded year; zeros before the first one.
        latest_embedding = np.zeros(50)
        for year in sorted(author.outcitations):
            if year < 2000:
                continue

            if year not in c_active_sets:
                c_active_sets[year] = set(np.asarray(c_actives[year]).tolist())
            active_contents = c_active_sets[year]

            a_actives[year].append(author.id)
            # Keep only the author's contents that are active in this year.
            ac_adjs[year][author.id] = [content for content in author.contents[year] if content in active_contents]
            a_embeddings[year][author.id] = author.embeddings[year]
            a_cite_edgellhs[year][author.id] = author.cite_edgellhs[year]
            a_pub_edgellhs[year][author.id] = author.pub_edgellhs[year]

            # Store the previous embedding first, then advance it to this year's.
            a_latest_embeddings[year][author.id] = latest_embedding
            latest_embedding = author.embeddings[year]

    # Drop the group's objects before the next file is loaded.
    del authors
    print('Finish collection for group {}'.format(group_name))
    print()

In [None]:
# Persist the author lookups and initial distributions, then turn the per-year
# author accumulators into sorted, position-aligned arrays/lists and write one
# "cite" and one "pub" input file per year.
# Context managers guarantee each file is flushed and closed after its dump.
with open('author/name_id.pkl', 'wb') as out_file:
    pickle.dump(name_id, out_file, -1)
with open('author/id_name.pkl', 'wb') as out_file:
    pickle.dump(id_name, out_file, -1)
with open('cite_input/a_latest_cite_dists_1999.pkl', 'wb') as out_file:
    pickle.dump(a_latest_cite_dists, out_file, -1)
with open('pub_input/a_latest_pub_dists_1999.pkl', 'wb') as out_file:
    pickle.dump(a_latest_pub_dists, out_file, -1)

# NOTE: the loop overwrites the accumulators in place (dict -> array/list), so
# this cell cannot be re-run without first re-running the collection cell.
for year in range(2000, 2019):

    # Sorted ids define the row order of every per-year structure below.
    a_actives[year] = np.sort(a_actives[year])
    a_positions[year] = dict((a, i) for i, a in enumerate(a_actives[year]))

    # One (author id, content id) pair per edge, contents sorted within each author.
    author_content_pairs = [[k, v] for k in a_actives[year] for v in np.sort(ac_adjs[year][k])]
    # reshape(-1, 2) keeps the result 2-row even when a year has no edges, so
    # ac_adjs[year][0] below (and in later cells) stays indexable.
    ac_adjs[year] = np.array(author_content_pairs).reshape(-1, 2).T

    a_embeddings[year] = np.array([a_embeddings[year][a_id] for a_id in a_actives[year]])
    a_cite_edgellhs[year] = [a_cite_edgellhs[year][a_id] for a_id in a_actives[year]]
    a_pub_edgellhs[year] = [a_pub_edgellhs[year][a_id] for a_id in a_actives[year]]
    a_latest_embeddings[year] = np.array([a_latest_embeddings[year][a_id] for a_id in a_actives[year]])

    with open(f'cite_input/a_cite_inputs_{year}', 'wb') as cite_file:
        pickle.dump((a_actives[year], a_positions[year], ac_adjs[year], a_embeddings[year], a_latest_embeddings[year], a_cite_edgellhs[year]), cite_file, -1)
    with open(f'pub_input/a_pub_inputs_{year}', 'wb') as pub_file:
        pickle.dump((a_actives[year], a_positions[year], ac_adjs[year], a_embeddings[year], a_latest_embeddings[year], a_pub_edgellhs[year]), pub_file, -1)

    print(f'Finish year {year} node size {len(a_actives[year])} edge size {len(ac_adjs[year][0])}')

# Persist the sorted ids and the edge arrays for the validation cells.
with open('author/a_actives.pkl', 'wb') as actives_file:
    pickle.dump(a_actives, actives_file, -1)
with open('author/active_ac_adjs.pkl', 'wb') as adjs_file:
    pickle.dump(ac_adjs, adjs_file, -1)

## Validate

In [None]:
# Reload the persisted id lists and edge arrays from disk so the validation
# below checks what was actually written, not the in-memory state.
# NOTE: pickle.load can execute arbitrary code while unpickling; only read
# files that were produced by this project's own pipeline.
# Context managers close each file handle as soon as its load finishes.
with open('content/active_ca_adjs.pkl', 'rb') as in_file:
    ca_adjs = pickle.load(in_file)
with open('content/c_actives.pkl', 'rb') as in_file:
    c_actives = pickle.load(in_file)
with open('author/active_ac_adjs.pkl', 'rb') as in_file:
    ac_adjs = pickle.load(in_file)
with open('author/a_actives.pkl', 'rb') as in_file:
    a_actives = pickle.load(in_file)

In [None]:
# Gather, across all years, three views of the content ids and three views of
# the author ids: the active-id lists and the two rows of each edge array.
c_set0, c_set1, c_set2 = (set() for _ in range(3))
a_set0, a_set1, a_set2 = (set() for _ in range(3))

for year in range(2000, 2019):

    # Content ids: active list, row 0 of content->author edges, row 1 of author->content edges.
    c_set0.update(c_actives[year])
    c_set1.update(ca_adjs[year][0])
    c_set2.update(ac_adjs[year][1])

    # Author ids: active list, row 1 of content->author edges, row 0 of author->content edges.
    a_set0.update(a_actives[year])
    a_set1.update(ca_adjs[year][1])
    a_set2.update(ac_adjs[year][0])

In [None]:
# Each flag is True only when all three views of the id universe coincide.
contents_consistent = (c_set0 == c_set1) and (c_set1 == c_set2)
authors_consistent = (a_set0 == a_set1) and (a_set1 == a_set2)
print(contents_consistent, authors_consistent)