In [None]:
import pickle
import numpy as np
from collections import defaultdict

from Author import *
from Content import *

In [None]:
# Names of the author groups written by the grouping cell below:
#   2000           -> single group keyed 2000
#   2013           -> single group keyed 2013
#   201700-201701  -> two shards
#   201800-201801  -> two shards
#   201900-201914  -> fifteen shards
group_names = (
    [2000, 2013]
    + [201700 + shard for shard in range(2)]
    + [201800 + shard for shard in range(2)]
    + [201900 + shard for shard in range(15)]
)

## Divide & Create Authors

In [None]:
# Compute, for every author, the latest year in which they were "active":
# the max over all of their contents of (publication year, latest year that
# content received an incoming citation). Then bucket author ids by that year
# and persist the buckets to author/active_authors.pkl.
start_year, end_year = 1980, 2019  # end_year is exclusive (range below)

# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# produced by this project's own pipeline.
# These two lookups are not read in this cell; they are still loaded here, as
# before, so the names exist in the kernel namespace afterwards.
with open('content/content_authors.pkl', 'rb') as f:
    content_authors = pickle.load(f)
with open('content/content_year.pkl', 'rb') as f:
    content_year = pickle.load(f)

# author_id -> last active year; missing authors default to 0.
author_last_active = defaultdict(int)

for year in range(start_year, end_year):

    # `with` closes each per-year file deterministically instead of relying
    # on garbage collection of the anonymous handle.
    with open('content/contents_{}.pkl'.format(year), 'rb') as f:
        contents = pickle.load(f)

    for content in contents.values():

        # Largest key of content.incitations (iterated as year -> contents in
        # a later cell), falling back to the publication year when the content
        # has no incoming citations. Never earlier than the publication year.
        last_active = max(max(content.incitations, default=year), year)

        for author_id, _, _ in content.authors:
            author_last_active[author_id] = max(author_last_active[author_id], last_active)

# Invert the mapping: last active year -> list of author ids.
active_authors = defaultdict(list)
for author, year in author_last_active.items():
    active_authors[year].append(author)

with open('author/active_authors.pkl', 'wb') as f:
    pickle.dump(active_authors, f, -1)  # -1 == highest available protocol

In [None]:
# Partition authors into named groups by their last-active year, so that each
# group can be processed (and pickled) separately by the following cells.
#
# Each spec is (last-active years, base group name, number of shards).
# Within a single year, authors are dealt round-robin over the shards by their
# position in active_authors[year]; the position restarts at 0 for every year.
# With one shard, `index % 1` is always 0, so the group key is the base name.
author_groups = defaultdict(set)
group_specs = [
    (range(1980, 2000), 2000, 1),
    (range(2000, 2013), 2013, 1),
    (range(2013, 2017), 201700, 2),
    (range(2017, 2018), 201800, 2),
    (range(2018, 2019), 201900, 15),
]

for years, base_name, n_shards in group_specs:
    for year in years:
        for index, author in enumerate(active_authors[year]):
            author_groups[base_name + index % n_shards].add(author)

# `with` guarantees the output file is flushed and closed before later cells
# read it back.
with open('author/author_groups.pkl', 'wb') as f:
    pickle.dump(author_groups, f, -1)

# Sanity check: total authors bucketed by year vs. total authors assigned to a
# group. The two numbers differ if any author's last-active year lies outside
# 1980-2018 (e.g. a citation year later than 2018), since such years are not
# covered by any spec above.
print(sum(len(authors) for authors in active_authors.values()),
      sum(len(authors) for authors in author_groups.values()))

In [None]:
# Build the Author objects of each group and write one pickle per group
# (author/authors_<group_name>.pkl). All per-year content files are re-read
# for every group, so only one group's Author objects are held at a time.
start_year, end_year = 1980, 2019  # end_year is exclusive

# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# produced by this project's own pipeline.
with open('content/content_authors.pkl', 'rb') as f:
    content_authors = pickle.load(f)   # content id -> authors of that content
with open('content/content_year.pkl', 'rb') as f:
    content_year = pickle.load(f)      # content id -> year
with open('author/author_groups.pkl', 'rb') as f:
    author_groups = pickle.load(f)     # group name -> set of author ids

for group_name, group_authors in author_groups.items():
    authors = {}  # author id -> Author, for this group only

    for year in range(start_year, end_year):
        # Close each per-year file right after loading; this loop opens
        # (number of groups) x (number of years) files in total.
        with open('content/contents_{}.pkl'.format(year), 'rb') as f:
            contents = pickle.load(f)

        for content in contents.values():
            for author_id, author_name, author_org in content.authors:
                if author_id not in group_authors:
                    continue

                # First content seen for this author: create the Author.
                if author_id not in authors:
                    data = {'id': author_id, 'name': author_name, 'org': author_org}
                    authors[author_id] = Author(data)

                authors[author_id].update_contents(year, content.id, content.field, content.venue)

                # One (this year, cited content's year, cited author) tuple per
                # author of every content this content cites.
                outcitations = [(year, content_year[in_content], in_author)
                                for in_content in content.outcitations
                                for in_author in content_authors[in_content]]
                authors[author_id].update_outcitations(outcitations)

                # One (citing year, this year, citing author) tuple per author
                # of every content that cites this content.
                incitations = [(out_year, year, out_author)
                               for out_year, out_contents in content.incitations.items()
                               for out_content in out_contents
                               for out_author in content_authors[out_content]]
                authors[author_id].update_incitations(incitations)

    # Called once per author after all of their contents have been added.
    for author in authors.values():
        author.update_fields()

    with open('author/authors_{}.pkl'.format(group_name), 'wb') as f:
        pickle.dump(authors, f, -1)
    # The two counts match when every author id in the group was found in at
    # least one content file.
    print(f'Done for group {group_name}: {len(group_authors)}=={len(authors)}')
    del authors  # release this group's objects before building the next

In [None]:
# Collect two lookups across all author groups and persist them:
#   author_venues[author id]         = author.venues
#   author_contents[author id][year] = that author's contents for the year
author_venues = {}
author_contents = defaultdict(dict)

for group_name in group_names:
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles
    # produced by this project's own pipeline.
    with open('author/authors_{}.pkl'.format(group_name), 'rb') as f:
        authors = pickle.load(f)

    for author in authors.values():
        author_venues[author.id] = author.venues
        # Copied entry by entry so that an author with no contents does not
        # get an (empty) entry in author_contents.
        for year, contents in author.contents.items():
            author_contents[author.id][year] = contents

    print(f'Done for group {group_name}')

# `with` makes sure both outputs are flushed and closed.
with open('author/author_venues.pkl', 'wb') as f:
    pickle.dump(author_venues, f, -1)
with open('author/author_contents.pkl', 'wb') as f:
    pickle.dump(author_contents, f, -1)

## A-A & A-C & A-U Graph

In [None]:
# Build three adjacency maps over year-stamped nodes and persist them.
# A node key is an id with the year appended as a string (id + str(year)), so
# author ids, content ids and venues must be strings here.
#   aa_adjs: author@outyear -> [cited author@inyear, ...]  (from outcitations)
#   ac_adjs: author@year    -> [content@year, ...]
#   au_adjs: author@year    -> [venue@year, ...]
aa_adjs = defaultdict(list)
ac_adjs = {}
au_adjs = {}

for group_name in group_names:
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles
    # produced by this project's own pipeline.
    with open('author/authors_{}.pkl'.format(group_name), 'rb') as f:
        authors = pickle.load(f)

    for author_id, author in authors.items():
        for year, contents in author.contents.items():
            ac_adjs[author_id + str(year)] = [content + str(year) for content in contents]
        for year, venues in author.venues.items():
            au_adjs[author_id + str(year)] = [venue + str(year) for venue in venues]
        # author.outcitations maps outyear -> iterable of (inyear, inauthor).
        for outyear, outcitations in author.outcitations.items():
            for inyear, inauthor in outcitations:
                aa_adjs[author_id + str(outyear)].append(inauthor + str(inyear))

    del authors  # free this group before loading the next one
    print('done for group {}'.format(group_name))

# `with` makes sure each output is flushed and closed.
with open('author/aa_adjs.pkl', 'wb') as f:
    pickle.dump(aa_adjs, f, -1)
with open('author/ac_adjs.pkl', 'wb') as f:
    pickle.dump(ac_adjs, f, -1)
with open('author/au_adjs.pkl', 'wb') as f:
    pickle.dump(au_adjs, f, -1)

In [None]:
# Attach precomputed embeddings to every Author, one per year the author has
# contents in, and write each group's pickle back in place.
# Embeddings are looked up by the key author_id + str(year); a missing key
# raises KeyError.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# produced by this project's own pipeline.
with open('author/a_embs.pkl', 'rb') as f:
    a_embs = pickle.load(f)

update_counts = 0
for group_name in group_names:

    print('Reading group {}'.format(group_name))
    # The read handle is closed here, before the same path is reopened for
    # writing further down.
    with open('author/authors_{}.pkl'.format(group_name), 'rb') as f:
        authors = pickle.load(f)

    print('Embedding-update start for group {}'.format(group_name))
    for author_id, author in authors.items():
        for year in author.contents.keys():
            author.update_embeddings(year, a_embs[author_id + str(year)])
            update_counts += 1

    print('Writing group {}'.format(group_name))
    # Overwrites the file that was just read; `with` guarantees the data is
    # flushed and the handle closed before moving on to the next group.
    with open('author/authors_{}.pkl'.format(group_name), 'wb') as f:
        pickle.dump(authors, f, protocol=pickle.HIGHEST_PROTOCOL)
    del authors  # free this group before loading the next one

    print('Embedding-update done for group {}'.format(group_name))
    print()

# The two numbers are equal when a_embs holds exactly one entry per
# (author, year) pair visited above.
print('Count check: {} vs {}'.format(len(a_embs), update_counts))