In [None]:
import json
import pickle
import numpy as np
from collections import defaultdict
from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD

from Content import *

## Extract Contents

In [None]:
def validate(line, next_venue_id):
    """Turn one parsed paper record into the normalised dict used downstream.

    A record is accepted only when it carries 'id', 'title', 'year', 'fos',
    'venue' (with a nested 'raw' entry) and 'authors'.

    Args:
        line: dict parsed from one JSON line of the raw dump.
        next_venue_id: next unused integer for building a 'V<n>' venue id.

    Returns:
        ``(data, next_venue_id)`` where ``data`` is the normalised dict, or
        ``None`` when a required field is missing. ``next_venue_id`` comes
        back incremented only when a previously unseen venue was registered.

    Side effects:
        Registers unseen raw venue strings in the module-level ``venue_id``
        dict (raw venue string -> 'V<n>').
    """
    # Top-level fields are checked in a fixed order; the nested 'raw' lookup is
    # only attempted once 'venue' is known to exist.
    for required_key in ('id', 'title', 'year', 'fos', 'venue'):
        if required_key not in line:
            return None, next_venue_id
    if 'raw' not in line['venue']:
        return None, next_venue_id
    if 'authors' not in line:
        return None, next_venue_id

    # Allocate a fresh venue id the first time a raw venue string is seen.
    raw_venue = line['venue']['raw']
    if raw_venue not in venue_id:
        venue_id[raw_venue] = f'V{next_venue_id}'
        next_venue_id += 1

    # Ids get a one-letter prefix: 'C' for papers (and cited papers), 'A' for authors.
    if 'references' in line:
        cited = ['C' + ref for ref in line['references']]
    else:
        cited = []

    record = {
        'id': 'C' + line['id'],
        'title': line['title'],
        'year': int(line['year']),
        'venue': venue_id[raw_venue],
        'fos': [(entry.get('name'), float(entry.get('w'))) for entry in line['fos']],
        'authors': [
            ('A' + person.get('id'), person.get('name'), person.get('org'))
            for person in line['authors']
        ],
        'outcitations': cited,
    }
    return record, next_venue_id

In [None]:
def extract_years(years):
    """Scan the raw paper dump and pickle one ``{content_id: Content}`` dict per year.

    The input file ``raw/dblp_papers_v11.txt`` is read as one JSON object per
    line. Records whose year falls inside the requested window and that pass
    ``validate`` are wrapped in ``Content`` and grouped by year.

    Args:
        years: ``(start, end)`` pair; records with ``start <= year < end`` are kept.

    Side effects:
        * Adds every kept record to the module-level ``content_pool`` set and
          ``content_year`` dict; ``validate`` additionally fills ``venue_id``.
        * Writes ``content/contents_<year>.pkl`` for every year in the window,
          including years for which no record was kept (an empty dict).
        * Prints progress every 100000 lines and a final summary.
    """
    contents = defaultdict(dict)
    next_venue_id = 0
    # Tracked separately from the loop index so an empty input file yields
    # "Total: 0" instead of a NameError on an unbound loop variable.
    total_lines = 0

    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open('raw/dblp_papers_v11.txt', 'r', encoding='utf-8') as raw_file:
        for i, raw_line in enumerate(raw_file):
            total_lines = i + 1

            record = json.loads(raw_line)
            if 'year' in record and years[0] <= int(record['year']) < years[1]:
                data, next_venue_id = validate(record, next_venue_id)

                if data is not None:
                    contents[data['year']][data['id']] = Content(data)
                    content_year[data['id']] = data['year']
                    content_pool.add(data['id'])

            if i % 100000 == 0:
                print('Extraction done for line {}'.format(i))

    kept = sum(len(contents[year]) for year in range(years[0], years[1]))
    print('Total: {}, Remaining: {}'.format(total_lines, kept))

    # One pickle per year; the with-block guarantees each file is flushed and
    # closed before the next one is opened.
    for year in range(years[0], years[1]):
        with open('content/contents_{}.pkl'.format(year), 'wb') as out_file:
            pickle.dump(contents[year], out_file, -1)

In [None]:
# Half-open year window: records with start_year <= year < end_year are extracted.
start_year, end_year = 1980, 2019

# Module-level registries that extract_years() / validate() fill as a side effect.
content_pool = set()   # ids of every kept paper
content_year = {}      # paper id -> publication year
venue_id = {}          # raw venue string -> 'V<n>' id

extract_years((start_year,end_year))

# Persist the registries. Each file is opened in a with-block so the handle is
# flushed and closed deterministically instead of being left to the garbage collector.
with open('content/content_pool.pkl', 'wb') as out_file:
    pickle.dump(content_pool, out_file, -1)
with open('content/content_year.pkl', 'wb') as out_file:
    pickle.dump(content_year, out_file, -1)
with open('venue/venue_id.pkl', 'wb') as out_file:
    pickle.dump(venue_id, out_file, -1)

## Update Out/In-citations

In [None]:
start_year, end_year = 1980, 2019
with open('content/content_year.pkl', 'rb') as in_file:
    content_year = pickle.load(in_file)
with open('content/content_pool.pkl', 'rb') as in_file:
    content_pool = pickle.load(in_file)

# cited paper id -> list of (citing paper id, citing paper year)
incitations_dict = defaultdict(list)

# Pass 1: keep only out-citations that point to a paper in the pool published
# strictly earlier than the citing paper, and record the reverse edge.
for year in range(start_year, end_year):
    contents_path = 'content/contents_{}.pkl'.format(year)
    with open(contents_path, 'rb') as in_file:
        contents = pickle.load(in_file)
    for out_content in contents.values():
        outcitations = []
        for in_content in out_content.outcitations:
            if in_content in content_pool and content_year[in_content] < out_content.year:
                incitations_dict[in_content].append((out_content.id, out_content.year))
                outcitations.append(in_content)
        out_content.outcitations = outcitations
    with open(contents_path, 'wb') as out_file:
        pickle.dump(contents, out_file, -1)
    print('Out citation done for year {}'.format(year))

# Pass 2: runs only after pass 1 has covered every year, because the papers that
# cite a given paper live in later years' files.
for year in range(start_year, end_year):
    contents_path = 'content/contents_{}.pkl'.format(year)
    with open(contents_path, 'rb') as in_file:
        contents = pickle.load(in_file)
    for out_content in contents.values():
        # .get() avoids inserting an empty list into the defaultdict for every
        # paper that is never cited; the callee still receives a fresh list.
        out_content.update_incitations(incitations_dict.get(out_content.id, []))
    with open(contents_path, 'wb') as out_file:
        pickle.dump(contents, out_file, -1)
    print('In citation done for year {}'.format(year))

## Field

In [None]:
start_year, end_year = 1980, 2019

# Registries rebuilt from the per-year content pickles.
field_pool = set()       # every field-of-study name seen in any paper
venue_pool = set()       # every venue id seen in any paper
content_pool = set()     # every paper id
content_year = {}        # paper id -> year of the file it was stored in
content_authors = {}     # paper id -> list of author ids

paper_count = 0 # 3974198 
for year in range(start_year, end_year):
    with open('content/contents_{}.pkl'.format(year), 'rb') as in_file:
        contents = pickle.load(in_file)
    paper_count += len(contents)
    for content in contents.values():
        for field in content.fos:
            # fos entries are (name, weight) pairs; only the name is pooled.
            field_pool.add(field[0])
        venue_pool.add(content.venue)
        content_pool.add(content.id)
        content_year[content.id] = year
        content_authors[content.id] = [author[0] for author in content.authors]
    print(f'Collection done for year {year}')

# Persist the registries; with-blocks close every handle deterministically.
with open('content/field_pool.pkl', 'wb') as out_file:
    pickle.dump(field_pool, out_file, -1)
with open('venue/venue_pool.pkl', 'wb') as out_file:
    pickle.dump(venue_pool, out_file, -1)
with open('content/content_pool.pkl', 'wb') as out_file:
    pickle.dump(content_pool, out_file, -1)
with open('content/content_year.pkl', 'wb') as out_file:
    pickle.dump(content_year, out_file, -1)
with open('content/content_authors.pkl', 'wb') as out_file:
    pickle.dump(content_authors, out_file, -1)

# Column index for each field name. The mapping follows the set's iteration
# order and is not persisted, so it is only valid for the current kernel session.
field_dict = dict((field,index) for index,field in enumerate(field_pool))
field_count = len(field_pool) # 106650

print(paper_count, field_count)

In [None]:
# Builds the (paper x field) weight matrix.
# Requires paper_count, field_count, field_dict, start_year and end_year from
# the collection cell above to be present in the kernel.
index = 0  # row number; papers are numbered in year order, then dict order within a year
field_weights = lil_matrix((paper_count, field_count), dtype=np.float32)
for year in range(start_year, end_year):
    with open('content/contents_{}.pkl'.format(year), 'rb') as in_file:
        contents = pickle.load(in_file)
    for out_content in contents.values():
        for field, weight in out_content.fos:
            field_weights[index, field_dict[field]] = weight
        index += 1
    print('Field done for year {}'.format(year))

In [None]:
# Reduce the sparse field-weight matrix to 100 dense components per paper.
# TruncatedSVD's default solver is randomized; pinning random_state makes the
# reduced vectors reproducible across reruns.
field_reduced = TruncatedSVD(n_components=100, random_state=0).fit_transform(field_weights.tocsr())
with open('content/field_reduced.pkl', 'wb') as out_file:
    pickle.dump(field_reduced, out_file, protocol=pickle.HIGHEST_PROTOCOL)
print('LSA done')

In [None]:
start_year, end_year = 1980, 2019
with open('content/field_reduced.pkl', 'rb') as in_file:
    field_reduced = pickle.load(in_file)

# Row `index` of field_reduced belongs to the index-th paper when walking the
# years in order and each year's dict in its stored order, which is the same
# walk that filled the field-weight matrix.
index = 0
for year in range(start_year, end_year):
    contents_path = 'content/contents_{}.pkl'.format(year)
    with open(contents_path, 'rb') as in_file:
        contents = pickle.load(in_file)
    print(year, len(contents))
    for out_content in contents.values():
        out_content.update_field(field_reduced[index])
        index += 1
    with open(contents_path, 'wb') as out_file:
        pickle.dump(contents, out_file, protocol=pickle.HIGHEST_PROTOCOL)
print('Total number of contents:', index)

## C-C & C-A & C-U Graph

In [None]:
# Same half-open year window as the other cells, declared here so the cell
# does not depend on kernel state.
start_year, end_year = 1980, 2019

# Adjacency lists keyed by '<content id><year>' node names.
cc_adjs = {}   # content -> cited contents, each suffixed with the cited paper's own year
ca_adjs = {}   # content -> its authors, suffixed with the content's year
cu_adjs = {}   # content -> single-element list holding its venue, suffixed with the content's year
with open('content/content_year.pkl', 'rb') as in_file:
    content_year = pickle.load(in_file)

for year in range(start_year, end_year):
    with open('content/contents_{}.pkl'.format(year), 'rb') as in_file:
        contents = pickle.load(in_file)
    for content in contents.values():
        node = content.id+str(year)
        cc_adjs[node] = [outid+str(content_year[outid]) for outid in content.outcitations]
        ca_adjs[node] = [author_id+str(year) for author_id,_,_ in content.authors]
        cu_adjs[node] = [content.venue+str(year)]
    print('done for year {}'.format(year))

with open('content/cc_adjs.pkl', 'wb') as out_file:
    pickle.dump(cc_adjs, out_file, protocol=pickle.HIGHEST_PROTOCOL)
with open('content/ca_adjs.pkl', 'wb') as out_file:
    pickle.dump(ca_adjs, out_file, protocol=pickle.HIGHEST_PROTOCOL)
with open('content/cu_adjs.pkl', 'wb') as out_file:
    pickle.dump(cu_adjs, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Same half-open year window as the other cells, declared here so the cell
# does not depend on kernel state.
start_year, end_year = 1980, 2019

# Embeddings keyed by '<content id><year>'.
# NOTE(review): content/c_embs.pkl is not written anywhere in this notebook;
# presumably it is produced by a separate embedding step. Confirm before running.
with open('content/c_embs.pkl', 'rb') as in_file:
    c_embs = pickle.load(in_file)

update_counts = 0
for year in range(start_year, end_year):

    print('Reading year {}'.format(year))
    contents_path = 'content/contents_{}.pkl'.format(year)
    with open(contents_path, 'rb') as in_file:
        contents = pickle.load(in_file)

    print('Embedding-update start for year {}'.format(year))
    for content in contents.values():
        content.update_embedding(c_embs[content.id+str(year)])
        update_counts += 1

    print('Writing year {}'.format(year))
    with open(contents_path, 'wb') as out_file:
        pickle.dump(contents, out_file, -1)
    # Drop the year's dict before the next one is loaded.
    del contents

    print('Embedding-update done for year {}'.format(year))
    print()

# The two numbers should match if every embedding belongs to exactly one content.
print('Count check: {} vs {}'.format(len(c_embs), update_counts))

## Extra Information

In [None]:
# Same half-open year window as the other cells, declared here so the cell
# does not depend on kernel state.
start_year, end_year = 1980, 2019

with open('content/content_year.pkl', 'rb') as in_file:
    content_year = pickle.load(in_file)
with open('content/content_authors.pkl', 'rb') as in_file:
    content_authors = pickle.load(in_file)

# authoryear_pool / venueyear_pool: every '<author id><year>' / '<venue id><year>' seen.
authoryear_pool, venueyear_pool = set(), set()
# content_inauthoryears: content id -> '<author id><year>' of every author of every
#   cited paper, using the cited paper's year. Contents without citations get no key.
# content_venueyears: content id -> '<venue id><year>' of the content itself.
content_inauthoryears, content_venueyears = defaultdict(list), {}

for year in range(start_year, end_year):

    print('Reading year {}'.format(year))
    with open('content/contents_{}.pkl'.format(year), 'rb') as in_file:
        contents = pickle.load(in_file)

    print('Collection start for year {}'.format(year))
    for content in contents.values():

        for author,_,_ in content.authors:
            authoryear_pool.add(author+str(year))
        venueyear_pool.add(content.venue+str(year))

        for incontent in content.outcitations:
            for inauthor in content_authors[incontent]:
                content_inauthoryears[content.id].append(inauthor+str(content_year[incontent]))
        content_venueyears[content.id] = content.venue+str(year)

    print('Collection done for year {}'.format(year))
    print()

# with-blocks close each output file deterministically.
with open('author/authoryear_pool.pkl', 'wb') as out_file:
    pickle.dump(authoryear_pool, out_file, -1)
with open('venue/venueyear_pool.pkl', 'wb') as out_file:
    pickle.dump(venueyear_pool, out_file, -1)
with open('content/content_inauthoryears.pkl', 'wb') as out_file:
    pickle.dump(content_inauthoryears, out_file, -1)
with open('content/content_venueyears.pkl', 'wb') as out_file:
    pickle.dump(content_venueyears, out_file, -1)

In [None]:
# Same half-open year window as the other cells, declared here so the cell
# does not depend on kernel state.
start_year, end_year = 1980, 2019

# content id -> list of cited content ids, only for contents that cite at least one paper.
content_incontents = {}

for year in range(start_year, end_year):

    print('Reading year {}'.format(year))
    with open('content/contents_{}.pkl'.format(year), 'rb') as in_file:
        contents = pickle.load(in_file)

    print('Collection start for year {}'.format(year))
    for content in contents.values():

        if len(content.outcitations)>0:
            content_incontents[content.id] = content.outcitations

    print('Collection done for year {}'.format(year))
    print()

# with-block closes the output file deterministically.
with open('content/content_incontents.pkl', 'wb') as out_file:
    pickle.dump(content_incontents, out_file, -1)