In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import os


import torch
from torch.utils.tensorboard import SummaryWriter
from libs.ScisciDataset import ScisciDataset
import torch_geometric.transforms as T
from torch_geometric.nn import GAE

from libs.ComputeResult import EncoderGCN
from torch_geometric.nn.models import InnerProductDecoder


from sklearn.metrics.pairwise import cosine_similarity


# create dataset with embeddings

In [2]:
# Build the project dataset: processed files live under data/bert, raw inputs are read
# from ../../data/torch/bert/, and PyG's NormalizeFeatures transform is attached.
dataset_bert = ScisciDataset(root='data/bert',raw_path='../../data/torch/bert/',transform=T.NormalizeFeatures())

In [3]:
# Load the eid row-order table (its 'index' column is discarded) and the article metadata.
# NOTE: unpickling can execute arbitrary code; only read pickles produced by this project.
indices = pd.read_pickle('../../data/torch/bert/eid_row_indices.p').drop(columns='index')
df = pd.read_pickle('../../data/dataset.p')

In [4]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Graph auto-encoder: the project's GCN encoder (32 output channels) with an inner-product decoder.
model = GAE(encoder = EncoderGCN(in_channels=dataset_bert.num_features, out_channels=32),
    decoder=InnerProductDecoder())

# map_location remaps the stored tensors onto `device`, so a checkpoint saved from a GPU
# session still loads when this notebook runs on the CPU fallback.
model.load_state_dict(torch.load('models/GCN_bert_model.pt', map_location=device))
model = model.to(device).eval()

# The dataset's first (index 0) graph is the one that gets encoded.
data = dataset_bert[0]

x = data.x.to(device)
edge_index = data.edge_index.to(device)

# Pure inference: no autograd graph is needed for the encoded node vectors.
with torch.no_grad():
    z = model.encode(x,edge_index)
z = z.detach().cpu().numpy()

In [5]:
# Left-join the metadata onto the row-order table (keyed by eid), keeping the order of `indices`,
# then store one encoded vector per row as a plain Python list.
df = indices.merge(df, how='left', on='eid')

df['embedding'] = z.tolist()

In [6]:
# Directory where the TensorBoard projector files are written.
log_dir='runs/projector/'
# exist_ok=True makes the call safe to re-run and avoids the check-then-create race.
os.makedirs(log_dir, exist_ok=True)

In [7]:
def extract_element(l,pos=0):
    """Return the item at index ``pos`` of every entry in ``l``.

    Parameters
    ----------
    l : sequence of indexable entries, None, or float NaN
        A missing value (None or NaN) is accepted and treated as "no entries".
    pos : int, default 0
        Index read from each entry.

    Returns
    -------
    list
        One item per entry of ``l``; ``[np.nan]`` when ``l`` is missing.
    """
    # pandas marks missing cells with float NaN as well as None (e.g. after a left merge),
    # so both count as missing. The isinstance guard keeps `l != l` away from sequences.
    if l is None or (isinstance(l, float) and l != l):
        return [np.nan]
    return [entry[pos] for entry in l]

In [8]:
# Keep only the rows that carry affiliation data; work on an independent copy afterwards.
df = df.loc[df['affiliation'].notnull()].copy()

In [9]:
def build_metadata(df):
    """Add the derived metadata columns to ``df`` and return it.

    The frame is modified in place; the same object is returned.

    Reads the columns ``authors``, ``affiliation``, ``citedby_count`` and ``coverDate``.
    Writes:
      * ``n_authors`` - length of ``authors``
      * ``n_institutions`` - number of distinct values at position 1 of the affiliation entries
      * ``first_author_country`` - position 3 of the first affiliation entry (NaN if there is none)
      * ``citedby_rank`` - 'zero', 'low', 'mid-low', 'mid-high' or 'high', split at the
        quartiles of ``citedby_count``
      * ``collaboration_status`` - 'unique_author', 'internal_colab', 'institutional_colab'
        or 'international_colab'
      * ``coverDate`` - replaced by its leading integer (the text before the first '-')

    The function can be applied again to a frame it already processed.
    """
    # Extract the two affiliation fields once instead of once per derived column.
    institutions = df.affiliation.apply(lambda x: extract_element(x, 1))
    countries = df.affiliation.apply(lambda x: extract_element(x, pos=3))

    #number of authors and institutions
    df['n_authors'] = df.authors.apply(len)
    df['n_institutions'] = institutions.apply(lambda values: len(set(values)))

    # first author country; an empty affiliation list yields NaN instead of an IndexError
    df['first_author_country'] = countries.apply(lambda values: values[0] if values else np.nan)

    # citation ranking
    q1,q2,q3 = df.citedby_count.quantile([.25, .5, .75])

    df['citedby_rank'] = None
    df.loc[df.citedby_count==0,'citedby_rank'] = 'zero'
    df.loc[(df.citedby_count > 0) & (df.citedby_count <= q1),'citedby_rank'] = 'low'
    df.loc[(df.citedby_count >q1) & (df.citedby_count <= q2),'citedby_rank'] = 'mid-low'
    df.loc[(df.citedby_count >q2) & (df.citedby_count <= q3),'citedby_rank'] = 'mid-high'
    df.loc[(df.citedby_count >q3),'citedby_rank'] = 'high'

    # collaboration masks
    international_collaboration_mask = countries.apply(lambda values: len(set(values)) > 1)
    institutional_collaboration_mask = df.n_institutions > 1
    internal_collaboration_mask = (df.n_institutions == 1) & (df.n_authors >1)
    unique_author_mask = df.n_authors == 1

    # Assignment order matters: later labels overwrite earlier ones, so a row matching
    # several masks ends up with the last matching label.
    df['collaboration_status'] = None
    df.loc[unique_author_mask,'collaboration_status']                = 'unique_author'
    df.loc[internal_collaboration_mask,'collaboration_status']       = 'internal_colab'
    df.loc[institutional_collaboration_mask,'collaboration_status']  = 'institutional_colab'
    df.loc[international_collaboration_mask,'collaboration_status']  = 'international_colab'

    # Going through str() lets already-converted integer values pass unchanged, so the
    # function is safe to run twice; plain column assignment gives an integer column.
    df['coverDate'] = df['coverDate'].apply(lambda x: int(str(x).split('-')[0]))

    return df

In [10]:
# Add the derived metadata columns; build_metadata returns the frame it was given.
df = build_metadata(df)

In [11]:
# Display the column names now present on the frame.
df.columns

Index(['eid', 'title', 'abstract', 'affiliation', 'aggregationType',
       'authkeywords', 'authorgroup', 'authors', 'chemicals', 'citedby_count',
       'coverDate', 'description', 'doi', 'funding', 'funding_text',
       'citedby_link', 'contributor_group', 'language', 'publicationName',
       'references', 'subject_areas', 'issueIdentifier', 'volume',
       'text_clean', 'title_clean', 'topic_dist', 'tfidf_vec', 'd2v_vec',
       'bert_embedding', 'embedding', 'n_authors', 'n_institutions',
       'first_author_country', 'citedby_rank', 'collaboration_status'],
      dtype='object')

In [12]:
# Save every column except tfidf_vec; the "Embedding analysis" section reloads this file.
df.drop(columns='tfidf_vec', errors='ignore').to_pickle('../../results/embedding_df.p')

## Tensorboard projection

### create projector metadata

In [None]:
# Columns kept for the projector: identifiers, descriptive fields and the encoded vector.
projector_columns = [
    'eid',
    'title',
    'publicationName',
    'coverDate',
    'citedby_count',
    'n_authors',
    'n_institutions',
    'first_author_country',
    'citedby_rank',
    'collaboration_status',
    'embedding',
]
metadata_df = df.loc[:, projector_columns]


In [None]:
# Display the dtype of each projector metadata column.
metadata_df.dtypes

In [None]:
# Display two randomly drawn rows (no seed is set, so the rows differ between runs).
metadata_df.sample(2)

In [None]:
# One row of label values per article, in the column order used as the projector header.
label_columns = [
    'title',
    'publicationName',
    'coverDate',
    'citedby_count',
    'n_authors',
    'n_institutions',
    'first_author_country',
    'citedby_rank',
    'collaboration_status',
    'eid',
]
metadata_list = metadata_df[label_columns].values.tolist()

In [41]:
# Workaround applied before calling SummaryWriter.add_embedding: when TensorFlow is
# installed, tf.io.gfile is replaced by TensorBoard's bundled stub implementation.
# NOTE(review): presumably this addresses add_embedding failing inside tf.io.gfile
# when TensorFlow is present - confirm against the installed versions.
try:
    import tensorflow as tf
except ImportError:
    # TensorFlow is not installed, so there is no tf.io.gfile to replace.
    pass
else:
    import tensorboard as tb
    tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

In [42]:
# Column names shown in the projector for each entry of metadata_list (same order).
projector_header = ['title','publicationName','coverDate','citedby_count','n_authors','n_institutions','first_author_country','citedby_rank','collaboration_status','eid']

# Projector tag (also used as the sub-directory of log_dir) -> df column holding the vectors.
projector_sources = {
    'gcn': 'embedding',
    'd2v': 'd2v_vec',
    'topic_modeling': 'topic_dist',
    'bert': 'bert_embedding',
}

for tag, column in projector_sources.items():
    # The context manager closes the writer even if add_embedding raises.
    with SummaryWriter(log_dir + tag + '/') as writer:
        writer.add_embedding(np.stack(df[column].values, axis=0),
                             metadata_list,
                             tag=tag,
                             metadata_header=projector_header)

# Embedding analysis

In [33]:
# Reload the frame written earlier in this notebook to ../../results/embedding_df.p.
# NOTE: unpickling can execute arbitrary code; only read pickles produced by this project.
df = pd.read_pickle('../../results/embedding_df.p')

In [34]:
# Journals kept for the analysis (compared against publicationName).
# An earlier selection used 'social studies of science' and 'science and education'
# in place of 'research evaluation' and 'public understanding of science'.
journals = [
    'research policy',
    'science and public policy',
    'scientometrics',
    'journal of informetrics',
    'synthese',
    'studies in history and philosophy of science',
    'research evaluation',
    'public understanding of science',
]

In [35]:
# Keep only the articles whose publicationName is one of the selected journals.
df = df[df.publicationName.isin(journals)]

In [36]:
def extract_element(l,pos=0):
    """Return the item at index ``pos`` of every entry in ``l``.

    Parameters
    ----------
    l : sequence of indexable entries, None, or float NaN
        A missing value (None or NaN) is accepted and treated as "no entries".
    pos : int, default 0
        Index read from each entry.

    Returns
    -------
    list
        One item per entry of ``l``; ``[np.nan]`` when ``l`` is missing.
    """
    # pandas marks missing cells with float NaN as well as None, so both count as
    # missing. The isinstance guard keeps `l != l` away from sequences.
    if l is None or (isinstance(l, float) and l != l):
        return [np.nan]
    return [entry[pos] for entry in l]

In [37]:
# Keep only the rows that carry affiliation data; work on an independent copy afterwards.
df = df.loc[df['affiliation'].notnull()].copy()

In [38]:
def build_metadata(df):
    """Add the derived metadata columns to ``df`` and return it.

    The frame is modified in place; the same object is returned.

    Reads the columns ``authors``, ``affiliation``, ``citedby_count`` and ``coverDate``.
    Writes:
      * ``n_authors`` - length of ``authors``
      * ``n_institutions`` - number of distinct values at position 1 of the affiliation entries
      * ``first_author_country`` - position 3 of the first affiliation entry (NaN if there is none)
      * ``citedby_rank`` - 'zero', 'low', 'mid-low', 'mid-high' or 'high', split at the
        quartiles of ``citedby_count``
      * ``collaboration_status`` - 'unique_author', 'internal_colab', 'institutional_colab'
        or 'international_colab'
      * ``coverDate`` - replaced by its leading integer (the text before the first '-');
        values that are already integers are left as they are

    The function can be applied again to a frame it already processed.
    """
    # Extract the two affiliation fields once instead of once per derived column.
    institutions = df.affiliation.apply(lambda x: extract_element(x, 1))
    countries = df.affiliation.apply(lambda x: extract_element(x, pos=3))

    #number of authors and institutions
    df['n_authors'] = df.authors.apply(len)
    df['n_institutions'] = institutions.apply(lambda values: len(set(values)))

    # first author country; an empty affiliation list yields NaN instead of an IndexError
    df['first_author_country'] = countries.apply(lambda values: values[0] if values else np.nan)

    # citation ranking
    q1,q2,q3 = df.citedby_count.quantile([.25, .5, .75])

    df['citedby_rank'] = None
    df.loc[df.citedby_count==0,'citedby_rank'] = 'zero'
    df.loc[(df.citedby_count > 0) & (df.citedby_count <= q1),'citedby_rank'] = 'low'
    df.loc[(df.citedby_count >q1) & (df.citedby_count <= q2),'citedby_rank'] = 'mid-low'
    df.loc[(df.citedby_count >q2) & (df.citedby_count <= q3),'citedby_rank'] = 'mid-high'
    df.loc[(df.citedby_count >q3),'citedby_rank'] = 'high'

    # collaboration masks
    international_collaboration_mask = countries.apply(lambda values: len(set(values)) > 1)
    institutional_collaboration_mask = df.n_institutions > 1
    internal_collaboration_mask = (df.n_institutions == 1) & (df.n_authors >1)
    unique_author_mask = df.n_authors == 1

    # Assignment order matters: later labels overwrite earlier ones, so a row matching
    # several masks ends up with the last matching label.
    df['collaboration_status'] = None
    df.loc[unique_author_mask,'collaboration_status']                = 'unique_author'
    df.loc[internal_collaboration_mask,'collaboration_status']       = 'internal_colab'
    df.loc[institutional_collaboration_mask,'collaboration_status']  = 'institutional_colab'
    df.loc[international_collaboration_mask,'collaboration_status']  = 'international_colab'

    # Going through str() lets already-converted integer values pass unchanged, so the
    # step is a no-op on a frame whose coverDate was converted before it was saved.
    df['coverDate'] = df['coverDate'].apply(lambda x: int(str(x).split('-')[0]))

    return df

In [39]:
# Recompute the derived metadata columns on the journal subset (the quartiles used for
# citedby_rank are taken from the frame passed in).
df = build_metadata(df)

In [40]:
# Display how many articles fall into each collaboration_status label.
df.collaboration_status.value_counts()

unique_author          7364
international_colab    3478
internal_colab         3276
institutional_colab    3203
Name: collaboration_status, dtype: int64

In [41]:
def dimensionality_reduction(embedding, rnd_state=1234, pca_threshold=60, pca_components=20):
    """Project a collection of vectors to two coordinates with t-SNE.

    Vectors longer than ``pca_threshold`` are first reduced to ``pca_components``
    dimensions with PCA; the result is then passed to ``TSNE``.

    Parameters
    ----------
    embedding : iterable of equal-length numeric sequences
        One vector per observation.
    rnd_state : int, default 1234
        Seed passed as ``random_state`` to both PCA and TSNE.
    pca_threshold : int, default 60
        PCA is applied only when the vector length exceeds this value.
    pca_components : int, default 20
        Number of PCA components kept when PCA is applied.

    Returns
    -------
    (tuple, tuple)
        The first and second t-SNE coordinates, one value per observation.

    Raises
    ------
    ValueError
        If ``embedding`` is empty.
    """
    rows = [np.asarray(vector) for vector in embedding]
    if not rows:
        raise ValueError("embedding must contain at least one vector")
    # Stack once so PCA and TSNE both receive a single 2-D array.
    matrix = np.stack(rows, axis=0)
    if matrix.shape[1] > pca_threshold:
        matrix = PCA(n_components=pca_components, random_state=rnd_state).fit_transform(matrix)
    reduced_embed = TSNE(random_state=rnd_state).fit_transform(matrix)
    # Split the two output columns into separate coordinate tuples.
    x, y = zip(*reduced_embed)
    return x, y
    

In [None]:
# (coordinate-column suffix, source vector column) for each representation being projected.
projection_inputs = [
    ('gnn', 'embedding'),
    ('d2v', 'd2v_vec'),
    ('tm', 'topic_dist'),
    ('bert', 'bert_embedding'),
]
for suffix, column in projection_inputs:
    df['xs_' + suffix], df['ys_' + suffix] = dimensionality_reduction(df[column].values)

In [None]:
# Export the identifiers, citation counts and the 2-D coordinates of every representation.
projection_columns = [
    'eid', 'publicationName', 'citedby_count',
    'xs_gnn', 'ys_gnn',
    'xs_d2v', 'ys_d2v',
    'xs_tm', 'ys_tm',
    'xs_bert', 'ys_bert',
]
df[projection_columns].to_csv('../../results/articles_tsne_proj.csv')