In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import os

from sklearn.metrics.pairwise import cosine_similarity


In [2]:
df = pd.read_pickle('../../results/embedding_df.p')

In [3]:
##### Keep only some journals for ease the analysis
# df = df[df.publicationName.isin(['Scientometrics','Journal of Informetrics',
#                                   'Minerva', 'Social Studies of Science', 
#                                   'Research Policy','Science and Public Policy'])]

In [4]:
def extract_element(l,pos=0):
    if l is None:
        return [np.nan]
    else:
        return list(map(lambda x: x[pos],l))

In [5]:
## Keep only obs with affiliation info
df = df[-df['affiliation'].isnull()].copy()

In [6]:
def build_metadata(df):
    #number of authors and institutions
    df['n_authors'] = df.authors.apply(lambda x: len(x))
    df['n_institutions'] = df.affiliation.apply(lambda x:len(set(extract_element(x,1))))

    # first author country
    df['first_author_country'] = df.affiliation.apply(lambda x: extract_element(x,pos=3)[0])
    
    # citation ranking
    
    q1,q2,q3 = df.citedby_count.quantile([.25, .5, .75])

    df['citedby_rank'] = None
    df.loc[df.citedby_count==0,'citedby_rank'] = 'zero'
    df.loc[(df.citedby_count > 0) & (df.citedby_count <= q1),'citedby_rank'] = 'low'
    df.loc[(df.citedby_count >q1) & (df.citedby_count <= q2),'citedby_rank'] = 'mid-low'
    df.loc[(df.citedby_count >q2) & (df.citedby_count <= q3),'citedby_rank'] = 'mid-high'
    df.loc[(df.citedby_count >q3),'citedby_rank'] = 'high'

    # international collaboration
    international_collaboration_mask = df.affiliation.apply(lambda x: len(set(extract_element(x,pos=3)))>1)
    institutional_collaboration_mask = df.affiliation.apply(lambda x: len(set(extract_element(x,1)))>1)
    internal_collaboration_mask = (df.n_institutions == 1) & (df.n_authors >1)
    single_author_mask = df.n_authors == 1

    df['collaboration_status'] = None
    df.loc[single_author_mask,'collaboration_status']                = 'single_author'
    df.loc[internal_collaboration_mask,'collaboration_status']       = 'internal_colab'
    df.loc[institutional_collaboration_mask,'collaboration_status']  = 'institutional_colab'
    df.loc[international_collaboration_mask,'collaboration_status']  = 'international_colab'
    return df

In [7]:
df = build_metadata(df)

# cosine similarity

In [8]:
cos_sim_gnn = cosine_similarity(np.vstack(df.embedding.values))
cos_sim_d2v = cosine_similarity(np.vstack(df.d2v_vec.values))
cos_sim_tm = cosine_similarity(np.vstack(df.topic_dist.values))
cos_sim_bert = cosine_similarity(np.vstack(df.bert_embedding.values))

cos_sim_gnn[np.diag_indices_from(cos_sim_gnn)] = np.nan
cos_sim_d2v[np.diag_indices_from(cos_sim_d2v)] = np.nan
cos_sim_tm[np.diag_indices_from(cos_sim_tm)] = np.nan
cos_sim_bert[np.diag_indices_from(cos_sim_bert)] = np.nan

In [9]:
def dist_df(sim_matrix, metadata):
    cos_sim_df = pd.DataFrame(sim_matrix)
    cos_sim_df.index = metadata.eid
    cos_sim_df.columns = metadata.eid
    #cos_sim_df = cos_sim_df.where(np.triu(np.ones(cos_sim_df.shape),k=1).astype(np.bool))
    cos_sim_df=cos_sim_df.stack()
    cos_sim_df.index.rename(['eid_1', 'eid_2'], inplace= True)
    cos_sim_df = cos_sim_df.reset_index()
    cos_sim_df.columns = ['eid_1', 'eid_2', 'dist']

    metadata = metadata.loc[:, ['eid','publicationName','citedby_count','citedby_rank','first_author_country','collaboration_status']]

    cos_sim_df = pd.merge(cos_sim_df,metadata, how='left',left_on='eid_1',right_on='eid')
    cos_sim_df = pd.merge(cos_sim_df,metadata, how='left',left_on='eid_2',right_on='eid')

    cos_sim_df.columns = ['eid_1', 'eid_2', 'dist',
                          'eid_x', 'publicationName_1','citedby_count_1','citedby_rank_1', 'first_author_country_1', 'collaboration_status_1',
                          'eid_y', 'publicationName_2','citedby_count_2','citedby_rank_2', 'first_author_country_2', 'collaboration_status_2']
    cos_sim_df = cos_sim_df.loc[:,['eid_1', 'eid_2', 'dist', 
                                   'publicationName_1','citedby_count_1','citedby_rank_1', 'first_author_country_1', 'collaboration_status_1',
                                   'publicationName_2', 'citedby_count_2','citedby_rank_2','first_author_country_2','collaboration_status_2']]
    return cos_sim_df



In [10]:
cos_sim_gnn_df = dist_df(cos_sim_gnn, df)
cos_sim_d2v_df = dist_df(cos_sim_d2v, df)
cos_sim_tm_df = dist_df(cos_sim_tm, df)
cos_sim_bert_df = dist_df(cos_sim_bert, df)

KeyboardInterrupt: 

### Relation between number of citations and distance

As the number of citations is a skewed power-law distribution, almost all of the articles are collapsed towards the (0,0). Is hard to identify a relation

In [11]:
#frobenious norm
df['embedding_norm'] = df.embedding.apply(np.linalg.norm)
df['d2v_norm'] = df.d2v_vec.apply(np.linalg.norm)
df['tm_norm'] = df.topic_dist.apply(np.linalg.norm)
df['bert_norm'] = df.bert_embedding.apply(np.linalg.norm)

In [12]:
embedding_norm = df.groupby('citedby_rank').apply(lambda x: np.mean(x['embedding_norm']))
embedding_norm.index = ['high', 'mid-high', 'mid-low','low', 'zero']
embedding_norm

high        1.455438
mid-high    1.008022
mid-low     1.152180
low         1.058051
zero        0.979214
dtype: float64

In [14]:
tm_norm = df.groupby('citedby_rank').apply(lambda x: np.mean(x['tm_norm']))
tm_norm.index = ['high', 'mid-high', 'mid-low','low', 'zero']
tm_norm


high        0.477899
mid-high    0.450116
mid-low     0.456958
low         0.453014
zero        0.445678
dtype: float64

In [15]:
bert_norm = df.groupby('citedby_rank').apply(lambda x: np.mean(x['bert_norm']))
bert_norm.index = ['high', 'mid-high', 'mid-low','low', 'zero']
bert_norm


high        17.486681
mid-high    19.087508
mid-low     18.326033
low         18.819401
zero        19.133680
dtype: float64

## Frobenious norm

Overall, the distribution of citations is correlated with the journals, as well as the collaboration type. 

- in the case of GNN, the highly cited have a larger norm. This make sense with the inner link prediction task (inner product of the embedding matrix as probabilistic adjacency matrix)

In [16]:
# norm_embeddings_df = pd.DataFrame([embedding_norm, bert_norm, d2v_norm, tm_norm],index=['GNN','BERT','D2V','LDA']).stack().reset_index()
norm_embeddings_df = pd.DataFrame([embedding_norm, bert_norm],index=['GNN','BERT']).stack().reset_index()
norm_embeddings_df.columns = ['model','citation_rank','mean frobenious norm']

In [18]:
norm_embeddings_df.to_csv('../../results/frob_norm.csv')

### Collaboration Cosine Similarity controlled by Journal

In [None]:
def av_dist_collaboration_journal(cos_sim_df, journals=['scientometrics','research policy']):
    cos_sim_df_filter = cos_sim_df.loc[(cos_sim_df.publicationName_1.isin(journals)) & (cos_sim_df.publicationName_2.isin(journals))]
    av_dist_collaboration = cos_sim_df_filter.groupby(['collaboration_status_1','collaboration_status_2', 'publicationName_1', 'publicationName_2']).apply(lambda x: np.mean(x.dist))
    av_dist_collaboration_df = pd.DataFrame(av_dist_collaboration).reset_index(drop=False).rename(columns = {0:'cos_sim'})    
    return av_dist_collaboration_df

#### GNN

In [None]:
av_dist_collaboration_df_gnn = av_dist_collaboration_journal(cos_sim_gnn_df)

In [84]:
av_dist_collaboration_df_gnn.to_csv('../../results/av_dist_collaboration_df_gnn.csv')

#### LDA

In [69]:
av_dist_collaboration_df_tm = av_dist_collaboration_journal(cos_sim_tm_df)

In [86]:
av_dist_collaboration_df_tm.to_csv('../../results/av_dist_collaboration_df_tm.csv')

#### BERT

In [70]:
av_dist_collaboration_df_bert = av_dist_collaboration_journal(cos_sim_bert_df)

In [87]:
av_dist_collaboration_df_bert.to_csv('../../results/av_dist_collaboration_df_bert.csv')