In [248]:
#!pip install pycountry_convert

In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import os

from sklearn.metrics.pairwise import cosine_similarity

import pycountry_convert as pc

In [2]:
# Load the DataFrame used throughout this notebook (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you produced yourself.
df = pd.read_pickle('../../results/embedding_df.p')

In [3]:
def extract_element(l,pos=0):
    """Collect the item at index `pos` from every entry of `l`.

    Parameters
    ----------
    l : iterable of indexable entries, or None
    pos : int
        Index looked up in each entry (default 0).

    Returns
    -------
    list
        One value per entry, in order; ``[np.nan]`` when `l` is None.
    """
    if l is not None:
        return [entry[pos] for entry in l]
    # Missing input: a one-element placeholder so callers can still index [0].
    return [np.nan]

In [4]:
## Keep only obs with affiliation info
# notnull() states the intent directly; negating a boolean mask with unary `-`
# only works through pandas special-casing (numpy rejects it on bool arrays).
df = df[df['affiliation'].notnull()].copy()

In [5]:
def build_metadata(df):
    """Add per-row metadata columns derived from authors, affiliations and citations.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``authors``, ``affiliation`` and
        ``citedby_count``. Affiliation entries are read through
        ``extract_element`` at positions 1 and 3.
        NOTE(review): position 1 is treated as the institution and position 3
        as the country — confirm against the upstream schema.

    Returns
    -------
    pandas.DataFrame
        The same object with the added columns ``n_authors``,
        ``n_institutions``, ``first_author_country``, ``citedby_rank`` and
        ``collaboration_status``.
    """
    # number of authors and institutions
    df['n_authors'] = df.authors.apply(len)
    df['n_institutions'] = df.affiliation.apply(lambda aff: len(set(extract_element(aff, 1))))

    # first author country
    # NOTE(review): takes the first affiliation entry — assumes entries follow author order.
    df['first_author_country'] = df.affiliation.apply(lambda aff: extract_element(aff, pos=3)[0])

    # citation ranking: 'zero' for uncited rows, then quartile bands of citedby_count
    cites = df.citedby_count
    q1, q2, q3 = cites.quantile([.25, .5, .75])
    rank_rules = [
        (cites == 0, 'zero'),
        ((cites > 0) & (cites <= q1), 'low'),
        ((cites > q1) & (cites <= q2), 'mid-low'),
        ((cites > q2) & (cites <= q3), 'mid-high'),
        (cites > q3, 'high'),
    ]
    df['citedby_rank'] = None
    for mask, label in rank_rules:
        df.loc[mask, 'citedby_rank'] = label

    # collaboration status
    n_countries = df.affiliation.apply(lambda aff: len(set(extract_element(aff, pos=3))))
    # Rules are applied in order and later ones overwrite earlier ones, so a row
    # with several countries ends up 'international_colab' whatever matched before.
    # NOTE(review): a single author with several institutions is labelled
    # 'institutional_colab' by this precedence — confirm that is intended.
    status_rules = [
        (df.n_authors == 1, 'unique_author'),
        ((df.n_institutions == 1) & (df.n_authors > 1), 'internal_colab'),
        (df.n_institutions > 1, 'institutional_colab'),
        (n_countries > 1, 'international_colab'),
    ]
    df['collaboration_status'] = None
    for mask, label in status_rules:
        df.loc[mask, 'collaboration_status'] = label
    return df

In [6]:
# Adds n_authors, n_institutions, first_author_country, citedby_rank and collaboration_status to df.
df = build_metadata(df)

In [7]:
# Country-name replacements applied to first_author_country in the next cell.
# NOTE(review): presumably these are the spellings pycountry_convert accepts — confirm.
rep_dict = {"Cote d'Ivoire":"Côte d'Ivoire",
           'Germany (Democratic Republic, DDR)':'Germany',
           'Libyan Arab Jamahiriya': 'Libya',
           'Russian Federation': 'Russia'}

In [8]:
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: that chained in-place call acts on a temporary
# Series and leaves df unchanged under pandas Copy-on-Write.
df['first_author_country'] = df['first_author_country'].replace(rep_dict)

In [9]:
def dimensionality_reduction(embedding, rnd_state=1234, pca_threshold=60, pca_components=10):
    """Project embeddings to 2-D with t-SNE, pre-reducing wide vectors with PCA.

    Parameters
    ----------
    embedding : iterable of array-likes
        One vector per observation.
    rnd_state : int
        Seed passed to both PCA and TSNE.
    pca_threshold : int
        When the first vector has more than this many components, the data
        is first reduced with PCA (default 60, the previous hard-coded value).
    pca_components : int
        Number of PCA components kept in that case (default 10, as before).

    Returns
    -------
    (tuple, tuple)
        The x and y coordinates, one value per observation. Two empty tuples
        when `embedding` is empty.
    """
    embedding_list = [np.array(x) for x in embedding]
    if not embedding_list:
        # Nothing to project; avoids the IndexError from peeking at element 0.
        return (), ()
    # Only the first vector is inspected to decide whether PCA is applied.
    if len(embedding_list[0]) > pca_threshold:
        embedding_list = PCA(n_components=pca_components, random_state=rnd_state).fit_transform(embedding_list)
    reduced_embed = TSNE(random_state=rnd_state).fit_transform(embedding_list)
    # Split the (n, 2) result into separate x and y sequences.
    x, y = zip(*reduced_embed)
    return x, y
    

In [10]:
# 2-D projection of every representation: (column suffix, source column).
projection_sources = [
    ('gnn', 'embedding'),
    ('d2v', 'd2v_vec'),
    ('lda', 'topic_dist'),
    ('bert', 'bert_embedding'),
]
for suffix, source_column in projection_sources:
    df['xs_' + suffix], df['ys_' + suffix] = dimensionality_reduction(df[source_column].values)

## Average embedding by country

In [172]:
# First TSNE, then mean embedding: average each 2-D coordinate per first-author country
tsne_columns = ['xs_gnn','ys_gnn','xs_d2v','ys_d2v','xs_lda','ys_lda','xs_bert','ys_bert']
df_countries = (
    df[['first_author_country'] + tsne_columns]
    .groupby('first_author_country')
    .mean()
)

In [12]:
def average_embedding(embeds):
    """Return the element-wise mean of a collection of equal-length vectors.

    Parameters
    ----------
    embeds : iterable of array-likes

    Returns
    -------
    list
        The mean vector, one entry per component.
    """
    stacked = np.array([np.array(vec) for vec in embeds])
    # axis=0 averages across vectors, keeping one value per component.
    return list(np.average(stacked, axis=0))

In [174]:
# Mean GNN and BERT vector per first-author country (same aggregator for both columns).
avg_embed = df.groupby('first_author_country').agg(
    dict.fromkeys(['embedding', 'bert_embedding'], average_embedding)
)

In [175]:
# Attach the per-country mean vectors (both frames are indexed by first_author_country),
# then turn that index into a regular column.
# NOTE(review): not idempotent — re-running this cell joins onto the already reset frame.
df_countries = df_countries.join(avg_embed).reset_index()

In [176]:
# adding articles and citations by country
n_articles = pd.DataFrame(df.first_author_country.value_counts()).reset_index()
n_articles.columns = ["first_author_country","n"]

# Sum the citation column directly per group; this replaces
# groupby.apply(lambda x: np.sum(...)), which ran a Python callback per group.
n_citations = df.groupby('first_author_country')['citedby_count'].sum().reset_index()
n_citations.columns = ["first_author_country","n_citations"]

In [177]:
# Left-join article and citation counts onto the per-country table.
df_countries = (
    df_countries
    .merge(n_articles, how='left', on='first_author_country')
    .merge(n_citations, how='left', on='first_author_country')
)

In [178]:
def country_to_continent(country_name):
    """Map a country name to a continent label via pycountry_convert.

    Parameters
    ----------
    country_name : str
        Name looked up with ``cn_name_format="default"``.

    Returns
    -------
    str
        One of the labels in the table below.

    Raises
    ------
    KeyError
        If the continent code is not one of the six codes in the table.
        NOTE(review): the pycountry_convert lookups may raise for names they
        do not recognise — confirm.
    """
    # NOTE(review): 'OC' is labelled 'Australia' rather than 'Oceania'.
    continent_labels = {
        'NA': 'North America',
        'SA': 'South America',
        'AS': 'Asia',
        'OC': 'Australia',
        'AF': 'Africa',
        'EU': 'Europe',
    }
    alpha2 = pc.country_name_to_country_alpha2(country_name, cn_name_format="default")
    return continent_labels[pc.country_alpha2_to_continent_code(alpha2)]

In [179]:
# Continent label for each country row.
df_countries['continent'] = df_countries['first_author_country'].apply(country_to_continent)

In [180]:
# Inspect the per-country table (the rendered output below is truncated to the first and last rows).
df_countries

Unnamed: 0,first_author_country,xs_gnn,ys_gnn,xs_d2v,ys_d2v,xs_lda,ys_lda,xs_bert,ys_bert,embedding,bert_embedding,n,n_citations,continent
0,Afghanistan,-22.084497,21.596331,-7.935746,-9.291619,30.279623,21.995905,-39.232651,10.866203,"[0.09937576204538345, -0.1499500572681427, 0.0...","[-0.14081211388111115, 0.6634104251861572, 0.3...",1,12,Asia
1,Argentina,2.664158,-11.712130,0.476959,2.989679,-3.436787,-1.902636,1.965858,-5.882326,"[-0.0688692871082042, -0.004559567078415837, -...","[-0.25964549709377543, 0.544784774045859, 0.20...",70,706,South America
2,Armenia,3.740608,58.245869,-62.793720,29.231318,8.083354,38.987820,52.023407,13.169700,"[0.08625378459692001, -0.13854365050792694, 0....","[-0.5790920853614807, 0.6632298827171326, 0.54...",1,0,Asia
3,Australia,2.668652,-7.274022,1.988526,0.105757,-5.503249,-8.777452,-5.466384,-7.773093,"[-0.06267788283064635, 0.0051406678594105, -0....","[-0.12296873397923781, 0.49959004517444106, 0....",685,13534,Australia
4,Austria,-3.013250,0.901200,-2.126365,1.159832,2.106125,-3.933488,-5.148671,1.685317,"[-0.007104162813158911, -0.06734173420838194, ...","[-0.1574103880621159, 0.5258863435087812, 0.29...",173,4276,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,United Kingdom,2.274377,3.265688,0.615027,-0.389286,5.300108,-11.026015,-2.822585,0.726291,"[-0.02675968813502442, -0.0116431515506117, -0...","[-0.13474996886606877, 0.533865930632009, 0.26...",2828,105688,Europe
104,United States,2.119999,-8.611614,0.122963,-0.380173,-5.684837,-10.479656,-0.989244,-7.361090,"[-0.07600222199020047, 0.019961209373258345, -...","[-0.14122438095973747, 0.5103014469079801, 0.1...",5390,166892,North America
105,Uruguay,5.098570,40.060040,-16.927858,-6.760378,14.708404,19.112629,-0.555881,13.596688,"[0.034609605092555285, -0.015240292355883867, ...","[-0.442035929299891, 0.5869422224350274, 0.340...",16,260,South America
106,Venezuela,-9.468658,-14.571891,-0.668383,1.751039,5.612797,-1.166492,-10.804595,-14.220075,"[-0.12309855882726285, 0.1126473231807999, -0....","[-0.3441402609763748, 0.5541256164079127, 0.10...",23,239,South America


# Average Cosine similarity

In [234]:
# Pairwise cosine similarity between the per-country mean vectors.
country_index = df_countries.first_author_country
cos_sim_gnn = cosine_similarity(np.vstack(df_countries.embedding.values))
cos_sim_bert = cosine_similarity(np.vstack(df_countries.bert_embedding.values))

# Average similarity of each country to all countries.
# NOTE(review): the mean runs over the full matrix, so each country's
# similarity with itself is included.
cos_sim_gnn_df = pd.DataFrame(cos_sim_gnn.mean(axis=0), index=country_index, columns=['GNN'])
cos_sim_bert_df = pd.DataFrame(cos_sim_bert.mean(axis=0), index=country_index, columns=['BERT'])

# Combine both measures and attach counts and continent for each country.
country_info = df_countries[['first_author_country', 'n','n_citations','continent']]
cos_sim_df = (
    cos_sim_gnn_df
    .join(cos_sim_bert_df)
    .reset_index()
    .merge(country_info, on='first_author_country')
)

In [246]:
# Export the per-country average similarities (the DataFrame index is written as well, since index is left at its default).
cos_sim_df.to_csv('../../results/country_average_sim.csv')

## Quantitative–Qualitative Axis

In [13]:
def average_embedding(embeds):
    """Return the element-wise mean of a collection of equal-length vectors.

    NOTE(review): this repeats the identical ``average_embedding`` defined
    earlier in the notebook; one of the two definitions can be dropped.

    Parameters
    ----------
    embeds : iterable of array-likes

    Returns
    -------
    list
        The mean vector, one entry per component.
    """
    stacked = np.array([np.array(vec) for vec in embeds])
    # axis=0 averages across vectors, keeping one value per component.
    return list(np.average(stacked, axis=0))

In [18]:
# First TSNE, then mean embedding
df_journal = df[['publicationName','xs_gnn','ys_gnn','xs_d2v','ys_d2v','xs_lda','ys_lda','xs_bert','ys_bert']].groupby('publicationName').mean().reset_index()

# Number of articles per journal.
n_articles = pd.DataFrame(df.publicationName.value_counts()).reset_index()
n_articles.columns = ["publicationName","n"]

# Total citations per journal: sum the citation column directly per group;
# this replaces groupby.apply(lambda x: np.sum(...)), which ran a Python callback per group.
n_citations = df.groupby('publicationName')['citedby_count'].sum().reset_index()
n_citations.columns = ["publicationName","n_citations"]

df_journal = df_journal.merge(n_articles, how='left',on = 'publicationName')
df_journal = df_journal.merge(n_citations, how='left',on = 'publicationName')

In [14]:
# Mean GNN and BERT vector per journal; publicationName stays the index of avg_embed.
avg_embed = df.groupby('publicationName').agg({'embedding':average_embedding, 'bert_embedding': average_embedding})

# One row per journal, stacked once and reused for the similarities below.
journal_gnn_vectors = np.array([np.array(vec) for vec in avg_embed.embedding])
journal_bert_vectors = np.array([np.array(vec) for vec in avg_embed.bert_embedding])

# Each axis is the difference between the mean vectors of two reference journals.
bert_axis = np.array(avg_embed.loc['journal of informetrics','bert_embedding']) - np.array(avg_embed.loc['isis','bert_embedding'])
gnn_axis = np.array(avg_embed.loc['journal of informetrics','embedding']) - np.array(avg_embed.loc['isis','embedding'])
gnn_axis2 = np.array(avg_embed.loc['research policy','embedding']) - np.array(avg_embed.loc['science, technology and society','embedding'])

# Cosine similarity of every journal's mean vector with each axis (one column per axis).
cossim_gnn = cosine_similarity(journal_gnn_vectors, gnn_axis.reshape(1,-1))
cossim_gnn2 = cosine_similarity(journal_gnn_vectors, gnn_axis2.reshape(1,-1))
cossim_bert = cosine_similarity(journal_bert_vectors, bert_axis.reshape(1,-1))

for score_column, scores in [('cossim_gnn', cossim_gnn),
                             ('cossim_gnn2', cossim_gnn2),
                             ('cossim_bert', cossim_bert)]:
    avg_embed[score_column] = scores

In [19]:
# Attach article and citation counts per journal.
# avg_embed comes from groupby('publicationName'), so the key is its index level, while in df_journal it is a column.
# NOTE(review): relies on merge resolving `on` against an index level name and returning the key
# as a column (the next cell reads avg_embed.publicationName) — confirm on your pandas version.
avg_embed = avg_embed.merge(df_journal.loc[:,['publicationName','n','n_citations']], how='left',on = 'publicationName')

In [20]:
# Journal groups used to label each journal with a field.
quant = ['scientometrics', 'journal of informetrics']
hist_phil = [
    'british journal for the history of science', 'isis', 'science and education',
    'science and technology studies', 'science, technology and society', 'social studies of science',
    'studies in history and philosophy of science', 'synthese',
]
management = ['research policy', 'science and public policy']
social_sci = ['minerva', 'public understanding of science', 'research evaluation', 'science, technology and human values']

# Applied in this order; journals in none of the groups keep a missing field.
field_rules = [
    (quant, 'Library and Information Sciences'),
    (management, 'Management'),
    (social_sci, 'Education, Communication and Anthropology'),
    (hist_phil, 'History and Philosophy'),
]
for journals, field_label in field_rules:
    avg_embed.loc[avg_embed.publicationName.isin(journals), 'field'] = field_label

In [63]:
# Export the per-journal axis similarities, counts and field; cossim_gnn2 is not part of this export.
avg_embed.loc[:,['publicationName','cossim_gnn','cossim_bert','n','n_citations','field']].to_csv('../../results/avg_embed_journal.csv')