In [1]:
import pandas as pd
import numpy as np
import pickle
from igraph import *
from tqdm.notebook import tqdm
import matplotlib
%matplotlib inline



In [2]:
df = pd.read_pickle('../../data/dataset.p')

### 1.Subject areas

In [49]:
journals = df.filter(['publicationName','subject_areas'])

journals['subject_areas'] = journals.apply(lambda x: [l[0] for l in x.subject_areas],axis=1)

journals[['subject_areas_0','subject_areas_1','subject_areas_2','subject_areas_3','subject_areas_4','subject_areas_5']] = pd.DataFrame(journals.subject_areas.tolist())

journals.subject_areas = None

## subfields

In [53]:
quant = ['scientometrics','journal of informetrics']
hist_phil = ['british journal for the history of science','isis','science and education',            
             'science and technology studies','science, technology and society','social studies of science',
             'studies in history and philosophy of science','synthese']
management = ['research policy','science and public policy']
social_sci = ['minerva','public understanding of science','research evaluation', 'science, technology and human values']

df.loc[df.publicationName.isin(quant), 'field'] = 'Library and Information Sciences'
df.loc[df.publicationName.isin(management), 'field'] = 'Management'
df.loc[df.publicationName.isin(social_sci), 'field'] = 'Education, Communication and Anthropology'
df.loc[df.publicationName.isin(hist_phil), 'field'] = 'History and Philosophy'
#df.loc[df.publicationName.isin(education), 'field'] = 'Education'



In [55]:
def most_cited(series):

    return set(df.title[df.citedby_count == max(series)].values)

In [56]:
def mean_round(x):
    return round(np.mean(x),2)

In [67]:
def make_summary(df):

    summary_stats = df.groupby(['publicationName','field']).citedby_count.agg(['count', mean_round, 'max']).sort_values(by =['field','mean_round'])
    summary_stats.reset_index(inplace=True)

    summary_stats['publicationName'] = summary_stats.apply(lambda x: x.publicationName.title(),axis=1)
    total = ['Total',' ',summary_stats['count'].sum(),mean_round(summary_stats['mean_round']),summary_stats['max'].max()]

    summary_stats.loc['Total']= total

    summary_stats = summary_stats.sort_values(by=['field','count'], ascending=False)

    summary_stats.set_index(['field', 'publicationName'],inplace=True)
    return summary_stats

In [69]:
summary_stats = make_summary(df)

In [70]:
print(summary_stats.to_latex(index=True))

\begin{tabular}{llrrr}
\toprule
  &       &  count &  mean\_round &   max \\
field & publicationName &        &             &       \\
\midrule
Management & Research Policy &   3221 &       83.75 &  4820 \\
  & Science And Public Policy &   1707 &       13.27 &   462 \\
Library and Information Sciences & Scientometrics &   5136 &       20.04 &  1334 \\
  & Journal Of Informetrics &    876 &       22.63 &   352 \\
History and Philosophy & Synthese &   4151 &        8.53 &   910 \\
  & Social Studies Of Science &   1069 &       40.95 &  4709 \\
  & Science And Education &   1034 &       11.60 &   298 \\
  & Studies In History And Philosophy Of Science &    911 &        8.76 &   145 \\
  & Isis &    523 &       12.47 &   415 \\
  & Science, Technology And Society &    345 &        6.07 &   122 \\
  & British Journal For The History Of Science &    276 &        9.57 &    88 \\
  & Science And Technology Studies &    111 &        5.29 &    39 \\
Education, Communication and Anthropology & P

# 2.Network

In [12]:
raw_dir = '../../data/torch/bert/'
names = ['x.p', 'edge_pairs.p', 'eid_row_indices.p']
x,edge_pairs,eid_row_indices = [pickle.load(open('{}/{}'.format(raw_dir, name), 'rb'))for name in names]



In [13]:
edge_pairs = edge_pairs.detach().numpy()

edge_pairs_df = pd.DataFrame(edge_pairs).T

_ = edge_pairs_df.merge(eid_row_indices, left_on=0, right_on='index')
_ = _.merge(eid_row_indices, left_on=1, right_on='index')

edges = _.loc[:,['eid_x','eid_y']]

In [14]:
g = Graph.TupleList(edges.values,directed=False)

descriptive statistics

In [90]:
g.average_path_length()

6.1414321323727865

In [91]:
np.mean(g.degree())

8.29979490891543

In [92]:
np.max(g.degree())

282

In [93]:
g.vcount()

16578

In [94]:
len(set(edges.eid_x).union(set(edges.eid_y)))

16578

In [95]:
g.ecount()

68797

In [98]:
g.assortativity_degree()

-0.0060198543679408365

In [99]:
np.mean(g.betweenness())

37807.8781517674

In [104]:
g.is_connected()

False

In [100]:
comps= g.components()

In [101]:
giant = comps.giant()

In [102]:
giant.ecount()

68168

In [105]:
giant.vcount()

15615

In [109]:
print('Edges not in the giant component: {}'.format(g.ecount()-giant.ecount()))
print('Vertices not in the giant component: {}'.format(g.vcount()-giant.vcount()))

Edges not in the giant component: 629
Vertices not in the giant component: 963


In [112]:
np.mean(giant.closeness())

0.17105498895935597

In [114]:
np.mean(giant.eigenvector_centrality())
np.mean(g.eigenvector_centrality())

0.011214141865822723

In [115]:
g.farthest_points()

(15475, 15550, 37)

In [116]:
g.diameter()

37

In [120]:
g.summary()

'IGRAPH UN-- 16578 68797 -- \n+ attr: name (v)'

### small word

In [156]:
#clustering coefficient
clust_coef = g.transitivity_undirected()
clust_coef

0.08137529739399864

In [157]:
average_path_length = g.average_path_length()
average_path_length

6.1414321323727865

### random model

In [143]:
def random_model(n = g.vcount(), m = g.ecount(), times=100):
    rnd_clust_coefs = []
    rnd_lengths = []
    for i in tqdm(range(times)):
        rnd_graph = Graph.Erdos_Renyi(n=n,m=m)
        clust = rnd_graph.transitivity_undirected()
        length = rnd_graph.average_path_length()
        rnd_clust_coefs += [clust]
        rnd_lengths += [length]
    return rnd_clust_coefs, rnd_lengths

In [149]:
rnd_clust_coefs, rnd_lengths = random_model()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [153]:
mean_random_clust_coef = np.mean(rnd_clust_coefs)
mean_random_clust_coef

0.0005009197590637582

In [158]:
mean_random_length = np.mean(rnd_lengths)
mean_random_length

4.8269054722968345

In [159]:
clust_coef/mean_random_clust_coef

162.45176182726905

In [160]:
average_path_length / mean_random_length

1.2723332096765607

In [None]:
average_path_length

### 3. Differences between dataset and network 

In [78]:
network_ids = [line.rstrip('\n') for line in open('../../results/network_ids.txt')]

In [79]:
in_net_df = df[df.eid.isin(network_ids)]
out_net_df = df[-df.eid.isin(network_ids)]

In [81]:
out_net_df.citedby_count.mean()

18.99228422752557

In [82]:
out_net_df.loc[:,'coverDate'] = out_net_df.loc[:,'coverDate'].apply(lambda x: int(x.rsplit('-')[0]))

out_net_df.coverDate.mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


2001.289790059214

#### Out of network stats

In [75]:
oon_summary_stats = make_summary(out_net_df)
oon_summary_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean_round,max
field,publicationName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Management,Research Policy,643,79.72,3404
Management,Science And Public Policy,637,10.31,409
Library and Information Sciences,Scientometrics,784,19.73,435
Library and Information Sciences,Journal Of Informetrics,12,13.5,28
History and Philosophy,Synthese,1702,6.7,564
History and Philosophy,Studies In History And Philosophy Of Science,372,7.68,63
History and Philosophy,Isis,250,9.09,123
History and Philosophy,Science And Education,250,10.92,298
History and Philosophy,British Journal For The History Of Science,145,9.03,54
History and Philosophy,Social Studies Of Science,139,24.71,648


#### In network stats

In [85]:
in_net_summary = make_summary(in_net_df)
in_net_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean_round,max
field,publicationName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Management,Research Policy,2578,84.75,4820
Management,Science And Public Policy,1070,15.03,462
Library and Information Sciences,Scientometrics,4352,20.1,1334
Library and Information Sciences,Journal Of Informetrics,864,22.76,352
History and Philosophy,Synthese,2449,9.8,910
History and Philosophy,Social Studies Of Science,930,43.38,4709
History and Philosophy,Science And Education,784,11.82,177
History and Philosophy,Studies In History And Philosophy Of Science,539,9.5,145
History and Philosophy,Isis,273,15.57,415
History and Philosophy,"Science, Technology And Society",211,7.28,122


# Topic Modelling distribution

In [None]:
LDA = LDA_wrapper()

In [None]:
lda_model = LDA.restore_model("../../results/lda_model20t.p")
texts = df['text_clean'].values
data_vectorized, vectorizer = LDA.data_vectorizer(data=texts)

In [None]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [None]:
print_topics(lda_model, vectorizer, 5)


selected topics: 
The LDAViz change the topics order, so we need to map them. 


|LDAViz | lda_model|
|---|--|
|1 | 18|
|2 | 6|
|3 | 16|
|4 | 11|
|5 | 19|
|6 | 7|
|7 | 16|
|8 | 9|
|9 | 8|
|10 | 14|
|11 | 10|
|12 | 12|
|13 | 5|
|14 | 13|
|15 | 2|
|16 | 4|
|17 | 3|
|18 | 0|
|19 | 15|
|20 | 1|



In [None]:
def get_topics(x):
    topic_1 = x[18]
    topic_2 = x[6]
    topic_3 = x[16]
    topic_4 = x[11]
    topic_5 = x[19]
    topic_6 = x[7]
    topic_7 = x[16]
    topic_8 = x[9]
    topic_9 = x[8]
    topic_10 = x[14]
    topic_11 = x[10]
    topic_12 = x[12]
    topic_13 = x[5]
    topic_14 = x[13]
    topic_15 = x[2]
    topic_16 = x[4]
    topic_17 = x[3]
    topic_18 = x[0]
    topic_19 = x[15]
    topic_20 = x[1]


    return{'topic_1':topic_1,
    'topic_2':topic_2,
    'topic_3':topic_3,
    'topic_4':topic_4,
    'topic_5':topic_5,
    'topic_6':topic_6,
    'topic_7':topic_7,
    'topic_8':topic_8,
    'topic_9':topic_9,
    'topic_10':topic_10,
    'topic_11':topic_11,
    'topic_12':topic_12,
    'topic_13':topic_13,
    'topic_14':topic_14,
    'topic_15':topic_15,
    'topic_16':topic_16,
    'topic_17':topic_17,
    'topic_18':topic_18,
    'topic_19':topic_19,
    'topic_20':topic_20}
    

In [None]:
topics_df = pd.DataFrame(list(df.apply(lambda x: get_topics(x.topic_dist), axis=1)))

In [None]:
topics_df[['publicationName','field']] = df[['publicationName','field']].reset_index(drop=True)

In [None]:
topics_summary = topics_df.groupby(['publicationName','field']).agg(mean)
topics_df.to_csv('../../results/topics_dist_by_article.csv', index=False)