In [157]:
import csv
import numpy as np
from collections import defaultdict
import networkx as nx
import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import networkx.algorithms.community as nx_comm
from itertools import combinations


In [158]:
# read in graphs
# For every review paper a direct-citation graph (<name>_dc.gexf) and a
# bibliographic-coupling graph (<name>_bc.gexf) are loaded, and one row of
# descriptive network statistics is appended to `whole_table`.
papers = ['Baghizadhe2019', 'Darcy2011', 'Günther2017', 'Moeini2019',
          'Oehlhorn2020', 'Peireira2020', 'Picollo2005', 'Schneider2014',
          'Siponen2004', 'Siponen2020', 'Teubner2020', 'Tsai2017',
          'Wiener2020', 'XiaoXiao2013']


def convert_tuple(tuple_list):
    """Return the second element of every 2-tuple in ``tuple_list``, in order.

    Applied to the out-edge view of a node this yields the edge targets.
    """
    return [y for _, y in tuple_list]


whole_table = []
for name in papers:
    print(name + ' - loading')
    dc_graph = nx.read_gexf("./graphs/" + name + "_dc.gexf")
    bc_graph = nx.read_gexf("./graphs/" + name + "_bc.gexf")

    # general

    # list and number of all nodes
    all_nodes = list(dc_graph.nodes)
    n_all_nodes = dc_graph.number_of_nodes()

    # core nodes: every node with at least one outgoing edge
    core_nodes = [node for node in all_nodes if dc_graph.out_degree(node) > 0]
    n_core_nodes = len(core_nodes)

    # Publication-year statistics over *all* nodes are deliberately not
    # computed (limited scope); only the core nodes are analysed.

    # core nodes time distribution
    # the last four characters of a node id are parsed as the publication year
    publication_year_core_nodes = [int(core_node[-4:]) for core_node in core_nodes]
    standard_deviation_years_core_nodes = np.std(publication_year_core_nodes)
    mean_years_core_nodes = np.mean(publication_year_core_nodes)

    # direct citation

    # nx.density normalises a directed graph by n*(n-1) ordered pairs; the
    # factor 2 presumably rescales to the n*(n-1)/2 unordered pairs because a
    # citation can only point in one direction -- TODO confirm.
    # core paper density
    dc_core_node_graph = nx.subgraph(dc_graph, core_nodes)
    dc_core_node_density = nx.density(dc_core_node_graph) * 2

    # complete density
    dc_all_node_density = nx.density(dc_graph) * 2

    # avg out-degree core nodes - avg core node reference count
    dc_core_out_degree = sum(j for i, j in dc_graph.out_degree(core_nodes)) / len(core_nodes)

    # avg citation overlap: mean over all unordered core-node pairs of
    # |shared references| / |union of references|.
    # NOTE(original author): "something is fishy" with this metric -- unverified.
    # The reference set of each core node is built once instead of once per
    # pair; parallel edges (if the gexf file contains any) count only once.
    references_by_core_node = {
        node: set(convert_tuple(dc_graph.edges([node]))) for node in core_nodes
    }
    share_sum = 0
    for key_1, key_2 in combinations(core_nodes, 2):
        key_1_reference_set = references_by_core_node[key_1]
        key_2_reference_set = references_by_core_node[key_2]
        shared_references = len(key_1_reference_set & key_2_reference_set)
        total_references = (len(key_1_reference_set) + len(key_2_reference_set)
                            - shared_references)
        share_sum += shared_references / total_references
    if n_core_nodes > 1:
        dc_average_citation_overlap = share_sum * (2 / (n_core_nodes * (n_core_nodes - 1)))
    else:
        # no pair exists, so the average overlap is undefined
        dc_average_citation_overlap = np.nan

    # number of connected components (edge direction ignored)
    subgraph_components = list(nx.connected_components(dc_graph.to_undirected()))
    largest_component_node_list = max(subgraph_components, key=len)
    # every node in a component strictly smaller than the largest one
    unconnected_nodes = set()
    for component in subgraph_components:
        if len(component) < len(largest_component_node_list):
            unconnected_nodes.update(component)

    dc_number_of_components = len(subgraph_components)
    # number of unconnected parent nodes
    dc_number_unconnected_core_nodes = sum(1 for node in core_nodes if node in unconnected_nodes)

    # bibliographic coupling

    # density (computed on the complete bc graph, before any restriction below)
    bc_density = nx.density(bc_graph)

    # For Schneider2014 and XiaoXiao2013, 2 nodes are not connected
    # these 2 nodes are ignored for the diameter, avg-path length
    bc_number_of_components = nx.number_connected_components(bc_graph)
    if bc_number_of_components > 1:
        print(name + '- bc graph not fully connected')
        bc_largest_component = max(nx.connected_components(bc_graph), key=len)
        bc_graph = bc_graph.subgraph(bc_largest_component)

    # average shortest path
    bc_average_shortest_path = nx.average_shortest_path_length(bc_graph)

    # diameter
    bc_diameter = nx.diameter(bc_graph)

    # column order must match the column labels used when the DataFrame is built
    whole_table.append([n_core_nodes, standard_deviation_years_core_nodes, mean_years_core_nodes, dc_core_out_degree, dc_core_node_density, dc_average_citation_overlap, dc_number_unconnected_core_nodes,
                        n_all_nodes, dc_all_node_density, dc_number_of_components, bc_density, bc_average_shortest_path, bc_diameter])
    

Baghizadhe2019 - loading
Darcy2011 - loading
Günther2017 - loading
Moeini2019 - loading
Oehlhorn2020 - loading
Peireira2020 - loading
Picollo2005 - loading
Schneider2014 - loading
Schneider2014- bc graph not fully connected
Siponen2004 - loading
Siponen2020 - loading
Teubner2020 - loading
Tsai2017 - loading
Wiener2020 - loading
XiaoXiao2013 - loading
XiaoXiao2013- bc graph not fully connected


In [159]:
# Labels for the 13 computed statistics, in the order they were appended to
# each row of `whole_table`.
table_columns = [
    'Core Nodes', 'σ Publication Years', '⌀ Publication Years', '⌀ References', 'Core Node Density',
    '⌀ Reference Overlap', 'Unconnected Core Nodes',
    'Total Nodes', 'Total Density', 'Components',
    'Density', 'Average Shortest Path', 'Diameter',
]
df = pd.DataFrame(whole_table, index=papers, columns=table_columns)


# adding clustering values manually from gephi files
# (one entry per paper, in the same order as `papers`)
bc_modularity = [0.429, 0.237, 0.266, 0.334, 0.250, 0.504, 0.253, 0.257, 0.125, 0.274, 0.224, 0.155, 0.336, 0.435]
bc_n_cluster = [5, 3, 6, 7, 5, 10, 6, 5, 3, 5, 5, 4, 6, 6]
dc_modularity = [0.809, 0.706, 0.773, 0.673, 0.680, 0.878, 0.737, 0.694, 0.718, 0.610, 0.668, 0.624, 0.881, 0.912]
dc_n_cluster = [32, 11, 28, 20, 25, 43, 22, 26, 13, 22, 16, 20, 33, 33]

# (insert position, column label, values): the DC columns go directly after
# 'Components', the BC columns end up after 'Diameter'.
manual_columns = (
    (10, 'Modularity DC', dc_modularity),
    (11, 'Clusters DC', dc_n_cluster),
    (15, 'Modularity BC', bc_modularity),
    (16, 'Clusters BC', bc_n_cluster),
)
for position, label, values in manual_columns:
    df.insert(position, label, values)

# print and round all
# number of decimals per column, listed in final column order
decimals_by_column = pd.Series(
    [0, 2, 2, 2, 3, 3, 0, 0, 4, 0, 3, 0, 3, 2, 0, 3, 0],
    index=df.columns,
)
df = df.round(decimals=decimals_by_column)
df.round(5)


Unnamed: 0,Core Nodes,σ Publication Years,⌀ Publication Years,⌀ References,Core Node Density,⌀ Reference Overlap,Unconnected Core Nodes,Total Nodes,Total Density,Components,Modularity DC,Clusters DC,Density,Average Shortest Path,Diameter,Modularity BC,Clusters BC
Baghizadhe2019,92,6.27,2006.76,46.27,0.039,0.007,1,3317,0.0008,2,0.809,32,0.253,1.88,4,0.429,5
Darcy2011,15,6.08,2005.4,45.2,0.267,0.026,1,536,0.0047,2,0.706,11,0.791,1.23,3,0.237,3
Günther2017,67,0.81,2014.6,42.4,0.015,0.011,2,2248,0.0011,3,0.773,28,0.464,1.6,4,0.266,6
Moeini2019,75,5.75,2002.32,30.96,0.114,0.025,1,1462,0.0022,2,0.673,20,0.489,1.56,4,0.334,7
Oehlhorn2020,70,7.97,2005.23,45.23,0.094,0.014,3,2143,0.0014,4,0.68,25,0.541,1.46,3,0.25,5
Peireira2020,107,4.3,2012.85,28.91,0.011,0.004,12,2697,0.0009,13,0.878,43,0.169,2.17,5,0.504,10
Picollo2005,56,5.1,1995.77,39.25,0.053,0.014,2,1673,0.0016,3,0.737,22,0.461,1.57,3,0.253,6
Schneider2014,88,6.08,2005.76,37.57,0.06,0.011,1,2281,0.0013,2,0.694,26,0.392,1.63,4,0.257,5
Siponen2004,20,4.47,1996.7,20.35,0.111,0.017,2,332,0.0074,3,0.718,13,0.456,1.61,3,0.125,3
Siponen2020,131,4.82,2011.27,39.4,0.115,0.025,0,2957,0.0012,1,0.61,22,0.697,1.31,3,0.274,5


In [160]:
# create different tables from whole df
df_all_columns = df.columns

# positional column groups: the first 7 columns, the next 5, and the rest
df_core_columns, df_direct_citation_columns, df_bibliographic_coupling_columns = (
    df_all_columns[:7],
    df_all_columns[7:12],
    df_all_columns[12:],
)

# one sub-table per column group (all rows kept)
df_core = df[df_core_columns]
df_direct_citation = df[df_direct_citation_columns]
df_bib_coupling = df[df_bibliographic_coupling_columns]


In [161]:
# to latex
# emit one LaTeX tabular per sub-table, row index included
for table in (df_core, df_direct_citation, df_bib_coupling):
    print(table.to_latex(index=True))



\begin{tabular}{lrrrrrrr}
\toprule
{} &  Core Nodes &  σ Publication Years &  ⌀ Publication Years &  ⌀ References &  Core Node Density &  ⌀ Reference Overlap &  Unconnected Core Nodes \\
\midrule
Baghizadhe2019 &  92 &  6.27 &  2006.76 &  46.27 &  0.039 &  0.007 &  1 \\
Darcy2011      &  15 &  6.08 &  2005.40 &  45.20 &  0.267 &  0.026 &  1 \\
Günther2017    &  67 &  0.81 &  2014.60 &  42.40 &  0.015 &  0.011 &  2 \\
Moeini2019     &  75 &  5.75 &  2002.32 &  30.96 &  0.114 &  0.025 &  1 \\
Oehlhorn2020   &  70 &  7.97 &  2005.23 &  45.23 &  0.094 &  0.014 &  3 \\
Peireira2020   &  107 &  4.30 &  2012.85 &  28.91 &  0.011 &  0.004 &  12 \\
Picollo2005    &  56 &  5.10 &  1995.77 &  39.25 &  0.053 &  0.014 &  2 \\
Schneider2014  &  88 &  6.08 &  2005.76 &  37.57 &  0.060 &  0.011 &  1 \\
Siponen2004    &  20 &  4.47 &  1996.70 &  20.35 &  0.111 &  0.017 &  2 \\
Siponen2020    &  131 &  4.82 &  2011.27 &  39.40 &  0.115 &  0.025 &  0 \\
Teubner2020    &  33 &  2.37 &  2013.88 &  65.09 & 

In [171]:
# correlations
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "do not truncate"; the former sentinel -1 is deprecated since
# pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)

correlation_matrix = df.corr(method='pearson')
# Series with a two-level index; selecting a first-level label yields that
# column's correlation with every column of `df`.
correlation_list = correlation_matrix.unstack()


def ranked_correlation_frame(correlations, column):
    """Return the correlations of ``column``, ranked by absolute value.

    Parameters
    ----------
    correlations : pandas.Series
        Unstacked correlation matrix (two-level index).
    column : str
        First-level label whose correlations are selected.

    Returns
    -------
    pandas.DataFrame
        Single column "Pearson Correlation", rows ordered from the largest to
        the smallest absolute correlation (signs are kept).
    """
    column_correlation = correlations.loc[column]
    ranking = column_correlation.abs().sort_values(ascending=False).index
    return column_correlation.reindex(ranking).to_frame(name="Pearson Correlation")


# Number of Core Nodes
core_nodes_correlation_frame = ranked_correlation_frame(correlation_list, 'Core Nodes')

# σ Publication Years
publication_year_deviation_correlation_frame = ranked_correlation_frame(correlation_list, 'σ Publication Years')

# ⌀ Publication Years
publication_year_mean_correlation_frame = ranked_correlation_frame(correlation_list, '⌀ Publication Years')

# ⌀ References
core_node_reference_correlation_frame = ranked_correlation_frame(correlation_list, '⌀ References')

for correlation_frame in (
    core_nodes_correlation_frame,
    publication_year_deviation_correlation_frame,
    publication_year_mean_correlation_frame,
    core_node_reference_correlation_frame,
):
    print(correlation_frame.to_latex(index=True))


#ranked_correlation_list = correlation_list.reindex(correlation_list.abs().sort_values(ascending=True).index)
#print(ranked_correlation_list[-190:-20:2])

\begin{tabular}{lr}
\toprule
{} &  Pearson Correlation \\
\midrule
Core Nodes             &  1.000000 \\
Total Nodes            &  0.861168 \\
Total Density          & -0.648297 \\
Clusters DC            &  0.575108 \\
Clusters BC            &  0.546229 \\
Modularity BC          &  0.503340 \\
Core Node Density      & -0.384697 \\
Diameter               &  0.377580 \\
⌀ Publication Years    &  0.316945 \\
Density                & -0.310647 \\
⌀ Reference Overlap    & -0.247556 \\
Average Shortest Path  &  0.238810 \\
⌀ References           & -0.164275 \\
Components             &  0.157838 \\
σ Publication Years    &  0.157562 \\
Unconnected Core Nodes &  0.141075 \\
Modularity DC          & -0.012458 \\
\bottomrule
\end{tabular}

\begin{tabular}{lr}
\toprule
{} &  Pearson Correlation \\
\midrule
σ Publication Years    &  1.000000 \\
⌀ Publication Years    & -0.595004 \\
Core Node Density      &  0.469994 \\
Modularity DC          & -0.389226 \\
Clusters BC            & -0.223332 \\
Unc