In [None]:
# Keep only the size columns needed for the density calculation. `.copy()` makes
# this an independent frame rather than a view-like slice of w2a_graph_sizes_df.
wiki_graph_usage_df = w2a_graph_sizes_df[['language', 'language_code', 'articles_count',
                                          'external_sources_count', 'edges_count']].copy()


def _directed_density(sizes):
    """Return edges / (nodes * (nodes - 1)), with nodes = articles + external sources."""
    nodes = sizes['articles_count'] + sizes['external_sources_count']
    return sizes['edges_count'] * 1.0 / (nodes * (nodes - 1))


# NOTE(review): the original cell built `wiki_graph_usage_df` and then wrote the
# `density` column into `w2a_graph_density_df`, a name this cell never defined.
# The density frame is now derived from the selection above so the cell runs on a
# fresh kernel and re-running it is idempotent.
w2a_graph_density_df = wiki_graph_usage_df.assign(density=_directed_density)

w2a_graph_density_df

In [None]:
def graph_summary(language_code):
    """Summarise the traffic graph of one language edition.

    Runs four Cypher queries through the module-level ``graph`` client and
    combines their results with a few derived averages.

    Parameters
    ----------
    language_code : str
        Upper-case edition code; must be one of the keys of ``lang_dict`` below.

    Returns
    -------
    dict
        Keys, in order: ``language``, ``articles_count``,
        ``external_sources_count``, ``edges_count``, ``references_count``,
        ``degree_avg``, ``weighted_degree_avg``, ``refs_per_edge_avg``.
        A ratio whose denominator is zero is reported as NaN.

    Raises
    ------
    KeyError
        If ``language_code`` is not a known code (raised before any query runs).
    """
    lang_dict = {'DE': 'German', 'EN': 'English', 'ES': 'Spanish', 'FA': 'Persian', 'RU': 'Russian', 'FR': 'French', \
                 'IT': 'Italian', 'JA': 'Japanese', 'PL': 'Polish', 'PT': 'Portuguese', 'ZH': 'Chinese'}
    # Resolve the display name first so an unknown code fails before the queries are sent.
    language = lang_dict[language_code]

    def _ratio(numerator, denominator):
        # An edition with no nodes or no edges gives a zero denominator; return NaN
        # instead of raising so one empty edition does not abort a whole summary loop.
        return numerator * 1.0 / denominator if denominator else float('nan')

    # Article nodes of this language.
    articles_count = graph.data("""
        MATCH (n1:Article {language_code: {lang_code} }) 
        RETURN count(n1) as articles_count;
        """, lang_code = language_code)
    # ExternalSource nodes of this language.
    external_sources_count = graph.data("""
        MATCH (n1:ExternalSource {language_code: {lang_code} }) 
        RETURN count(n1) as external_sources_count;
        """, lang_code = language_code)

    # REFERRED_TO relationships that point at an article of this language.
    edges_count = graph.data("""
        MATCH ()-[r1:REFERRED_TO]->(n1:Article {language_code: {lang_code} }) 
        RETURN count(r1) as edges_count;
        """, lang_code = language_code)

    # Sum of the `count` property over those same relationships.
    references_count = graph.data("""
        MATCH ()-[r1:REFERRED_TO]->(n1:Article {language_code: {lang_code} }) 
        RETURN sum(toInt(r1.count)) as references_count;
        """, lang_code = language_code)

    nodes_total = articles_count[0]['articles_count'] + \
        external_sources_count[0]['external_sources_count']
    edges_total = edges_count[0]['edges_count']
    references_total = references_count[0]['references_count']

    d = dict({'language': language}, **articles_count[0], **external_sources_count[0], \
            **edges_count[0], **references_count[0])
    # average degree
    d['degree_avg'] = _ratio(edges_total, nodes_total)
    # average weighted degree
    d['weighted_degree_avg'] = _ratio(references_total, nodes_total)
    # average references per edge
    d['refs_per_edge_avg'] = _ratio(references_total, edges_total)
    return d

In [None]:
# Language editions covered by the analysis: edition code -> display name.
lang_dict = dict(
    DE='German', EN='English', ES='Spanish', FA='Persian', RU='Russian', FR='French',
    IT='Italian', JA='Japanese', PL='Polish', PT='Portuguese', ZH='Chinese',
)

# One summary record per edition; each call queries the graph, so report progress.
graph_summaries_list = []
for lang in lang_dict:
    graph_summaries_list.append(graph_summary(lang))
    print("Finished " + lang)

# Build the frame once from the collected records, indexed by language name.
graph_summaries_df = pd.DataFrame(graph_summaries_list).set_index('language')
graph_summaries_df


In [None]:
# Show the editions with the most articles first; graph_summaries_df itself is not reassigned.
graph_summaries_df.sort_values('articles_count', ascending=False)

In [None]:
# Not running this, because these graphs are too large and it would take forever,
# but this query illustrates graph diameter and how to compute it.
# graph_diameter_de = graph.data("""
#    MATCH (a:Article {language_code:'DE'}), (b:Article {language_code:'DE'}) WHERE id(a) > id(b)
#    MATCH p=shortestPath((a)-[:REFERRED_TO*]-(b))
#    RETURN length(p) AS len, extract(x IN nodes(p) | x.title) AS path
#    ORDER BY len DESC LIMIT 5
#    """)
#    graph_diameter_de

In [None]:
# the same network properties summary as above, but for the article2article subgraph
def articles_subgraph_summary(language_code):
    """Summarise the article-to-article subgraph of one language edition.

    Runs five Cypher queries through the module-level ``graph`` client and
    combines their results with derived counts and averages.

    Parameters
    ----------
    language_code : str
        Upper-case edition code; must be one of the keys of ``lang_dict`` below.

    Returns
    -------
    dict
        Keys, in order: ``language``, ``articles_count``, ``a2a_nodes_count``,
        ``ala_nodes_count``, ``a2a_edges_count``, ``a2a_references_count``,
        ``e2a_nodes_count``, ``asa_nodes_count``, ``a2a_degree_avg``,
        ``a2a_weighted_degree_avg``, ``a2a_refs_per_link_avg``.
        A ratio whose denominator is zero is reported as NaN.

    Raises
    ------
    KeyError
        If ``language_code`` is not a known code (raised before any query runs).
    """
    lang_dict = {'DE': 'German', 'EN': 'English', 'ES': 'Spanish', 'FA': 'Persian', 'RU': 'Russian', 'FR': 'French', \
                 'IT': 'Italian', 'JA': 'Japanese', 'PL': 'Polish', 'PT': 'Portuguese', 'ZH': 'Chinese'}
    # Resolve the display name first so an unknown code fails before the queries are sent.
    language = lang_dict[language_code]

    def _ratio(numerator, denominator):
        # An edition with no connected articles or no a2a edges gives a zero
        # denominator; return NaN instead of raising ZeroDivisionError.
        return numerator * 1.0 / denominator if denominator else float('nan')

    # count all articles
    articles_count = graph.data("""
        MATCH (n1:Article {language_code: {lang_code} }) 
        RETURN count(n1) as articles_count;
        """, lang_code = language_code)

    # count articles that are connected to other articles (relationship in either direction)
    a2a_nodes_count = graph.data("""
        MATCH (n1:Article {language_code: {lang_code} })-[r1:REFERRED_TO]-(n2:Article {language_code: {lang_code} }) 
        RETURN count(distinct n1) as a2a_nodes_count;
        """, lang_code = language_code)

    # count articles that are not connected to any other article
    # (users accessed such articles from external pages only, and did not go to another Wiki article from them)
    e2a_nodes_count = articles_count[0]['articles_count'] - a2a_nodes_count[0]['a2a_nodes_count']

    # A count of articles that take part in at least one 'link'-typed relationship
    # with another article, in either direction
    # (the a2a relationships that are not links are either internal wiki searches or spoofs)
    ala_nodes_count = graph.data("""
        MATCH (n1:Article {language_code: {lang_code} })-[r1:REFERRED_TO {type: 'link'}]-(n2:Article {language_code: {lang_code} }) 
        RETURN count(distinct n1) as ala_nodes_count;
        """, lang_code = language_code)

    # A count of connected articles that have no 'link'-typed relationship with
    # another article, i.e. connected only through non-link relationships
    asa_nodes_count = a2a_nodes_count[0]['a2a_nodes_count'] - ala_nodes_count[0]['ala_nodes_count']

    # directed article-to-article relationships
    a2a_edges_count = graph.data("""
        MATCH (n1:Article {language_code: {lang_code} })-[r1:REFERRED_TO]->(n2:Article {language_code: {lang_code} }) 
        RETURN count(r1) as a2a_edges_count;
        """, lang_code = language_code)

    # sum of the `count` property over those same relationships
    a2a_references_count = graph.data("""
        MATCH (n1:Article {language_code: {lang_code} })-[r1:REFERRED_TO]->(n2:Article {language_code: {lang_code} }) 
        RETURN sum(toInt(r1.count)) as a2a_references_count;
        """, lang_code = language_code)

    connected_total = a2a_nodes_count[0]['a2a_nodes_count']
    edges_total = a2a_edges_count[0]['a2a_edges_count']
    references_total = a2a_references_count[0]['a2a_references_count']

    d = dict({'language': language}, **articles_count[0], **a2a_nodes_count[0], \
            **ala_nodes_count[0], **a2a_edges_count[0], **a2a_references_count[0])
    d['e2a_nodes_count'] = e2a_nodes_count
    d['asa_nodes_count'] = asa_nodes_count
    # average degree
    d['a2a_degree_avg'] = _ratio(edges_total, connected_total)
    # average weighted degree
    d['a2a_weighted_degree_avg'] = _ratio(references_total, connected_total)
    # average references per link
    d['a2a_refs_per_link_avg'] = _ratio(references_total, edges_total)
    return d

In [None]:
# One article-to-article summary record per edition; each call queries the graph,
# so report progress as editions complete.
a2a_subgraph_summaries_list = []
for lang in lang_dict:
    a2a_subgraph_summaries_list.append(articles_subgraph_summary(lang))
    print("Finished " + lang)

# Build the frame once, index it by language name and list the largest editions first.
subgraph_summaries_df = (
    pd.DataFrame(a2a_subgraph_summaries_list)
    .set_index('language')
    .sort_values(by='articles_count', ascending=False)
)
subgraph_summaries_df


In [None]:
# graph.data('CALL db.indexes;')