In [1]:
import pandas as pd
import numpy as np

import os

import  csv

from time import sleep
from timeit import default_timer as timer

# custom general helper functions for this project
import custom_utils as cu
import importlib

In [2]:
# reload imports as needed
# Re-imports the project helper module `cu` (custom_utils) so that edits made
# to it on disk are picked up without restarting the kernel.
# The trailing `;` keeps the returned module object from being echoed as output.
importlib.reload(cu);

In [3]:
# Notebook-wide pandas display settings, applied in a single call
# (set_option accepts alternating option-name / value arguments).
pd.set_option(
    'display.max_columns', 25,               # widest table rendered in full
    'display.max_rows', 100,                 # longest table rendered in full
    'display.precision', 3,
    'display.float_format', '{:.2f}'.format  # floats shown with two decimals
)

In [4]:
from py2neo import authenticate, Graph, Node, Relationship

In [5]:
# To avoid typing neo4j password into the notebook each time,
# I'm saving it in a separate file and reading it in with the helper function below.
def read_n4jpass(path=None):
    """Read neo4j connection credentials from a ``.n4jpass`` file.

    The file holds one ``key=value`` pair per line. Blank lines and lines
    whose first non-blank character is ``#`` are ignored, e.g.:

        # comments here
        user=neo4j
        password=secretStuff123

    Only the first ``=`` on a line separates the key from the value, so a
    value (a password in particular) may itself contain ``=``. Whitespace
    around the key and around the value is stripped.

    Parameters
    ----------
    path : str, optional
        Location of the credentials file. Defaults to ``.n4jpass`` in the
        current working directory.

    Returns
    -------
    dict
        Maps each key found in the file to its value (both strings). If a
        key appears more than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-blank, non-comment line contains no ``=``.
    """
    if path is None:
        path = os.path.join(os.getcwd(), '.n4jpass')

    creds = {}
    with open(path, 'r') as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.strip()
            # Skip blanks and comments; the check runs on the stripped line so
            # an indented "#" is still recognised as a comment.
            if not line or line.startswith('#'):
                continue

            # partition() splits on the first "=" only, keeping any further
            # "=" characters as part of the value.
            key, sep, value = line.partition('=')
            if not sep:
                # The line's text is deliberately left out of the message:
                # it may contain a secret.
                raise ValueError(
                    "{}: line {} is not in key=value form".format(path, lineno))

            creds[key.strip()] = value.strip()

    return creds

In [6]:
# Load the neo4j credentials from the .n4jpass file into a dict; the
# authentication cell reads its "user" and "password" entries.
n4j_cred = read_n4jpass()

In [7]:
# set up authentication parameters
# Hands the "user" and "password" entries of n4j_cred to py2neo for the server
# at localhost:7474, so no secret is typed into the notebook itself.
# NOTE(review): `authenticate` is presumably only available in older py2neo
# releases (v3.x) -- confirm the pinned py2neo version before upgrading.
authenticate("localhost:7474", n4j_cred["user"], n4j_cred["password"])

In [8]:
# connect to authenticated graph database
# `graph` is the handle that every query cell in this notebook calls
# `.data(...)` on; it points at the local neo4j HTTP endpoint.
graph = Graph("http://localhost:7474/db/data/")

In [9]:
# test query
# Smoke-test the connection by asking the database to list its indexes, then
# wrap the returned records in a DataFrame so the cell renders them as a table.
r = graph.data('CALL db.indexes;')
pd.DataFrame(r)

Unnamed: 0,description,failureMessage,id,indexName,progress,properties,provider,state,tokenNames,type
0,INDEX ON :Article(title),,1,index_1,100.0,[title],"{'version': '1.0', 'key': 'native-btree'}",ONLINE,[Article],node_unique_property


### Network descriptive statistics

In [None]:
# number of nodes (cypher query)
# Raw Cypher kept for reference, not Python: this cell was never executed by
# the kernel (In [None]). The cell that follows runs the same query through
# py2neo and reports the same count.
match (a:Article)
 return count(a)
# output: 5185699

In [10]:
# number of Article nodes in the graph
# The query text is sent through the py2neo connection and the returned
# records are shown as a one-row table.
q = """
    match (a:Article)
    return count(a);
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,count(a)
0,5185699


In [12]:
# number of directional edges
# The pattern has no relationship type and no node labels, so it counts every
# relationship in the graph, each one once in its stored direction.
q = """
    match ()-[r]->()
    return count(r);
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,count(r)
0,18361180


In [18]:
# total traffic volume in the graph
# Sums the `traffic` property over every relationship, regardless of type.
q = """
    match ()-[r]->()
    return sum(r.traffic);
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,sum(r.traffic)
0,1737027807


In [13]:
# number of directional LINK_TO edges
# Same count as the all-edges query, restricted to relationships of type LINK_TO.
q = """
    match ()-[r:LINK_TO]->()
    return count(r);
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,count(r)
0,17851501


In [19]:
# link traffic volume in the graph
# Sums the `traffic` property over LINK_TO relationships only.
q = """
    match ()-[r:LINK_TO]->()
    return sum(r.traffic);
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,sum(r.traffic)
0,1718277936


In [14]:
# number of directional SEARCH_FOR edges
# Same count as the all-edges query, restricted to relationships of type SEARCH_FOR.
q = """
    match ()-[r:SEARCH_FOR]->()
    return count(r);
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,count(r)
0,509679


In [20]:
# search traffic volume in the graph
# Sums the `traffic` property over SEARCH_FOR relationships only.
q = """
    match ()-[r:SEARCH_FOR]->()
    return sum(r.traffic);
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,sum(r.traffic)
0,18749871


In [16]:
# number of reciprocal LINK_TO/SEARCH_FOR relationships
# Counts two-hop round trips a -LINK_TO-> b -SEARCH_FOR-> a.
# The two hops have different types, so a given pair of relationships can match
# the pattern in only one orientation; the count is therefore not halved
# (unlike the same-type reciprocal counts).
q = """
    match p=(a)-[:LINK_TO]->(b)-[:SEARCH_FOR]->(a)
    return count(p);
 """
r = graph.data(q)
pd.DataFrame(r)


Unnamed: 0,count(p)
0,85226


In [21]:
# number of reciprocal LINK_TO relationships
# Counts pairs of articles that link to each other.
# The pattern is symmetric: a mutual pair matches once starting from each of
# its two endpoints, so count(p) is divided by 2 to count each pair once.
q = """
    match p=(a)-[r1:LINK_TO]->(b)-[r2:LINK_TO]->(a)
    return count(p)/2;
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,count(p)/2
0,2959959


In [22]:
# number of reciprocal SEARCH_FOR relationships
# Counts pairs of articles joined by SEARCH_FOR relationships in both directions.
# The pattern is symmetric: a mutual pair matches once starting from each of
# its two endpoints, so count(p) is divided by 2 to count each pair once.
q = """
    match p=(a)-[r1:SEARCH_FOR]->(b)-[r2:SEARCH_FOR]->(a)
    return count(p)/2;
 """
r = graph.data(q)
pd.DataFrame(r)

Unnamed: 0,count(p)/2
0,33236


### Calculate degrees

In [None]:
# cypher-shell query to calculate degrees and save them on nodes
# For every Article node, stores four properties: the number of incoming and
# outgoing SEARCH_FOR relationships and of incoming and outgoing LINK_TO
# relationships. Each property is set on every matched node, which is why the
# reported total below is 4 x 5185699 Article nodes = 20742796 properties.
MATCH (a:Article)
SET a.search_in_degree = size((a)<-[:SEARCH_FOR]-()),
    a.search_out_degree = size((a)-[:SEARCH_FOR]->()),
    a.link_in_degree = size((a)<-[:LINK_TO]-()),
    a.link_out_degree = size((a)-[:LINK_TO]->())
;
# 0 rows available after 75292 ms, consumed after another 0 ms
# Set 20742796 properties


In [None]:
# cypher-shell queries to calculate traffic volumes per node and save them on nodes

# in-search traffic
# Per Article, sums `traffic` over its incoming SEARCH_FOR relationships and
# stores the total as `search_in_traffic`.
# Only articles with at least one incoming SEARCH_FOR relationship match, so
# all other articles get no `search_in_traffic` property (absent, not 0).
MATCH (a:Article)<-[si:SEARCH_FOR]-()
WITH a, SUM(si.traffic) AS s
SET a.search_in_traffic = s
;
# 0 rows available after 12231 ms, consumed after another 0 ms
# Set 249765 properties


# out-search traffic
# Per Article, sums `traffic` over its outgoing SEARCH_FOR relationships and
# stores the total as `search_out_traffic`.
# Only articles with at least one outgoing SEARCH_FOR relationship match, so
# all other articles get no `search_out_traffic` property (absent, not 0).
MATCH (a:Article)-[so:SEARCH_FOR]->()
WITH a, SUM(so.traffic) AS s
SET a.search_out_traffic = s
;
# 0 rows available after 9980 ms, consumed after another 0 ms
# Set 237126 properties


# in-link traffic
# Per Article, sums `traffic` over its incoming LINK_TO relationships and
# stores the total as `link_in_traffic`.
# Only articles with at least one incoming LINK_TO relationship match, so
# all other articles get no `link_in_traffic` property (absent, not 0).
MATCH (a:Article)<-[li:LINK_TO]-()
WITH a, SUM(li.traffic) AS s
SET a.link_in_traffic = s
;
# 0 rows available after 61065 ms, consumed after another 0 ms
# Set 2598301 properties



# out-link traffic
# Per Article, sums `traffic` over its outgoing LINK_TO relationships and
# stores the total as `link_out_traffic`.
# Only articles with at least one outgoing LINK_TO relationship match, so
# all other articles get no `link_out_traffic` property (absent, not 0).
MATCH (a:Article)-[lo:LINK_TO]->()
WITH a, SUM(lo.traffic) AS s
SET a.link_out_traffic = s
;
# 0 rows available after 59575 ms, consumed after another 0 ms
# Set 1703704 properties

