# Data analysis with Neo4j and Python for RESIDE_IN data

Imports:

In [1]:
import os

from py2neo import Graph, Database , NodeMatcher, RelationshipMatcher

In [2]:
from neo4j import GraphDatabase

Connect to the database:

In [3]:
uri = 'bolt://neo4j-hdx:7687'

### Py2neo

In [35]:
# Credentials are read from the environment when NEO4J_USER / NEO4J_PASSWORD
# are set; the fallbacks are the values that were previously hard-coded here,
# so the notebook behaves exactly as before when they are absent.
default_db = Database(
    uri,
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ.get("NEO4J_PASSWORD", "test")),
)

In [36]:
default_db.name

'graphHDX.db'

Default graph in the database:

In [37]:
default_db.default_graph.name

'data'

Instantiate a Graph object connecting to the default graph 'data' exposed in 'graphHDX.db':

In [38]:
# Credentials are read from the environment when NEO4J_USER / NEO4J_PASSWORD
# are set; the fallbacks are the values that were previously hard-coded here,
# so the notebook behaves exactly as before when they are absent.
graph = Graph(
    uri,
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ.get("NEO4J_PASSWORD", "test")),
)

In [39]:
graph.database.name

'graphHDX.db'

Get the number of nodes in the graph:

In [40]:
len(graph.nodes)

7376

Get the number of relationships in the graph:

In [41]:
len(graph.relationships)

124501

Explore the Schema of the graph:

In [42]:
graph.schema.node_labels

frozenset({'Country', 'CountryYear'})

In [43]:
rel_types = graph.schema.relationship_types
rel_types

frozenset({'1960',
           '1961',
           '1962',
           '1963',
           '1964',
           '1965',
           '1966',
           '1967',
           '1968',
           '1969',
           '1970',
           '1971',
           '1972',
           '1973',
           '1974',
           '1975',
           '1976',
           '1977',
           '1978',
           '1979',
           '1980',
           '1981',
           '1982',
           '1983',
           '1984',
           '1985',
           '1986',
           '1987',
           '1988',
           '1989',
           '1990',
           '1991',
           '1992',
           '1993',
           '1994',
           '1995',
           '1996',
           '1997',
           '1998',
           '1999',
           '2000',
           '2001',
           '2002',
           '2003',
           '2004',
           '2005',
           '2006',
           '2007',
           '2008',
           '2009',
           '2010',
           '2011',
           '

Get all years with available data from the relationship_types:

In [48]:
# Keep only the all-digit relationship types (the year labels such as '1960')
# and order them ascending; the values stay strings.
years_list = sorted(
    rel_type
    for rel_type in graph.schema.relationship_types
    if rel_type.isdigit()
)
years_list

['1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017']

Get all possible countries with all years that have data available:

In [61]:
nodes_country_year_list = list(graph.nodes.match("CountryYear"))
len(nodes_country_year_list)

7154

In [67]:
# Print one readable sentence for each of the first 100 CountryYear nodes.
for node in nodes_country_year_list[:100]:
    sentence = (
        node["country"]
        + " had a total population of "
        + str(node["population"])
        + " in "
        + str(node["year"])
    )
    print(sentence)

Netherlands had a total population of 15184166.0 in 1992
Netherlands had a total population of 15290368.0 in 1993
Netherlands had a total population of 15382838.0 in 1994
Netherlands had a total population of 15459006.0 in 1995
Netherlands had a total population of 15530498.0 in 1996
Netherlands had a total population of 15610650.0 in 1997
Netherlands had a total population of 15707209.0 in 1998
Netherlands had a total population of 15812088.0 in 1999
Netherlands had a total population of 15925513.0 in 2000
Netherlands had a total population of 16046180.0 in 2001
Netherlands had a total population of 16148929.0 in 2002
Netherlands had a total population of 16225302.0 in 2003
Netherlands had a total population of 16281779.0 in 2004
Netherlands had a total population of 16319868.0 in 2005
Netherlands had a total population of 16346101.0 in 2006
Netherlands had a total population of 16381696.0 in 2007
Netherlands had a total population of 16445593.0 in 2008
Netherlands had a total populat

Using NodeMatcher() class

In [68]:
matcher_node = NodeMatcher(graph)

In [69]:
matcher_node.match("CountryYear", year=2009).first()

(_17:CountryYear {country: 'Netherlands', countryearId: 'Netherlands2009', pop_growth_percentage: 0.5142845511436462, population: 16530388.0, urban_pop_percentage: 86.28800201416016, year: 2009})

Using RelationshipMatcher() class

In [73]:
matcher_rel = RelationshipMatcher(graph)

In [77]:
matcher_rel.match(r_type="RESIDE_IN").first()

(_2)-[:RESIDE_IN {affected_refugees: 75.0, affected_total: 75.0}]->(_4755)

### Neo4j driver

Connect to the database using the driver class `GraphDatabase`:

In [4]:
# Credentials are read from the environment when NEO4J_USER / NEO4J_PASSWORD
# are set; the fallbacks are the values that were previously hard-coded here,
# so the notebook behaves exactly as before when they are absent.
driver = GraphDatabase.driver(
    uri,
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ.get("NEO4J_PASSWORD", "test")),
)

Define a query that builds a subgraph with the data of a given year, sums the `affected_total` property (and `affected_refugees`) over all relationships that leave each node, orders the result by the summed total in descending order, and returns the country together with its summed values for the first five results:

In [33]:
def print_total_origin_from(tx, year=2016):
    """Print the five source countries with the largest summed ``affected_total``.

    Matches every ``RESIDE_IN`` relationship whose two ``CountryYear`` end
    nodes both belong to ``year``, sums ``affected_total`` and
    ``affected_refugees`` per source country (``a.country``) and prints the
    five records with the highest ``totalAffected``.

    Args:
        tx: transaction object supplied by ``session.read_transaction``.
        year: year both end nodes must have. Defaults to 2016, the value that
            used to be hard-coded, so existing callers are unaffected.

    Raises:
        ValueError: if ``year`` cannot be converted to an integer.
    """
    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->(b:CountryYear) "
        "WHERE a.year=$year AND b.year=$year "
        "RETURN a.country, sum(r.affected_total) as totalAffected, sum(r.affected_refugees) as refugeesAffected "
        "ORDER BY totalAffected DESC LIMIT 5 "
    )
    # The year travels as a query parameter instead of being part of the query
    # text; int() keeps the comparison numeric even if a string is passed.
    for record in tx.run(query, year=int(year)):
        print(record)

In [34]:
with driver.session() as session:
    session.read_transaction(print_total_origin_from)

<Record a.country='Syrian Arab Rep.' totalAffected=12643089.0 refugeesAffected=5524374.0>
<Record a.country='Colombia' totalAffected=7734587.0 refugeesAffected=311027.0>
<Record a.country='Iraq' totalAffected=5611595.0 refugeesAffected=316030.0>
<Record a.country='Afghanistan' totalAffected=5166125.0 refugeesAffected=2501445.0>
<Record a.country='South Sudan' totalAffected=4048612.0 refugeesAffected=1436719.0>


Similar query to the previous one, but now the year is a variable. When calling the driver session, it goes through every value in the list `years_list` (all available years) and returns the values accordingly:

In [46]:
def print_total_reside_in(tx, year):
    """Print the five residence countries with the largest summed ``affected_total``.

    Matches every ``RESIDE_IN`` relationship whose two ``CountryYear`` end
    nodes both belong to ``year``, sums ``affected_total`` per target country
    (``b.country``) and prints the five records with the highest
    ``totalAffected``.

    Args:
        tx: transaction object supplied by ``session.read_transaction``.
        year: year to filter on; an int or a digit string such as ``'1960'``.

    Raises:
        ValueError: if ``year`` cannot be converted to an integer.
    """
    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->(b:CountryYear) "
        "WHERE a.year=$year AND b.year=$year "
        "RETURN b.country, sum(r.affected_total) as totalAffected "
        "ORDER BY totalAffected DESC LIMIT 5 "
    )
    # Bind the year as a query parameter instead of formatting it into the
    # query text. Callers may pass digit strings, which the old str.format
    # version turned into a numeric literal; int() preserves that numeric
    # comparison and rejects anything that is not a plain number.
    for record in tx.run(query, year=int(year)):
        print(record)

In [49]:
with driver.session() as session:
    for year in years_list:
        print(year)
        session.read_transaction(print_total_reside_in,year)

1960
<Record b.country='Dem. Rep. of the Congo' totalAffected=150000.0>
1961
<Record b.country='Dem. Rep. of the Congo' totalAffected=203000.0>
<Record b.country='Uganda' totalAffected=52000.0>
<Record b.country='Burundi' totalAffected=30000.0>
<Record b.country='United Rep. of Tanzania' totalAffected=12000.0>
<Record b.country='Togo' totalAffected=5000.0>
1962
<Record b.country='Dem. Rep. of the Congo' totalAffected=223000.0>
<Record b.country='Uganda' totalAffected=52000.0>
<Record b.country='Burundi' totalAffected=30000.0>
<Record b.country='United Rep. of Tanzania' totalAffected=12000.0>
<Record b.country='Togo' totalAffected=5000.0>
1963
<Record b.country='Dem. Rep. of the Congo' totalAffected=237000.0>
<Record b.country='Uganda' totalAffected=59000.0>
<Record b.country='Burundi' totalAffected=34000.0>
<Record b.country='United Rep. of Tanzania' totalAffected=12000.0>
<Record b.country='Senegal' totalAffected=6000.0>
1964
<Record b.country='China, Hong Kong SAR' totalAffected=1120

Given a year and a country of residence, obtain the number of people who requested asylum there and their countries of origin:

In [80]:
def print_origin_countries_given_residence_and_year(tx, year, country_residence):
    """Print, per source country, the people residing in a given country and year.

    Matches every ``RESIDE_IN`` relationship whose two ``CountryYear`` end
    nodes both belong to ``year`` and whose target node has
    ``country == country_residence``. For each one it prints the source
    country with the relationship's ``affected_total``, ``affected_refugees``
    and ``affected_asylum``, ordered by ``affected_total`` descending.

    Args:
        tx: transaction object supplied by ``session.read_transaction``.
        year: year to filter on; an int or a digit string.
        country_residence: name of the residence country. A plain name
            (``'Spain'``) is expected; a name already wrapped in one pair of
            matching quotes (``'"Spain"'``), as the earlier string-formatted
            version required, is still accepted.

    Raises:
        ValueError: if ``year`` cannot be converted to an integer.
    """
    country = country_residence
    trimmed = country_residence.strip()
    # Backward compatibility: the previous implementation pasted the value
    # straight into the query text, so callers had to embed the Cypher quotes.
    # Drop one surrounding pair so both calling styles match the same nodes.
    if len(trimmed) >= 2 and trimmed[0] == trimmed[-1] and trimmed[0] in ('"', "'"):
        country = trimmed[1:-1]

    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->(b:CountryYear) "
        "WHERE a.year=$year AND b.year=$year AND b.country=$country "
        "RETURN a.country, r.affected_total, r.affected_refugees,  r.affected_asylum "
        "ORDER BY r.affected_total DESC"
    )
    # Values are bound as query parameters, so names containing quotes
    # (e.g. "Côte d'Ivoire") cannot break or alter the query.
    for record in tx.run(query, year=int(year), country=country):
        print(record)

In [81]:
year = 2017
country_residence = '"Spain"'

with driver.session() as session:
    session.read_transaction(print_origin_countries_given_residence_and_year,year,country_residence)

<Record a.country='Syrian Arab Rep.' r.affected_total=14007.0 r.affected_refugees=11752.0 r.affected_asylum=2255.0>
<Record a.country='Venezuela (Bolivarian Republic of)' r.affected_total=12509.0 r.affected_refugees=44.0 r.affected_asylum=12465.0>
<Record a.country='Ukraine' r.affected_total=4834.0 r.affected_refugees=368.0 r.affected_asylum=4466.0>
<Record a.country='Colombia' r.affected_total=2662.0 r.affected_refugees=179.0 r.affected_asylum=2483.0>
<Record a.country='Palestinian' r.affected_total=2177.0 r.affected_refugees=862.0 r.affected_asylum=1315.0>
<Record a.country='El Salvador' r.affected_total=1508.0 r.affected_refugees=7.0 r.affected_asylum=1501.0>
<Record a.country='Honduras' r.affected_total=1408.0 r.affected_refugees=35.0 r.affected_asylum=1373.0>
<Record a.country='Cuba' r.affected_total=1085.0 r.affected_refugees=894.0 r.affected_asylum=191.0>
<Record a.country='Cameroon' r.affected_total=886.0 r.affected_refugees=47.0 r.affected_asylum=839.0>
<Record a.country='Soma

#### Algorithms

In [None]:
# Betweenness centrality over the RESIDE_IN network of a single year, computed
# with algo.betweenness.stream on a Cypher projection. The Cypher is kept in a
# string and sent through the driver so this cell is valid Python.
#
# The node projection and the relationship projection must select the same
# year: previously the nodes were filtered on 2009 while the relationships
# were filtered on 2016, so no projected relationship had both end nodes in
# the projected node set.
# NOTE(review): 2016 was chosen to match the relationship filter; confirm that
# 2016 (and not 2009) is the intended year.
betweenness_query = """
CALL algo.betweenness.stream(
  'MATCH (n:CountryYear) WHERE n.year=2016 RETURN id(n) AS id',
  'MATCH (n)-[r:RESIDE_IN]->(m) WHERE n.year=2016 and m.year=2016 RETURN id(n) AS source, id(m) AS target',
  {graph: "cypher"})
YIELD nodeId, centrality
MATCH (cy:CountryYear) WHERE id(cy) = nodeId
RETURN cy.countryearId AS countryyear, centrality
ORDER BY centrality DESC
"""

with driver.session() as session:
    for record in session.run(betweenness_query):
        print(record)