# Data analysis with Neo4j and python for RESIDE_IN data

Imports:

In [1]:
from py2neo import Graph, Database , NodeMatcher

In [12]:
from neo4j import GraphDatabase

Connect to the database:

In [13]:
# Bolt endpoint of the Neo4j server; reused by both the py2neo and the
# neo4j-driver connections created further down.
uri = 'bolt://neo4j-hdx:7687'

### Py2neo

In [2]:
# py2neo handle on the database service reachable at `uri`.
# NOTE(review): username/password are hardcoded — consider loading them from
# the environment before sharing this notebook.
default_db = Database(uri, auth=("neo4j", "test"))

In [3]:
# Display the name of the database we are connected to.
default_db.name

'graphHDX.db'

Default graph in the database:

In [4]:
# Display the name of the database's default graph.
default_db.default_graph.name

'data'

Instantiate a Graph object connecting to the default graph 'data' exposed in 'graphHDX.db':

In [5]:
# Open the graph served at `uri`; this object backs every py2neo query below.
# NOTE(review): username/password are hardcoded — consider loading them from
# the environment before sharing this notebook.
graph = Graph(uri, auth=("neo4j", "test"))

In [6]:
# Display the name of the database that backs this graph.
graph.database.name

'graphHDX.db'

Get the number of nodes in the graph:

In [7]:
# Count the nodes in the graph.
len(graph.nodes)

7376

Get the number of relationships in the graph:

In [8]:
# Count the relationships in the graph.
len(graph.relationships)

124501

Explore the Schema of the graph:

In [9]:
# Display the node labels present in the graph schema.
graph.schema.node_labels

frozenset({'Country', 'CountryYear'})

In [10]:
# Keep the schema's relationship types in a variable, then display them.
rel_types = graph.schema.relationship_types
rel_types

frozenset({'1960',
           '1961',
           '1962',
           '1963',
           '1964',
           '1965',
           '1966',
           '1967',
           '1968',
           '1969',
           '1970',
           '1971',
           '1972',
           '1973',
           '1974',
           '1975',
           '1976',
           '1977',
           '1978',
           '1979',
           '1980',
           '1981',
           '1982',
           '1983',
           '1984',
           '1985',
           '1986',
           '1987',
           '1988',
           '1989',
           '1990',
           '1991',
           '1992',
           '1993',
           '1994',
           '1995',
           '1996',
           '1997',
           '1998',
           '1999',
           '2000',
           '2001',
           '2002',
           '2003',
           '2004',
           '2005',
           '2006',
           '2007',
           '2008',
           '2009',
           '2010',
           '2011',
           '

Get all years with available data from the relationship_types:

In [11]:
# Relationship types made up only of digits are the years with data;
# sorted() collects them into a new list in ascending order.
years_list = sorted(
    rel_type
    for rel_type in graph.schema.relationship_types
    if rel_type.isdigit()
)
years_list

['1974',
 '1970',
 '2010',
 '2002',
 '2006',
 '2016',
 '1967',
 '1979',
 '1969',
 '1975',
 '1961',
 '1960',
 '2013',
 '1995',
 '1998',
 '2000',
 '1984',
 '1971',
 '1966',
 '2008',
 '2005',
 '1991',
 '1988',
 '2015',
 '1994',
 '1968',
 '1978',
 '1992',
 '1985',
 '1993',
 '2017',
 '2001',
 '1996',
 '1989',
 '1982',
 '1977',
 '1986',
 '2011',
 '1964',
 '2004',
 '1973',
 '1962',
 '2007',
 '1983',
 '2009',
 '1972',
 '1980',
 '1990',
 '1981',
 '2012',
 '1997',
 '1976',
 '1987',
 '1965',
 '2014',
 '1999',
 '2003',
 '1963']

In [49]:
# Preview five CountryYear nodes for 2009. The limit is applied to the match
# itself so only five nodes are fetched, instead of materialising every
# matching node client-side and slicing the list afterwards.
list(graph.nodes.match("CountryYear", year=2009).limit(5))

[(_17:CountryYear {country: 'Netherlands', countryearId: 'Netherlands2009', pop_growth_percentage: 0.5142845511436462, population: 16530388.0, urban_pop_percentage: 86.28800201416016, year: 2009}),
 (_38:CountryYear {country: 'Antigua and Barbuda', countryearId: 'Antigua and Barbuda2009', pop_growth_percentage: 1.1856592893600464, population: 93581.0, urban_pop_percentage: 26.81800079345703, year: 2009}),
 (_69:CountryYear {country: 'Cabo Verde', countryearId: 'Cabo Verde2009', pop_growth_percentage: 1.0600026845932007, population: 496963.0, urban_pop_percentage: 61.01599884033203, year: 2009}),
 (_93:CountryYear {country: 'Mauritius', countryearId: 'Mauritius2009', pop_growth_percentage: 0.26553767919540405, population: 1247429.0, urban_pop_percentage: 41.66600036621094, year: 2009}),
 (_144:CountryYear {country: 'Ethiopia', countryearId: 'Ethiopia2009', pop_growth_percentage: 2.6470654010772705, population: 85416256.0, urban_pop_percentage: 16.90999984741211, year: 2009})]

In [36]:
# Build a NodeMatcher bound to `graph` for label/property lookups.
matcher = NodeMatcher(graph)

In [41]:
# Fetch only the first CountryYear node whose `year` property is 2009.
matcher.match("CountryYear", year=2009).first()

(_17:CountryYear {country: 'Netherlands', countryearId: 'Netherlands2009', pop_growth_percentage: 0.5142845511436462, population: 16530388.0, urban_pop_percentage: 86.28800201416016, year: 2009})

### Neo4j driver

Connect to the database using the driver class GraphDatabase:

In [21]:
# Create a neo4j driver for the same `uri`; the sessions used in the cells
# below are opened from it.
# NOTE(review): username/password are hardcoded — consider loading them from
# the environment before sharing this notebook.
driver = GraphDatabase.driver(uri, auth=("neo4j", "test"))

Define a query that restricts the graph to `RESIDE_IN` relationships between `CountryYear` nodes of a given year (2016), sums the `affected_total` and `affected_refugees` properties of all relationships leaving each node, orders the results by the summed `affected_total` in descending order, and returns the country together with both sums for the first five results:

In [50]:
def print_total_origin_from(tx, year=2016):
    """Print the five origin countries with the largest summed ``affected_total``.

    Matches ``RESIDE_IN`` relationships whose two ``CountryYear`` endpoints
    both carry the requested ``year``, groups them by the source node's
    country (``a.country``) and prints one record per country holding the
    summed ``affected_total`` and ``affected_refugees`` values, ordered by the
    ``affected_total`` sum in descending order and limited to five rows.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``;
            supplied by ``session.read_transaction``.
        year: Year both endpoints must match. Defaults to 2016, the value that
            used to be hard-coded in the query. Digit strings are accepted
            and converted to ``int``.

    Returns:
        None. Each record is written to stdout with ``print``.
    """
    # The year is sent as a Cypher parameter rather than spliced into the
    # query text, so the statement itself stays constant across years.
    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->(b:CountryYear) "
        "WHERE a.year = $year AND b.year = $year "
        "RETURN a.country, sum(r.affected_total) as totalAffected, "
        "sum(r.affected_refugees) as refugeesAffected "
        "ORDER BY totalAffected DESC LIMIT 5 "
    )
    # int() keeps the comparison numeric even when the caller passes '2016'.
    for record in tx.run(query, year=int(year)):
        print(record)

In [51]:
# Run the top-five origin query inside a read transaction; the session is
# closed when the `with` block exits.
with driver.session() as session:
    session.read_transaction(print_total_origin_from)

<Record a.country='Syrian Arab Rep.' totalAffected=12643089.0 refugeesAffected=5524374.0>
<Record a.country='Colombia' totalAffected=7734587.0 refugeesAffected=311027.0>
<Record a.country='Iraq' totalAffected=5611595.0 refugeesAffected=316030.0>
<Record a.country='Afghanistan' totalAffected=5166125.0 refugeesAffected=2501445.0>
<Record a.country='South Sudan' totalAffected=4048612.0 refugeesAffected=1436719.0>


Similar query to the previous one, but now the year is a parameter and the totals are grouped by the destination country (`b.country`). The session loop below runs it for every year in `years_list` and prints the results for each year:

In [40]:
def print_total_reside_in(tx, year):
    """Print the five destination countries with the largest summed ``affected_total``.

    Matches ``RESIDE_IN`` relationships whose two ``CountryYear`` endpoints
    both carry the requested ``year``, groups them by the target node's
    country (``b.country``) and prints one record per country with the summed
    ``affected_total``, ordered descending and limited to five rows.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``;
            supplied by ``session.read_transaction``.
        year: Year both endpoints must match, as an ``int`` or a digit string
            (the entries of ``years_list`` are strings).

    Returns:
        None. Each record is written to stdout with ``print``.

    Raises:
        ValueError: If ``year`` cannot be converted to ``int``.
    """
    # Pass the year as a Cypher parameter instead of formatting it into the
    # query text: nothing from the caller can alter the statement.
    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->(b:CountryYear) "
        "WHERE a.year = $year AND b.year = $year "
        "RETURN b.country, sum(r.affected_total) as totalAffected "
        "ORDER BY totalAffected DESC LIMIT 5 "
    )
    # The old str.format call turned '1960' into the numeric literal 1960;
    # int() preserves that numeric comparison for string inputs.
    for record in tx.run(query, year=int(year)):
        print(record)

In [49]:
# One session for the whole sweep; each year's query runs in its own read
# transaction. The session is closed when the `with` block exits.
with driver.session() as session:
    for year in years_list:
        # Header line so the records that follow can be attributed to a year.
        print(year)
        session.read_transaction(print_total_reside_in,year)

1960
<Record b.country='Dem. Rep. of the Congo' totalAffected=150000.0>
1961
<Record b.country='Dem. Rep. of the Congo' totalAffected=203000.0>
<Record b.country='Uganda' totalAffected=52000.0>
<Record b.country='Burundi' totalAffected=30000.0>
<Record b.country='United Rep. of Tanzania' totalAffected=12000.0>
<Record b.country='Togo' totalAffected=5000.0>
1962
<Record b.country='Dem. Rep. of the Congo' totalAffected=223000.0>
<Record b.country='Uganda' totalAffected=52000.0>
<Record b.country='Burundi' totalAffected=30000.0>
<Record b.country='United Rep. of Tanzania' totalAffected=12000.0>
<Record b.country='Togo' totalAffected=5000.0>
1963
<Record b.country='Dem. Rep. of the Congo' totalAffected=237000.0>
<Record b.country='Uganda' totalAffected=59000.0>
<Record b.country='Burundi' totalAffected=34000.0>
<Record b.country='United Rep. of Tanzania' totalAffected=12000.0>
<Record b.country='Senegal' totalAffected=6000.0>
<Record b.country='Togo' totalAffected=5000.0>
1964
<Record b.co