# Data analysis with Neo4j and Python for RESIDE_IN data

## Imports

In [None]:
from py2neo import Graph, Database , NodeMatcher, RelationshipMatcher

In [None]:
from neo4j import GraphDatabase

In [None]:
from tabulate import tabulate

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

## Connect to database:

In [None]:
uri = 'bolt://neo4j-hdx:7687'

## Py2neo

In [None]:
# Handle on the Neo4j server reached through the Bolt URI defined above.
# NOTE(review): the user name and password are hard-coded in the notebook;
# consider loading them from configuration or the environment instead.
default_db = Database(uri, auth=("neo4j", "test"))

In [None]:
default_db.name

Default graph in the database:

In [None]:
default_db.default_graph.name

Instantiate a Graph object connecting to the default graph 'data' exposed in 'graphHDX.db':

In [None]:
# py2neo Graph bound to the same Bolt URI; used by all py2neo cells below.
# NOTE(review): credentials are hard-coded here as well.
graph = Graph(uri, auth=("neo4j", "test"))

In [None]:
graph.database.name

Get the number of nodes in the graph:

In [None]:
len(graph.nodes)

Get the number of relationships in the graph:

In [None]:
len(graph.relationships)

Explore the Schema of the graph:

 - Nodes

In [None]:
graph.schema.node_labels

 - Relationships

In [None]:
rel_types = graph.schema.relationship_types

In [None]:
rel_types

Get all years with available data from the relationship_types:

In [None]:
# Relationship type names made up only of digits are taken as the years with
# data; sorted() returns them as a new list in ascending (string) order.
years_list = sorted(
    rel_type
    for rel_type in graph.schema.relationship_types
    if rel_type.isdigit()
)

In [None]:
years_list

Get all possible countries with all years that have data available:

In [None]:
# Materialise every node matched under the "CountryYear" label into a list so
# the cells below can iterate over it more than once.
nodes_country_year_list = list(graph.nodes.match("CountryYear"))

The total number of the combinations between countries and years with data available is:

In [None]:
len(nodes_country_year_list)

Let's define a function to print, in table format, all countries with the available WB Indicators for a given year:

In [None]:
def print_table_country_indicators_given_year(year, nodes_country_year):
    """Print a grid table of the indicator columns for one year.

    Args:
        year: Value compared with ``==`` against each node's ``"year"`` entry.
        nodes_country_year: Iterable of nodes supporting ``node[key]`` lookup
            for every column listed in ``columns``.
    """
    # Columns shown in the table, in display order.
    columns = (
        "year",
        "country",
        "population",
        "pop_growth_percentage",
        "urban_pop_percentage",
        "int_migrant_stock",
    )

    # One dict per node of the requested year, restricted to the columns above.
    rows = []
    for node in nodes_country_year:
        if node["year"] == year:
            rows.append({column: node[column] for column in columns})

    table = tabulate(
        rows,
        headers="keys",
        tablefmt='fancy_grid',
        stralign='center',
        floatfmt='.0f',
    )
    print(table)

In [None]:
print_table_country_indicators_given_year(2017, nodes_country_year_list)

In [None]:
print_table_country_indicators_given_year(2013, nodes_country_year_list)

Using NodeMatcher() class

In [None]:
matcher_node = NodeMatcher(graph)

In [None]:
matcher_node.match("CountryYear", year=2009).first()

Using RelationshipMatcher() class

In [None]:
matcher_rel = RelationshipMatcher(graph)

In [None]:
matcher_rel.match(r_type="RESIDE_IN").first()

## Neo4j driver

Connect to the database using the driver class GraphDatabase

In [None]:
# Official Neo4j driver bound to the same Bolt URI; the sessions used in the
# cells below are opened from this object.
# NOTE(review): credentials are hard-coded; consider loading them from configuration.
driver = GraphDatabase.driver(uri, auth=("neo4j", "test"))

Define a query that selects the `CountryYear` nodes of a given year, sums the properties `affected_total` and `affected_refugees` over all `RESIDE_IN` relationships that leave each node, orders the results by the summed `affected_total` in descending order, and returns the country, the year and the summed values of the first five results:

In [None]:
def top5_total_origin_from(tx, year):
    """Run the top-five query over outgoing ``RESIDE_IN`` relationships.

    For ``CountryYear`` nodes of the given year, sums ``affected_total`` and
    ``affected_refugees`` over the relationships leaving each node and keeps
    the five rows with the largest ``affectedTotal``.

    The year is sent as a query parameter instead of being formatted into the
    Cypher text, so the value can never alter the statement itself.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        year: Year to filter on; an ``int`` or a digit string such as
            ``"2017"``. It is coerced with ``int()`` so the comparison stays
            numeric, as it was with the formatted literal.

    Returns:
        Whatever ``tx.run`` returns (the driver's result object). Rows carry
        the keys ``country``, ``year``, ``affectedTotal`` and
        ``affectedRefugees``.

    Raises:
        ValueError: If ``year`` cannot be converted to an integer.
    """
    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->() "
        "WHERE a.year = $year "
        "RETURN a.country as country, a.year as year, sum(r.affected_total) as affectedTotal, sum(r.affected_refugees) as affectedRefugees "
        "ORDER BY affectedTotal DESC LIMIT 5 "
    )
    # NOTE(review): the result is handed back unconsumed; confirm the driver
    # version in use allows reading it after the transaction function returns.
    return tx.run(query, year=int(year))

In [None]:
def top5_total_reside_in(tx, year):
    """Run the top-five query over incoming ``RESIDE_IN`` relationships.

    For ``CountryYear`` nodes of the given year, sums ``affected_total`` and
    ``affected_refugees`` over the relationships arriving at each node and
    keeps the five rows with the largest ``affectedTotal``.

    The year is sent as a query parameter instead of being formatted into the
    Cypher text, so the value can never alter the statement itself.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        year: Year to filter on; an ``int`` or a digit string such as
            ``"2017"``. It is coerced with ``int()`` so the comparison stays
            numeric, as it was with the formatted literal.

    Returns:
        Whatever ``tx.run`` returns (the driver's result object). Rows carry
        the keys ``country``, ``year``, ``affectedTotal`` and
        ``affectedRefugees``.

    Raises:
        ValueError: If ``year`` cannot be converted to an integer.
    """
    query = (
        "MATCH (a:CountryYear)<-[r:RESIDE_IN]-() "
        "WHERE a.year = $year "
        "RETURN a.country as country, a.year as year, sum(r.affected_total) as affectedTotal, sum(r.affected_refugees) as affectedRefugees "
        "ORDER BY affectedTotal DESC LIMIT 5 "
    )
    # NOTE(review): the result is handed back unconsumed; confirm the driver
    # version in use allows reading it after the transaction function returns.
    return tx.run(query, year=int(year))

In [None]:
# Gather the top-five residence rows of every available year into one flat list.
with driver.session() as session:
    out = []
    for year in years_list:
        result = session.read_transaction(top5_total_reside_in, year)
        # extend() appends the row dicts directly; no throwaway list of None
        # values is built as with a comprehension used only for its side effect.
        out.extend(result.data())

In [None]:
# Parallel lists with one entry per collected row: label, x value, y value.
types = [data['country'] for data in out]
x_coords = [data['year'] for data in out]
y_coords = [data['affectedTotal'] for data in out]

fig = plt.figure(figsize=(20, 15))

# Walk the three lists in lockstep. The loop variable is called `country` so
# that the builtin `type` is not overwritten in the notebook namespace.
for country, x, y in zip(types, x_coords, y_coords):
    plt.scatter(x, y, marker='x', color='red')
    # Small offset so the label does not sit on top of its marker.
    plt.text(x + 0.3, y + 0.3, country, fontsize=9)

plt.show()

In [None]:
def print_total_origin_from(tx):
    """Print the five largest origin totals for the fixed year 2016.

    Sums ``affected_total`` and ``affected_refugees`` over the ``RESIDE_IN``
    relationships leaving each ``CountryYear`` node with ``year`` 2016 and
    prints every returned record.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
    """
    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->() "
        "WHERE a.year=2016 "
        "RETURN a.country, sum(r.affected_total) as totalAffected, sum(r.affected_refugees) as refugeesAffected "
        "ORDER BY totalAffected DESC LIMIT 5 "
    )
    records = tx.run(query)
    for row in records:
        print(row)

In [None]:
# Run the fixed-year report through a read transaction; the session is used
# as a context manager for the duration of the call.
with driver.session() as session:
    session.read_transaction(print_total_origin_from)

Similar query to the previous one, but now the year is a variable. When calling the driver session, it goes through all values in the list `years_list` (all possible years) and prints the results for each one:

In [None]:
def print_total_reside_in(tx,year):
    """Print the five largest residence totals for one year.

    Sums ``affected_total`` over the ``RESIDE_IN`` relationships arriving at
    each ``CountryYear`` node of the given year and prints every returned
    record.

    The year is sent as a query parameter instead of being formatted into the
    Cypher text, so the value can never alter the statement itself.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        year: Year to filter on; an ``int`` or a digit string such as
            ``"2017"``. It is coerced with ``int()`` so the comparison stays
            numeric, as it was with the formatted literal.

    Raises:
        ValueError: If ``year`` cannot be converted to an integer.
    """
    query = (
        "MATCH ()-[r:RESIDE_IN]->(b:CountryYear) "
        "WHERE b.year = $year "
        "RETURN b.country, sum(r.affected_total) as totalAffected "
        "ORDER BY totalAffected DESC LIMIT 5 "
    )
    for record in tx.run(query, year=int(year)):
        print(record)

In [None]:
# One read transaction per entry of years_list; the year is printed first as
# a heading for the records printed by the transaction function.
with driver.session() as session:
    for year in years_list:
        print(year)
        session.read_transaction(print_total_reside_in,year)

Given year and country of residence, obtain the number of people that request asylum and their country of origin:

In [None]:
def print_origin_countries_given_residence_and_year(tx,year,country_residence):
    """Print, per origin country, the figures for one residence country and year.

    Matches ``(a)-[r:RESIDE_IN]->(b)`` where both nodes have the given year
    and ``b.country`` equals the residence country, then prints each record
    (``a.country`` plus the relationship's ``affected_total``,
    ``affected_refugees`` and ``affected_asylum``), largest total first.

    Year and country are sent as query parameters instead of being formatted
    into the Cypher text, so neither value can alter the statement itself.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        year: Year to filter on; coerced with ``int()``.
        country_residence: Country name. A name wrapped in one matching pair
            of quotes (e.g. ``'"Spain"'``, the form needed when the value was
            pasted into the query text) is still accepted: the pair is removed.

    Raises:
        ValueError: If ``year`` cannot be converted to an integer.
    """
    country = country_residence
    # Backward compatibility: drop one surrounding pair of identical quotes.
    if len(country) >= 2 and country[0] == country[-1] and country[0] in ('"', "'"):
        country = country[1:-1]

    query = (
        "MATCH (a:CountryYear)-[r:RESIDE_IN]->(b:CountryYear) "
        "WHERE a.year = $year AND b.year = $year AND b.country = $country "
        "RETURN a.country, r.affected_total, r.affected_refugees,  r.affected_asylum "
        "ORDER BY r.affected_total DESC"
    )
    for record in tx.run(query, year=int(year), country=country):
        print(record)

In [None]:
year = 2017
# The country name carries its own double quotes (Cypher string-literal form).
country_residence = '"Spain"'

# List the origin countries for the residence country and year chosen above.
with driver.session() as session:
    session.read_transaction(print_origin_countries_given_residence_and_year,year,country_residence)

Given year and country of origin, obtain the number of people that request asylum and their country of residence:

In [None]:
def print_residence_countries_given_origin_and_year(tx,year,country_origin):
    """Print, per residence country, the figures for one origin country and year.

    Matches ``(a)<-[r:RESIDE_IN]-(b)`` where both nodes have the given year
    and ``b.country`` equals the origin country, then prints each record
    (``a.country`` plus the relationship's ``affected_total``,
    ``affected_refugees`` and ``affected_asylum``), largest total first.

    Year and country are sent as query parameters instead of being formatted
    into the Cypher text, so neither value can alter the statement itself.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        year: Year to filter on; coerced with ``int()``.
        country_origin: Country name. A name wrapped in one matching pair of
            quotes (e.g. ``'"Spain"'``, the form needed when the value was
            pasted into the query text) is still accepted: the pair is removed.

    Raises:
        ValueError: If ``year`` cannot be converted to an integer.
    """
    country = country_origin
    # Backward compatibility: drop one surrounding pair of identical quotes.
    if len(country) >= 2 and country[0] == country[-1] and country[0] in ('"', "'"):
        country = country[1:-1]

    query = (
        "MATCH (a:CountryYear)<-[r:RESIDE_IN]-(b:CountryYear) "
        "WHERE a.year = $year AND b.year = $year AND b.country = $country "
        "RETURN a.country, r.affected_total, r.affected_refugees,  r.affected_asylum "
        "ORDER BY r.affected_total DESC"
    )
    for record in tx.run(query, year=int(year), country=country):
        print(record)

In [None]:
year = 2017
# The country name carries its own double quotes (Cypher string-literal form).
country_origin = '"United States of America"'

# List the residence countries for the origin country and year chosen above.
with driver.session() as session:
    session.read_transaction(print_residence_countries_given_origin_and_year,year,country_origin)

## Export subgraph

to graphml

In [None]:
# The APOC export is a Cypher procedure call, not Python, so a bare statement
# in this cell is a syntax error; send it through the driver session instead.
# NOTE(review): presumably the server must have APOC installed and file export
# enabled for this call to succeed — confirm against the deployment.
export_query = """CALL apoc.export.graphml.query("MATCH p=()<-[r:RESIDE_IN]-(n) WHERE n.country='Spain' AND n.year=2017 RETURN p","./data/prueba.graphml",{})"""

with driver.session() as session:
    # data() reads the rows returned by the procedure before the session closes.
    export_summary = session.run(export_query).data()

export_summary

to json