# Extract egonets


This notebook takes the ids of addresses in southwestern Germany that appear in the Panama Papers database and uses them to retrieve the people connected to those addresses as well as the entities (companies, etc.) those people are linked to. 
The data is stored in a Neo4j graph database and is retrieved via the Python neo4j GraphDatabase driver. 
In the db, the graph pattern that is queried looks as follows:

    (address)-(person)-(entity)
    
A person can have several different relations with the same entity. Each distinct relation is returned as its own row and stored in a pandas DataFrame.

In [1]:
#imports
#create myconfig.py file with the credentials (uri, user, pwd) in order to access db
import os
import pandas as pd
import numpy as np
import csv
from neo4j import GraphDatabase
from myconfig import *

In [3]:
class Neo4jConnection:

    '''
    This class handles the connection to the Neo4j DBMS.
    Source: https://towardsdatascience.com/neo4j-cypher-python-7a919a372be7

    Initialization variables
    uri: URI used to connect to a database from the client
    user: DBMS username
    pwd: DBMS password

    In this notebook uri, user, and pwd are kept in a separate file called
    'myconfig.py' and are imported from there.

    The object can be used as a context manager; the driver is closed when
    the ``with`` block is left.
    '''

    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None

        # Open the driver. A failure is reported but not raised, so the
        # object can still be constructed; query() then refuses to run.
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the driver; exceptions from the block propagate.
        self.close()

    def close(self):
        '''Close the underlying driver if one was created.'''
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        '''
        Run a Cypher query in a fresh session and return its records.

        query: Cypher statement to execute
        db: name of the database to run against; the driver default is used
            when None
        parameters: optional dict of Cypher parameters (referenced as $name
            in the statement); prefer this over formatting values into the
            query text

        Returns a list with the records of the result, or None when the
        session could not be created or the query failed (the error is
        printed).

        Raises AssertionError when the driver was never initialized.
        '''
        # Explicit raise instead of `assert`, so the check survives `python -O`.
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Materialize the result before the session is closed.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [4]:
def import_address_ids(datafile):
    """Read a comma-separated file and return all of its cells as one flat list.

    datafile: path of the CSV file to read (fields may be quoted with '"')

    Returns a list of strings: every cell of every row, in file order. No row
    is treated specially, so a header row, if present, is part of the result.
    """
    # newline="" is what the csv module expects: it leaves line endings
    # untouched so that newlines embedded in quoted fields are kept verbatim.
    with open(datafile, newline="") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        # Flatten the rows directly instead of materializing a list of lists.
        return [item for row in reader for item in row]

In [5]:
# change function to begin egonet search with people, not addresses
def construct_query_egonets(node_id):
    """Build the Cypher text that fetches the egonet around one address.

    The pattern starts at the Address node whose node_id equals *node_id*,
    walks over any relationship to an Officer, and from there over any
    relationship to a further node u. One row is returned per match, with
    the ids/names of the officer and of u, the address, the `link` property
    of both relationships, and the labels of u.

    node_id: id of the address node; it is inserted into the query text
        as-is (no quoting or escaping is applied).

    Returns the query as a string.
    """
    # Doubled braces are literal braces for str.format; only {node_id} is filled in.
    template = """
            MATCH (a:Address {{node_id: {node_id}}})-[r1]-(o:Officer)-[r2]-(u) 
            RETURN o.node_id as officer_node_id, o.name as officer, 
                r1.link as rel_location_officer, 
                a.node_id as local_address_id, a.address as local_address,
                r2.link as rel_officer_u,
                u.node_id as u_node_id, u.name as u_name, labels(u) as u_labels
            """
    return template.format(node_id=node_id)

In [10]:
def get_egonets(address_ids, cols):
    """Query the egonet of every address id and stack the rows in one DataFrame.

    address_ids: iterable of address node ids; one query is run per id
    cols: column names of the result table (used to seed an empty frame so
        the columns exist even when no query returns a row)

    Returns a pandas DataFrame with the rows of all queries. The index is
    reset at the end, so the per-query row position is kept as an extra
    'index' column.

    Ids whose query fails are skipped: Neo4jConnection.query prints the error
    and returns None in that case.
    """
    frames = [pd.DataFrame(columns=cols)]  # empty seed frame carrying the column names
    c = Neo4jConnection(uri, user, pwd)  # initialize Connection object
    try:
        for address_id in address_ids:
            q = construct_query_egonets(address_id)  # insert id into query string
            records = c.query(q, db='neo4j')  # run query in its own session
            if records is None:
                # The query failed; skip this id rather than crash on iterating None.
                print("Skipping address id:", address_id)
                continue
            frames.append(pd.DataFrame([dict(record) for record in records]))
    finally:
        # Release the driver even if a query or the conversion raised.
        c.close()
    # Concatenate once instead of growing the frame inside the loop.
    egonets = pd.concat(frames)
    egonets = egonets.reset_index()  # reset egonets index
    return egonets  # pandas df with all egonets found


In [7]:
# Load the address node ids that seed the egonet search.
l = import_address_ids("data/df_sus_DE.csv")

# Columns of the egonet table; same names as the aliases used in the query.
head = [
    'officer_node_id',
    'officer',
    'rel_location_officer',
    'local_address_id',
    'local_address',
    'rel_officer_u',
    'u_node_id',
    'u_name',
    'u_labels',
]
dd = get_egonets(l, head)

# Reduce the egonet rows to the distinct (id, name) pairs of the officers and
# of the nodes they connect to, and give both tables the same column names.
headers = ["node_id", "name"]
people = dd[['officer_node_id', 'officer']].drop_duplicates().reset_index(drop=True)
people.columns = headers
entities = dd[["u_node_id", "u_name"]].drop_duplicates().reset_index(drop=True)
entities.columns = headers

# Stack both tables, attach the public ICIJ page of every node, and export as csv.
fin = pd.concat([people, entities], ignore_index=True)
prefix = "https://offshoreleaks.icij.org/nodes/"
fin["url"] = fin["node_id"].apply(lambda node_id: prefix + str(node_id))
fin.to_csv("data/node_ids.csv", index=False)