# Inserting Corporate PSCs

A significant number of controlling entities are themselves corporations and companies that control other companies. Here we load the data and prepare it for insertion into the graph.

In [1]:
import json
import os

import numpy as np
import pandas as pd
from neo4j.v1 import GraphDatabase
from pandas.io.json import json_normalize

In [2]:
# Read the raw PSC snapshot, then flatten each record's nested `data` payload
# into columns placed next to that record's company number.
original_psc_data = pd.read_json('../data/psc_snapshot-2017-09-08.json')
company_numbers = original_psc_data['company_number']
flattened_payload = json_normalize(original_psc_data['data'])
all_records_psc = pd.concat([company_numbers, flattened_payload], axis=1)
# Only the flattened frame is needed from here on; drop everything else.
del original_psc_data, company_numbers, flattened_payload

In [3]:
# Preview the first rows of the flattened frame (company number plus the normalised `data` columns).
all_records_psc.head()

Unnamed: 0,company_number,address.address_line_1,address.address_line_2,address.care_of,address.country,address.locality,address.po_box,address.postal_code,address.premises,address.region,...,name_elements.middle_name,name_elements.surname,name_elements.title,nationality,natures_of_control,notified_on,persons_of_significant_control_count,restrictions_notice_withdrawal_reason,statement,statements_count
0,9145694,St. Andrews Road,,,England,Henley-On-Thames,,RG9 1HP,2,,...,Thanh,Wildman,Mrs,Vietnamese,[ownership-of-shares-50-to-75-percent],2016-04-06,,,,
1,8581893,High Street,Wendover,,England,Aylesbury,,HP22 6EA,14a,Buckinghamshire,...,Robert Charles,Davies,Mr,British,"[ownership-of-shares-25-to-50-percent, ownersh...",2016-06-30,,,,
2,8581893,Holywells Road,,,United Kingdom,Ipswich,,IP3 0DL,37-41,Suffolk,...,Fiona,Tarrant,,British,"[ownership-of-shares-25-to-50-percent, voting-...",2016-04-06,,,,
3,1605766,20-22 Wenlock Road,,,England,London,,N1 7GU,Suite Lp33221,,...,,,,,[ownership-of-shares-75-to-100-percent],2016-04-06,,,,
4,10259080,38 Church Road,Worcester Park,,,Surrey,,KT4 7RD,,,...,Peter,Ollett,Mr,British,[ownership-of-shares-50-to-75-percent],2016-06-30,,,,


In [4]:
# Count how many records there are of each `kind`.
all_records_psc.kind.value_counts()

individual-person-with-significant-control          4225140
persons-with-significant-control-statement           404603
corporate-entity-person-with-significant-control     344866
legal-person-person-with-significant-control           5490
super-secure-person-with-significant-control            186
exemptions                                               37
totals#persons-of-significant-control-snapshot            1
Name: kind, dtype: int64

## Filter the data to only handle corporate control

In [5]:
# Keep only the records whose kind is the corporate-entity PSC type.
# `.copy()` makes `corporate_psc` an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
is_corporate = all_records_psc.kind == "corporate-entity-person-with-significant-control"
corporate_psc = all_records_psc[is_corporate].copy()
# The full frame and the mask are no longer needed; release them to free kernel memory.
del all_records_psc, is_corporate

In [6]:
# Preview the first rows of the corporate-only subset.
corporate_psc.head()

Unnamed: 0,company_number,address.address_line_1,address.address_line_2,address.care_of,address.country,address.locality,address.po_box,address.postal_code,address.premises,address.region,...,name_elements.middle_name,name_elements.surname,name_elements.title,nationality,natures_of_control,notified_on,persons_of_significant_control_count,restrictions_notice_withdrawal_reason,statement,statements_count
3,01605766,20-22 Wenlock Road,,,England,London,,N1 7GU,Suite Lp33221,,...,,,,,[ownership-of-shares-75-to-100-percent],2016-04-06,,,,
5,10259079,Hilgrove Street,,,Jersey,St. Helier,,JE1 1ES,Queensway House,,...,,,,,"[ownership-of-shares-25-to-50-percent, voting-...",2016-04-06,,,,
11,01943460,,,,United Kingdom,Eastleigh,,SO50 6YU,Chickenhall Lane,Hampshire,...,,,,,[ownership-of-shares-25-to-50-percent],2016-04-06,,,,
13,05495577,12-18 Queens Road,,,England,Weybridge,,KT13 9XB,Clive House,Surrey,...,,,,,[ownership-of-shares-50-to-75-percent],2016-04-06,,,,
28,NI065219,Parkway,Porters Wood,,England,St. Albans,,AL3 6PA,5b,Hertfordshire,...,,,,,[ownership-of-shares-75-to-100-percent],2016-04-06,,,,


In [7]:
def convert_control_list(control_list):
    """Turn a list of control names into a dictionary mapping each name to 1.

    Any input that is not a ``list`` (for example a missing value) gives an empty dictionary.
    """
    if not isinstance(control_list, list):
        return {}
    return dict.fromkeys(control_list, 1)

In [8]:
# Store, for every record, the dictionary form of its `natures_of_control` list.
corporate_psc['DICTIONARY_OF_CONTROLS'] = corporate_psc['natures_of_control'].map(convert_control_list)

## It appears that the company_number does NOT correspond to the company_id for corporate PSCs
**Instead, we must extract the company id from the `links.self` path,
e.g. `/company/07804290/persons-with-significant-control/corporate-entity/jc9QvYXKVQMFGOTWsXH-XVt25eU`.**

In [23]:
# Print every field of the second record that has no registration number.
missing_registration = corporate_psc['identification.registration_number'].isnull()
for field, value in corporate_psc[missing_registration].iloc[1].items():
    print(field, value)

company_number 05868878
address.address_line_1 Navigo House
address.address_line_2 nan
address.care_of nan
address.country England
address.locality Grimsby
address.po_box nan
address.postal_code DN32 0QE
address.premises 3-7 Brighowgate
address.region nan
ceased nan
ceased_on nan
country_of_residence nan
date_of_birth.month nan
date_of_birth.year nan
description nan
etag 4886a53b47aebc6b9c8344009124d41f1523ac0b
exemptions.psc_exempt_as_shares_admitted_on_market.exemption_type nan
exemptions.psc_exempt_as_shares_admitted_on_market.items nan
exemptions.psc_exempt_as_trading_on_regulated_market.exemption_type nan
exemptions.psc_exempt_as_trading_on_regulated_market.items nan
exemptions_count nan
generated_at nan
identification.country_registered nan
identification.legal_authority Companies Act 2006
identification.legal_form Community Interest Company
identification.place_registered nan
identification.registration_number nan
kind corporate-entity-person-with-significant-control
linked_psc_

In [25]:
# Look up the company id taken from the `links.self` example in the `company_number` column.
corporate_psc[corporate_psc.company_number == '07804290']

Unnamed: 0,company_number,address.address_line_1,address.address_line_2,address.care_of,address.country,address.locality,address.po_box,address.postal_code,address.premises,address.region,...,name_elements.surname,name_elements.title,nationality,natures_of_control,notified_on,persons_of_significant_control_count,restrictions_notice_withdrawal_reason,statement,statements_count,DICTIONARY_OF_CONTROLS


In [38]:
# Load three pickled mapping objects; `combined_map` is queried with `.get(...)` in later cells.
# NOTE(review): these paths start with './data/' while the snapshot is read from '../data/' — confirm which directory is intended.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
country_code_map = pd.read_pickle('./data/clean_country_code_map.pkl')
combined_map = pd.read_pickle('./data/combined_country_map.pkl')
nationality_map = pd.read_pickle('./data/nation_map.pkl')

### Defining specific functions to handle corporate PSCs

These functions will convert the raw data for corporate PSCs into a format that can then be inserted into Neo4j in an easy fashion.

In [32]:
def corporate_psc_name(record):
    """Return the record's name in upper case, or 'NONAME__BLANK' when it cannot be upper-cased."""
    raw_name = record['name']
    try:
        upper_name = raw_name.upper()
    except (TypeError, AttributeError):
        # Non-string names (e.g. a missing value) have no usable `.upper()`.
        return 'NONAME__BLANK'
    return upper_name
    
def corporate_psc_uid(record):
    """Build the identifying fields for a corporate PSC.

    The uid is the registration number, or the dash-joined upper-case name when
    the registration number is missing. The registered country falls back to the
    address country and finally to 'UNKNOWN'.

    :param record: mapping (e.g. a DataFrame row) holding the ``identification.*``,
        ``address.country`` and ``name`` fields
    :return: dict with the keys ``uid``, ``registered_country``, ``legal_authority``
        and ``place_registered``
    """
    def text_or_missing(value):
        # Compare the whole value: a substring replace would corrupt text that
        # merely contains "nan" (e.g. "Companies Ordinance").
        text = str(value)
        return 'MISSING' if text == 'nan' else text

    uid = str(record['identification.registration_number'])
    if uid == 'nan':
        uid = '-'.join(corporate_psc_name(record).split())

    country = str(record['identification.country_registered'])
    if country == 'nan':
        fallback = record['address.country']
        # The fallback is stringified before testing so that a float NaN is
        # recognised as missing instead of being returned as-is.
        if fallback is None or isinstance(fallback, tuple) or str(fallback) == 'nan':
            country = 'UNKNOWN'
        else:
            country = str(fallback)

    return {'uid': uid,
            'registered_country': country,
            'legal_authority': text_or_missing(record['identification.legal_authority']),
            'place_registered': text_or_missing(record['identification.place_registered'])}


def corporate_psc_address(record):
    """Build the flat address fields for a corporate PSC.

    Every field is converted to text and missing values become empty strings.
    Post town, county, postcode and country are upper-cased.

    :param record: mapping (e.g. a DataFrame row) holding the ``address.*`` fields
    :return: dict of address fields, or ``{'address_Country': 'UNKNOWN'}`` when a
        field cannot be converted
    """
    def clean(value, upper=False):
        text = str(value)
        # Only blank a value that is entirely "nan": a substring replace would
        # mangle real text such as "Nantwich" or "Buchanan Street".
        if text.lower() == 'nan':
            return ''
        return text.upper() if upper else text

    try:
        return {
            'premises': clean(record['address.premises']),
            'address_Line1': clean(record['address.address_line_1']),
            'address_Line2': clean(record['address.address_line_2']),
            'address_PostTown': clean(record['address.locality'], upper=True),
            'address_POBox': clean(record['address.po_box']),
            'address_County': clean(record['address.region'], upper=True),
            'address_PostCode': clean(record['address.postal_code'], upper=True),
            'address_Country': clean(record['address.country'], upper=True),
            'address_CareOf': clean(record['address.care_of'])
        }
    except (TypeError, AttributeError):
        return {'address_Country': "UNKNOWN"}
    
    
def corporate_psc_details(record):
    """Return the controlled company's id and the control kind for a record.

    The company id is the third '/'-separated element of ``links.self``.
    """
    path_segments = record['links.self'].split('/')
    return {'company_id': path_segments[2], 'control_kind': record['kind']}
    
    
def new_corporate_record(record):
    """Combine the uid, address and relationship details of a record into one flat dictionary."""
    flat_record = {}
    flat_record.update(corporate_psc_uid(record))
    flat_record.update(corporate_psc_address(record))
    flat_record.update(corporate_psc_details(record))
    flat_record['name'] = corporate_psc_name(record)
    flat_record['natures_of_control'] = record['DICTIONARY_OF_CONTROLS']
    return flat_record

Time to test some of these functions

In [35]:
# Flatten the first five rows to check the shape of the records the helper functions produce.
corporate_psc.iloc[0:5].apply(lambda s: pd.Series(new_corporate_record(s)), axis=1)

Unnamed: 0,address_CareOf,address_Country,address_County,address_Line1,address_Line2,address_POBox,address_PostCode,address_PostTown,company_id,control_kind,legal_authority,name,natures_of_control,place_registered,premises,registered_country,uid
3,,ENGLAND,,20-22 Wenlock Road,,,N1 7GU,LONDON,7296272,corporate-entity-person-with-significant-control,Companies Act 2006,ANDERSON MANN GROUP LTD,{'ownership-of-shares-75-to-100-percent': 1},Companies House Uk,Suite Lp33221,England,7500933
5,,JERSEY,,Hilgrove Street,,,JE1 1ES,ST. HELIER,9804615,corporate-entity-person-with-significant-control,Companies (Jersey) Law 1991,GLENCORE PLC,"{'ownership-of-shares-25-to-50-percent': 1, 'v...",Register Of Jersey Companies,Queensway House,Jersey,107710
11,,UNITED KINGDOM,HAMPSHIRE,,,,SO50 6YU,EASTLEIGH,1493788,corporate-entity-person-with-significant-control,United Kingdom (England),PRYSMIAN CABLES & SYSTEMS LIMITED,{'ownership-of-shares-25-to-50-percent': 1},Companies House,Chickenhall Lane,England,958507
13,,ENGLAND,SURREY,12-18 Queens Road,,,KT13 9XB,WEYBRIDGE,1493788,corporate-entity-person-with-significant-control,United Kingdom (England),TT ELECTRONICS PLC,{'ownership-of-shares-50-to-75-percent': 1},Companies House,Clive House,England,87249
28,,ENGLAND,HERTFORDSHIRE,Parkway,Porters Wood,,AL3 6PA,ST. ALBANS,1605823,corporate-entity-person-with-significant-control,Companies Act,WOODLAND SOFTWARE SOLUTIONS LIMITED,{'ownership-of-shares-75-to-100-percent': 1},England & Wales,5b,England,3647273


In [39]:
# Flatten the first ten rows and attach the `combined_map` value for each registered country.
neo_records_df = corporate_psc.head(10).apply(lambda s: pd.Series(new_corporate_record(s)), axis=1)
# str() keeps the lookup from failing on non-string values, and 'UNKNOWN' is the
# same default the bulk-insert loop uses for countries missing from the map.
neo_records_df['Registered_in'] = neo_records_df.registered_country.apply(lambda x: combined_map.get(str(x).upper(), 'UNKNOWN'))

neo_records_df.tail(5)

Unnamed: 0,address_CareOf,address_Country,address_County,address_Line1,address_Line2,address_POBox,address_PostCode,address_PostTown,company_id,control_kind,legal_authority,name,natures_of_control,place_registered,premises,registered_country,uid,Registered_in
37,,ENGLAND,,The Airport,,,CB5 8RY,CAMBRIDGE,287379,corporate-entity-person-with-significant-control,England,S.G. SMITH AUTOMOTIVE LIMITED,"{'ownership-of-shares-75-to-100-percent': 1, '...",Registrar Of Companies (England & Wales),Airport House,England,622112,GB
47,,ENGLAND,,Olympus Avenue,,,CV34 6BF,LEAMINGTON SPA,7790030,corporate-entity-person-with-significant-control,Companies Act 2006,1846 NOMINEES LIMITED,"{'ownership-of-shares-75-to-100-percent': 1, '...",Companies House,Olympus House,England And Wales,5953366,GB
88,,ENGLAND,,Gracechurch Street,,,EC3V 0BT,LONDON,5955659,corporate-entity-person-with-significant-control,Companies Act 2006,UPP INVESTMENTS LIMITED,"{'ownership-of-shares-75-to-100-percent': 1, '...",Registrar Of Companies England And Wales,40,England,5957759,GB
100,,ENGLAND,,Victoria Embankment,,,EC4Y 0DZ,LONDON,4240399,corporate-entity-person-with-significant-control,United Kingdom Law,OCTOPUS PUBLISHING GROUP LIMITED,{'ownership-of-shares-75-to-100-percent': 1},Companies House,50,England And Wales,3597451,GB
121,,UNITED KINGDOM,HAMPSHIRE,Hussar Court,,,PO7 7SQ,WATERLOOVILLE,8228404,corporate-entity-person-with-significant-control,Companies House Act 2006,R & S YOUNG MANAGEMENT SERVICES LIMITED,"{'ownership-of-shares-75-to-100-percent': 1, '...",The Register Of Companies,24 Picton House,England And Wales,8227226,GB


## Now to insert the Corporate PSCs

Now we can loop over the larger set of data and insert all of the active PSCs

In [69]:
def write_corporate_psc_to_neo(input_data, driver):
    """Write corporate PSC records to a Neo4j database.

    Each Cypher statement is run in its own session, in order, with the whole
    batch passed as the ``list`` parameter.

    :param input_data: a list of dictionaries that have all the required information for nodes and relationships
    :param driver: an active driver object to connect to a neo4j instance
    :return: None
    """
    statements = (
        # Controlling company node; its properties are only set when the node is first created.
        # The postcode is read from `d.address_PostCode`, matching the key produced for each
        # record (property names are case-sensitive).
        ("UNWIND {list} AS d "
         "MERGE (c:Company {uid: d.uid}) "
         "ON CREATE SET c.name=d.name, "
         "c.address_premises=d.premises, "
         "c.address_Line1=d.address_Line1, "
         "c.address_Line2=d.address_Line2, "
         "c.address_PostTown=d.address_PostTown, "
         "c.address_POBox=d.address_POBox, "
         "c.address_County=d.address_County, "
         "c.address_PostCode=d.address_PostCode, "
         "c.address_Country=d.address_Country, "
         "c.legal_authority=d.legal_authority, "
         "c.place_registered=d.place_registered, "
         "c.registered_country=d.registered_country;"),
        # Link the controlling company to its country of registration.
        ("UNWIND {list} AS d "
         "MATCH (c:Company {uid: d.uid}) "
         "MERGE (country:Country {code: d.Registered_in}) "
         "MERGE (c)-[:REGISTERED_IN]->(country);"),
        # Link the controlling company to the node for its control kind.
        ("UNWIND {list} AS d "
         "MATCH (p:Company {uid: d.uid}) "
         "MERGE (ce:ControllingEntity {type: d.control_kind}) "
         "MERGE (p)-[:HAS_CONTROL_KIND]->(ce);"),
        # Link the controlling company to the controlled company; the natures of
        # control become the relationship properties on creation.
        ("UNWIND {list} AS d "
         "MATCH (p:Company {uid: d.uid}) "
         "MERGE (c:Company {uid: d.company_id}) "
         "MERGE (p)-[r:CONTROLS]->(c) ON CREATE SET r=d.natures_of_control;"),
        # Link the controlling company to its postcode.
        # NOTE(review): this reuses the REGISTERED_IN type also used for Country — confirm that is intended.
        ("UNWIND {list} AS d "
         "MATCH (c:Company {uid: d.uid}) "
         "MERGE (pc:Postcode {uid: d.address_PostCode}) "
         "MERGE (c)-[:REGISTERED_IN]->(pc);"),
    )

    parameters = {"list": input_data}
    for statement in statements:
        with driver.session() as session:
            session.run(statement, parameters)

Using the odo library within blaze we can loop over our input data in chunks

In [42]:
# Connection settings are read from the environment so real credentials never
# have to be written into the notebook; the fallbacks are the original placeholder values.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://10.0.0.1:7687"),
    auth=(os.environ.get("NEO4J_USER", "myusername"),
          os.environ.get("NEO4J_PASSWORD", "mypassword")),
)

In [70]:
import blaze as bz

CHUNK_SIZE = 50000  # number of rows flattened and written to Neo4j per batch

proc_records = 0
total_records = corporate_psc.shape[0]
for chunk in bz.odo(corporate_psc, target=bz.chunks(pd.DataFrame), chunksize=CHUNK_SIZE):
    neo_records_df = chunk.apply(lambda s: pd.Series(new_corporate_record(s)), axis=1)
    neo_records_df['Registered_in'] = neo_records_df.registered_country.apply(lambda x: combined_map.get(str(x).upper(), 'UNKNOWN'))

    # One dictionary per row. This avoids transposing the whole chunk and does not
    # collapse rows that happen to share an index label.
    input_data = neo_records_df.to_dict(orient='records')
    del neo_records_df

    write_corporate_psc_to_neo(input_data, driver)
    proc_records += len(input_data)
    print("Processed {} of {} .... {:5.2f}% complete".format(proc_records, total_records, 100*proc_records/total_records))

Processed 50000 of 344866 .... 14.50% complete
Processed 100000 of 344866 .... 29.00% complete
Processed 150000 of 344866 .... 43.50% complete
Processed 200000 of 344866 .... 57.99% complete
Processed 250000 of 344866 .... 72.49% complete
Processed 300000 of 344866 .... 86.99% complete
Processed 344866 of 344866 .... 100.00% complete
