# Inserting Corporate PSCs

A significant number of controlling entities are the corporations and companies that control other companies. Here we load the data and prepare it for insertion into the graph.

In [3]:
import pandas as pd
import json
from pandas.io.json import json_normalize
import numpy as np
from neo4j.v1 import GraphDatabase

In [4]:
original_psc_data = pd.read_json('../data/psc_snapshot-2017-09-08.json')
all_records_psc = pd.concat([original_psc_data['company_number'],json_normalize(original_psc_data['data'])],axis=1)
del original_psc_data

In [5]:
all_records_psc.head()

Unnamed: 0,company_number,address.address_line_1,address.address_line_2,address.care_of,address.country,address.locality,address.po_box,address.postal_code,address.premises,address.region,...,name_elements.middle_name,name_elements.surname,name_elements.title,nationality,natures_of_control,notified_on,persons_of_significant_control_count,restrictions_notice_withdrawal_reason,statement,statements_count
0,9145694,St. Andrews Road,,,England,Henley-On-Thames,,RG9 1HP,2,,...,Thanh,Wildman,Mrs,Vietnamese,[ownership-of-shares-50-to-75-percent],2016-04-06,,,,
1,8581893,High Street,Wendover,,England,Aylesbury,,HP22 6EA,14a,Buckinghamshire,...,Robert Charles,Davies,Mr,British,"[ownership-of-shares-25-to-50-percent, ownersh...",2016-06-30,,,,
2,8581893,Holywells Road,,,United Kingdom,Ipswich,,IP3 0DL,37-41,Suffolk,...,Fiona,Tarrant,,British,"[ownership-of-shares-25-to-50-percent, voting-...",2016-04-06,,,,
3,1605766,20-22 Wenlock Road,,,England,London,,N1 7GU,Suite Lp33221,,...,,,,,[ownership-of-shares-75-to-100-percent],2016-04-06,,,,
4,10259080,38 Church Road,Worcester Park,,,Surrey,,KT4 7RD,,,...,Peter,Ollett,Mr,British,[ownership-of-shares-50-to-75-percent],2016-06-30,,,,


In [6]:
all_records_psc.kind.value_counts()

individual-person-with-significant-control          4225140
persons-with-significant-control-statement           404603
corporate-entity-person-with-significant-control     344866
legal-person-person-with-significant-control           5490
super-secure-person-with-significant-control            186
exemptions                                               37
totals#persons-of-significant-control-snapshot            1
Name: kind, dtype: int64

## Filter the data to only handle corporate control

In [7]:
legal_psc = all_records_psc[all_records_psc.kind == "legal-person-person-with-significant-control"]
del all_records_psc

In [8]:
legal_psc.head()

Unnamed: 0,company_number,address.address_line_1,address.address_line_2,address.care_of,address.country,address.locality,address.po_box,address.postal_code,address.premises,address.region,...,name_elements.middle_name,name_elements.surname,name_elements.title,nationality,natures_of_control,notified_on,persons_of_significant_control_count,restrictions_notice_withdrawal_reason,statement,statements_count
1516,9591316,Parliament Street,,,England,London,,SW1A 2BQ,100,,...,,,,,"[right-to-appoint-and-remove-directors, right-...",2016-04-06,,,,
1527,6633035,Parliament Street,,,England,London,,SW1A 2BQ,100,,...,,,,,"[right-to-appoint-and-remove-directors, right-...",2016-04-06,,,,
2806,10259583,,,,United Kingdom,Douglas,,IM99 1TT,12-14 Finch Road,Isle Of Man,...,,,,,[ownership-of-shares-75-to-100-percent-as-firm...,2016-04-06,,,,
6392,8586297,Glenurquhart Road,,,Scotland,Inverness,,IV3 5NX,Council Offices,,...,,,,,[voting-rights-25-to-50-percent-limited-liabil...,2016-04-06,,,,
8093,10261145,Adamslie Crescent,Kirkintilloch,,Scotland,Glasgow,,G66 1BL,9,,...,,,,,"[right-to-appoint-and-remove-directors, right-...",2016-04-07,,,,


In [9]:
def convert_control_list(control_list):
    """Function to convert a list of controls into a dictionary with the controls as a key and a value of 1"""
    if isinstance(control_list, list):
        return {key: 1 for key in control_list}
    else:
        return {}

In [10]:
legal_psc['DICTIONARY_OF_CONTROLS'] = legal_psc['natures_of_control'].map(convert_control_list)

## It appears that the company_number does NOT correspond to the company_id for legal PSCs
** INSTEAD we must extract it from the links.self path
e.g. links.self /company/07804290/persons-with-significant-control/corporate-entity/jc9QvYXKVQMFGOTWsXH-XVt25eU **

In [11]:
for k, v in legal_psc[legal_psc['identification.registration_number'].isnull()].iloc[1].items():
    print(k, v)

company_number 06633035
address.address_line_1 Parliament Street
address.address_line_2 nan
address.care_of nan
address.country England
address.locality London
address.po_box nan
address.postal_code SW1A 2BQ
address.premises 100
address.region nan
ceased nan
ceased_on nan
country_of_residence nan
date_of_birth.month nan
date_of_birth.year nan
description nan
etag 1f211f706b45a227a3c14f114d5b5bb3a7dd0042
exemptions.psc_exempt_as_shares_admitted_on_market.exemption_type nan
exemptions.psc_exempt_as_shares_admitted_on_market.items nan
exemptions.psc_exempt_as_trading_on_regulated_market.exemption_type nan
exemptions.psc_exempt_as_trading_on_regulated_market.items nan
exemptions_count nan
generated_at nan
identification.country_registered nan
identification.legal_authority English
identification.legal_form Executive Chair And Permanent Secretary
identification.place_registered nan
identification.registration_number nan
kind legal-person-person-with-significant-control
linked_psc_name nan
l

In [12]:
country_code_map = pd.read_pickle('./data/clean_country_code_map.pkl')
combined_map = pd.read_pickle('./data/combined_country_map.pkl')
nationality_map = pd.read_pickle('./data/nation_map.pkl')

### Defining specific functions to handle human PSCs

These functions will convert the raw data for human PSCs into a format that can then be inderted into Neo4j in an easy fashion.

In [14]:
def legal_psc_name(record):
    "Function to create name string identifier"
    data = record['name']
    try:
        return data.upper()
    except TypeError as e:
        return 'NONAME__BLANK'
    except AttributeError:
        return 'NONAME__BLANK'
    
def legal_psc_uid(record):
    "Function to create a unique ID for a legal entity"
    data = record['identification.registration_number']
    uid = str(data).replace('nan', 'MISSING')
    if uid == 'MISSING':
        uid = '-'.join(legal_psc_name(record).split()) 
        
    return {'uid': uid,
            'legal_authority': str(record['identification.legal_authority']).replace('nan', 'MISSING'),
            'legal_form': str(record['identification.legal_form']).replace('nan', 'MISSING')}


def legal_psc_address(record):
    "Function to create a address details for legal entity"
    try:
        new_address = {
            'premises': str(record['address.premises']),
            'address_Line1': str(record['address.address_line_1']),
            'address_Line2': str(record['address.address_line_2']),
            'address_PostTown': str(record['address.locality']).upper(),
            'address_POBox': str(record['address.po_box']),
            'address_County': str(record['address.region']).upper(),
            'address_PostCode': str(record['address.postal_code']).upper(),
            'address_Country': str(record['address.country']).upper(),
            'address_CareOf': str(record['address.care_of'])
        }
        return {k: v.replace('nan', '').replace('NAN', '') for k,v in new_address.items()}
    except TypeError as e:
        return {'address_Country': "UNKNOWN"}
    except AttributeError:
        return {'address_Country': "UNKNOWN"}
    
    
def legal_psc_details(record):
    "Function to create a relationship details for person"
    details = {}
    details['company_id'] = record['links.self'].split('/')[2]
    details['control_kind'] = record['kind']
    return details
    
    
def new_legal_record(record):
    "Function to create a new record that is flat"
    uid = legal_psc_uid(record)
    address = legal_psc_address(record)
    return {**uid, **address, 
            **legal_psc_details(record), 
            'name': legal_psc_name(record), 
            'natures_of_control': record['DICTIONARY_OF_CONTROLS']}

Time to test some of these functions

In [15]:
legal_psc.iloc[0:5].apply(lambda s: pd.Series(new_legal_record(s)), axis=1)

Unnamed: 0,address_CareOf,address_Country,address_County,address_Line1,address_Line2,address_POBox,address_PostCode,address_PostTown,company_id,control_kind,legal_authority,legal_form,name,natures_of_control,premises,uid
1516,,ENGLAND,,Parliament Street,,,SW1A 2BQ,LONDON,09679225,legal-person-person-with-significant-control,English,Chief Executive And Permanent Secretary,JON THOMPSON,"{'right-to-appoint-and-remove-directors': 1, '...",100,JON-THOMPSON
1527,,ENGLAND,,Parliament Street,,,SW1A 2BQ,LONDON,09679225,legal-person-person-with-significant-control,English,Executive Chair And Permanent Secretary,EDWARD TROUP,"{'right-to-appoint-and-remove-directors': 1, '...",100,EDWARD-TROUP
2806,,UNITED KINGDOM,ISLE OF MAN,,,,IM99 1TT,DOUGLAS,03443850,legal-person-person-with-significant-control,Isle Of Man Companies Act 1931,Limited Company,FABSON IMPORT EXPORT LTD,{'ownership-of-shares-75-to-100-percent-as-fir...,12-14 Finch Road,FABSON-IMPORT-EXPORT-LTD
6392,,SCOTLAND,,Glenurquhart Road,,,IV3 5NX,INVERNESS,SO304104,legal-person-person-with-significant-control,Local Government Etc (Scotland) Act,Local Authority,THE HIGHLAND COUNCIL,{'voting-rights-25-to-50-percent-limited-liabi...,Council Offices,THE-HIGHLAND-COUNCIL
8093,,SCOTLAND,,Adamslie Crescent,Kirkintilloch,,G66 1BL,GLASGOW,SC209237,legal-person-person-with-significant-control,Uk,Director And Chairperson,DAVID P BOOT,"{'right-to-appoint-and-remove-directors': 1, '...",9,DAVID-P-BOOT


In [16]:
neo_records_df = legal_psc.head(10).apply(lambda s: pd.Series(new_legal_record(s)), axis=1)
neo_records_df['Registered_in'] = neo_records_df.address_Country.apply(lambda x: combined_map.get(x.upper(), ''))

neo_records_df.tail(5)

Unnamed: 0,address_CareOf,address_Country,address_County,address_Line1,address_Line2,address_POBox,address_PostCode,address_PostTown,company_id,control_kind,legal_authority,legal_form,name,natures_of_control,premises,uid,Registered_in
8763,,UNITED KINGDOM,,Grove Road,Mollington,,CH1 6LG,CHESTER,08281151,legal-person-person-with-significant-control,England,Legal Person,MATTHEW JAMES LLOYD SHERLOCK,{'ownership-of-shares-75-to-100-percent': 1},Holbeck,MATTHEW-JAMES-LLOYD-SHERLOCK,GB
9540,,ENGLAND,,Commercial Road,,,E1 2PY,LONDON,07127998,legal-person-person-with-significant-control,Uk,Director,IRAH MIAH,{'ownership-of-shares-75-to-100-percent': 1},326,IRAH-MIAH,GB
10161,,ENGLAND,,High Street,,,TW11 8EE,TEDDINGTON,05949295,legal-person-person-with-significant-control,Uk Company Law,Limited Company,KNOWLEDGE & MERCHANDINGING INC. LIMITED,"{'ownership-of-shares-50-to-75-percent': 1, 'v...",Harlequin House,KNOWLEDGE-&-MERCHANDINGING-INC.-LIMITED,GB
10233,,SCOTLAND,,,,,EH3 8EJ,EDINBUURGH,04926894,legal-person-person-with-significant-control,England And Wales,Company Limited By Guarantee,WALK THE WALK WORLDWIDE,{'ownership-of-shares-75-to-100-percent-as-fir...,5 Atholl Crescent,WALK-THE-WALK-WORLDWIDE,GB
10429,,SINGAPORE,,The Treasury,100 High Street,,179434,#06-03,SC074783,legal-person-person-with-significant-control,Singapore,Corporation Sole,MINISTER FOR FINANCE,{'significant-influence-or-control': 1},Ministry Of Fice,MINISTER-FOR-FINANCE,SG


## Now to insert the Legal PSCs

Now we can loop over the larger set of data and insert all of the active PSCs

In [17]:
def write_legal_psc_to_neo(input_data, driver):
    """Function writes legal records to Neo4j database
    From a list of dictionaries construct the appropriate nodes and relationships to be inserted into Neo4j
    :param input_data - a list of dictionaries that have all the required information for nodes and relationships
    :param driver - an active driver object to connect to a neo4j instance
    :return """
    
    with driver.session() as session:
        session.run(("UNWIND {list} AS d "
                     "MERGE (c:Other_entity {uid: d.uid}) "
                     "ON CREATE SET c.name=d.name, "
                     "c.address_premises=d.premises, "
                     "c.address_Line1=d.address_Line1, "
                     "c.address_Line2=d.address_Line2, "
                     "c.address_PostTown=d.address_PostTown, "
                     "c.address_POBox=d.address_POBox, "
                     "c.address_County=d.address_County, "
                     "c.address_PostCode=d.address_Postcode, "                 
                     "c.address_Country=d.address_Country, "
                     "c.legal_authority=d.legal_authority, "
                     "c.legal_form=d.legal_form;"), {"list": input_data})

    with driver.session() as session:
        session.run(("UNWIND {list} AS d "
                     "MATCH (c:Other_entity {uid: d.uid}) "
                     "MERGE (country:Country {code: d.Registered_in}) "
                     "MERGE (c)-[:REGISTERED_IN]->(country);"), {"list": input_data})

    with driver.session() as session:
        session.run(("UNWIND {list} AS d "
                     "MATCH (p:Other_entity {uid: d.uid}) "
                     "MERGE (ce:ControllingEntity {type: d.control_kind}) "
                     "MERGE (p)-[:HAS_CONTROL_KIND]->(ce);"), {"list": input_data})    

    with driver.session() as session:
        session.run(("UNWIND {list} AS d "
                     "MATCH (p:Other_entity {uid: d.uid}) "
                     "MERGE (c:Company {uid: d.company_id}) "
                     "MERGE (p)-[r:CONTROLS]->(c) ON CREATE SET r=d.natures_of_control;"), {"list": input_data})
        
    with driver.session() as session:
        session.run(("UNWIND {list} AS d "
                     "MATCH (c:Other_entity {uid: d.uid}) "
                     "MERGE (pc:Postcode {uid: d.address_PostCode}) "
                     "MERGE (c)-[:REGISTERED_IN]->(pc);"), {"list": input_data})

Using the odo library within blaze we can loop over our input data in chunks

In [18]:
driver = GraphDatabase.driver("bolt://10.0.0.1:7687", auth=("myusername", "mypassword"))

In [19]:
import blaze as bz

proc_records = 0
total_records = legal_psc.shape[0]
for chunk in bz.odo(legal_psc, target=bz.chunks(pd.DataFrame), chunksize=5000):
    neo_records_df = chunk.apply(lambda s: pd.Series(new_legal_record(s)), axis=1)
    neo_records_df['Registered_in'] = neo_records_df.address_Country.apply(lambda x: combined_map.get(str(x).upper(), 'UNKNOWN'))

    input_data = [v for k,v in neo_records_df.T.to_dict().items()]
    del neo_records_df
    
    write_legal_psc_to_neo(input_data, driver)
    proc_records += len(input_data)
    print("Processed {} of {} .... {:5.2f}% complete".format(proc_records, total_records, 100*proc_records/total_records))

You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))


Processed 5000 of 5490 .... 91.07% complete
Processed 5490 of 5490 .... 100.00% complete
