# Loading the Officers

We have a data file of officers who have a level of control over the company. We need to map these into the system to connect them to the PSC people we are aware of.

In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize

import blaze as bz

You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))


First load some sample data and do some tidying up so we don't waste RAM.

In [2]:
master_path = './data/'

In [3]:
sample_df = pd.read_csv(master_path+'officers.csv', nrows=10000)

In [4]:
sample_df.columns

Index(['id', 'company_number', 'jurisdiction_code', 'name', 'title',
       'first_name', 'last_name', 'position', 'start_date', 'person_number',
       'person_uid', 'end_date', 'current_status', 'occupation', 'nationality',
       'country_of_residence', 'partial_date_of_birth', 'type',
       'address.in_full', 'address.street_address', 'address.locality',
       'address.region', 'address.postal_code', 'address.country',
       'retrieved_at', 'source_url'],
      dtype='object')

In [5]:
sample_df[sample_df.end_date.isnull()].iloc[0]

id                                                                201499282
company_number                                                     05957738
jurisdiction_code                                                        gb
name                                                      DAVID IAN BROMLEY
title                                                                    MR
first_name                                                        DAVID IAN
last_name                                                           BROMLEY
position                                                           director
start_date                                                       2010-10-01
person_number                                                  115947850001
person_uid                                                         11594785
end_date                                                                NaN
current_status                                                          NaN
occupation  

In [6]:
sample_df[sample_df.type == 'Person'].head()

Unnamed: 0,id,company_number,jurisdiction_code,name,title,first_name,last_name,position,start_date,person_number,...,partial_date_of_birth,type,address.in_full,address.street_address,address.locality,address.region,address.postal_code,address.country,retrieved_at,source_url
1,201499282,5957738,gb,DAVID IAN BROMLEY,MR,DAVID IAN,BROMLEY,director,2010-10-01,115947850001,...,1970-05,Person,"40 LEA MANOR DRIVE\nPENN, WOLVERHAMPTON, WEST ...",40 LEA MANOR DRIVE\nPENN,WOLVERHAMPTON,WEST MIDLANDS,WV4 5PJ,UNITED KINGDOM,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/059...
2,201499298,5957738,gb,DAVID IAN BROMLEY,MR,DAVID IAN,BROMLEY,director,2006-10-05,115947850001,...,1970-05,Person,"40 LEA MANOR DRIVE, WOLVERHAMPTON, WV4 5PJ",40 LEA MANOR DRIVE,WOLVERHAMPTON,,WV4 5PJ,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/059...
3,201499319,5957738,gb,SALLY-ANN MICHELLE BROMLEY,MRS,SALLY-ANN MICHELLE,BROMLEY,director,2012-12-21,121484190003,...,1966-08,Person,"PENYCASTELL FARM BRYN, PORT TALBOT, WEST GLAMO...",PENYCASTELL FARM BRYN,PORT TALBOT,WEST GLAMORGAN,SA13 2PY,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/059...
4,201499336,5957738,gb,ELIZABETH DAVIES,MS,ELIZABETH,DAVIES,director,2006-10-05,115947860002,...,1962-01,Person,"12 PONTYMASON RISE\nROGERSTONE, NEWPORT, GWENT...",12 PONTYMASON RISE\nROGERSTONE,NEWPORT,GWENT,NP10 9GJ,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/059...
9,190421889,4066366,gb,RAZAHUSSEIN LALJI MAMDANI,MR,RAZAHUSSEIN LALJI,MAMDANI,director,2011-08-15,71984350001,...,1947-07,Person,"PREMIER SUITE, 4 CHURCHILL COURT\n58 STATION R...","PREMIER SUITE, 4 CHURCHILL COURT\n58 STATION ROAD",NORTH HARROW,MIDDLESEX,HA2 7ST,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/040...


In [7]:
sample_df[sample_df.title.isnull()]

Unnamed: 0,id,company_number,jurisdiction_code,name,title,first_name,last_name,position,start_date,person_number,...,partial_date_of_birth,type,address.in_full,address.street_address,address.locality,address.region,address.postal_code,address.country,retrieved_at,source_url
0,201499267,05957738,gb,PHOENIX AUDIT LIMITED,,,PHOENIX AUDIT LIMITED,secretary,2006-10-05,99992830001,...,,Company,"35 OXFORD STREET\nPONTYCYMER, BRIDGEND, CF32 8DD",35 OXFORD STREET\nPONTYCYMER,BRIDGEND,,CF32 8DD,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/059...
5,190421841,04066366,gb,FATBRAIN.COM LIMITED,,,FATBRAIN.COM LIMITED,secretary,2000-09-06,83435930001,...,,Company,PREMIER SUITE 4 CHURCHILL COURT\n58 STATION RO...,PREMIER SUITE 4 CHURCHILL COURT\n58 STATION ROAD,NORTH HARROW,MIDDLESEX,HA2 7ST,UNITED KINGDOM,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/040...
6,190421851,04066366,gb,FIRST SECRETARIES LIMITED,,,FIRST SECRETARIES LIMITED,secretary,2000-09-06,900000780001,...,,Company,"72 NEW BOND STREET, LONDON, W1S 1RR",72 NEW BOND STREET,LONDON,,W1S 1RR,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/040...
7,190421865,04066366,gb,A COMPANY LIMITED,,,+A COMPANY LIMITED,director,2000-09-06,72279610010,...,,Company,PREMIER SUITE 4 CHURCHILL COURT\n58 STATION RO...,PREMIER SUITE 4 CHURCHILL COURT\n58 STATION ROAD,NORTH HARROW,MIDDLESEX,HA2 7ST,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/040...
8,190421877,04066366,gb,FIRST DIRECTORS LIMITED,,,FIRST DIRECTORS LIMITED,director,2000-09-06,900000770001,...,1989-08,Company,"72 NEW BOND STREET, LONDON, W1S 1RR",72 NEW BOND STREET,LONDON,,W1S 1RR,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/040...
11,207362364,05438136,gb,INCORPORATE SECRETARIAT LIMITED,,,INCORPORATE SECRETARIAT LIMITED,secretary,2005-04-27,900028930001,...,,Company,"4TH FLOOR 3 TENTERDEN STREET\nHANOVER SQUARE, ...",4TH FLOOR 3 TENTERDEN STREET\nHANOVER SQUARE,LONDON,,W1S 1TD,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/054...
14,207362418,05438136,gb,INCORPORATE DIRECTORS LIMITED,,,INCORPORATE DIRECTORS LIMITED,director,2005-04-27,900030870001,...,,Company,"4TH FLOOR, 3 TENTERDEN STREET\nHANOVER SQUARE,...","4TH FLOOR, 3 TENTERDEN STREET\nHANOVER SQUARE",LONDON,,W1S 1TD,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/054...
16,212181681,SC322354,gb,PATRICIA ELIZABETH BUTLER,,PATRICIA ELIZABETH,BUTLER,secretary,2007-04-26,91946330002,...,1960-08,Person,"8 BERNISDALE GARDENS\nDRUMCHAPEL, GLASGOW, LAN...",8 BERNISDALE GARDENS\nDRUMCHAPEL,GLASGOW,LANARKSHIRE,G15 8BU,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/SC3...
17,212181694,SC322354,gb,THOMAS BUTLER,,THOMAS,BUTLER,director,2007-04-26,120439080001,...,1961-07,Person,"8 BERNISDALE GARDENS, GLASGOW, STRATHCLYDE, G1...",8 BERNISDALE GARDENS,GLASGOW,STRATHCLYDE,G15 8BU,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/SC3...
18,205069123,05149111,gb,CLIFFORD CHANCE SECRETARIES (CCA) LIMITED,,,CLIFFORD CHANCE SECRETARIES (CCA) LIMITED,secretary,2004-11-16,83911920002,...,,Company,"10 UPPER BANK STREET, LONDON, E14 5JJ",10 UPPER BANK STREET,LONDON,,E14 5JJ,,2015-12-04 00:00:00 UTC,https://beta.companieshouse.gov.uk/company/051...


From this and further exploring the data there are two "types" of officer in the data; people and companies.

We will start with how to map People officers into the database. We will also ignore officers who have an "end date" as they are no longer actively connected to the company.

## People who are officers

First we will define a number of functions that can map the data into the node objects that we want to hold for people who are officers.

In [8]:
def officer_dob(record):
    "Function to create dob string identifier for person"
    record.fillna('')
    data = record['partial_date_of_birth']
    try:
        year, month = data.split('-')
        dob_segment = "{}/{}".format(str(month).zfill(2), year)
        return dob_segment
    except (AttributeError, TypeError, ValueError) as e:
        return '00/0000'

def officer_name_id(record):
    "Function to create name string identifier for person"
    fullname = record['name'].split()
    data = {
        'first_name': fullname[0],
        'last_name': record['last_name'],
        'middle_name': ' '.join(fullname[1:-1])
    }
    try:
        name_segment = "{}_{}_{}".format(data.get('last_name', '').upper(), 
                                         data.get('middle_name', '').lower(),
                                         data.get('first_name', '').lower())
        return name_segment.replace(' ', '-')
    except TypeError as e:
        return 'NONAME__BLANK'
    except AttributeError:
        return 'NONAME__BLANK'
    
def officer_uid(record):
    "Function to create a unique ID for someone from their name and dob"
    components = [officer_name_id(record), officer_dob(record)]
    return {'uid': ':'.join(components)}

def officer_name(record):
    "Function to create name record of person"
    fullname = record['name'].split()
    data = {
        'forename': fullname[0].title(),
        'surname': record['last_name'].title(),
        'middle_name': ' '.join(fullname[1:-1]).title(),
        'title': record['title'].title(),
        'name': ' '.join([record['title'], record['name']])
    }    
    try:
        name_record = {
            'surname': data.get('surname', ''),
            'middle_name': data.get('middle_name', ''),
            'forename': data.get('forename', ''),
            'title': data.get('title', ''),
            'name': data.get('name', '').upper()
        }
        return name_record
    except TypeError as e:
        return {}
    except AttributeError:
        return {}

def officer_address(record):
    "Function to create a address details for person"
    record.fillna('', inplace=True)
    address = record['address.street_address'].split('\\n')
    address_Line1 = address[0]
    address_Line2 = ''
    if len(address)>1:
        address_Line2 = address[1]

    data = {
        'address_line_1': address_Line1,
        'address_line_2': address_Line2,
        'locality': record['address.locality'],
        'region': record['address.region'],
        'postal_code': record['address.postal_code'],
        'country': record['address.country']
    }
    try:
        new_address = {
            'address_Line1': data.get('address_line_1', ''),
            'address_Line2': data.get('address_line_2', ''),
            'address_PostTown': data.get('locality', '').upper(),
            'address_POBox': data.get('po_box', ''),
            'address_County': data.get('region', '').upper(),
            'address_PostCode': data.get('postal_code', '').upper(),
            'address_Country': data.get('country', '').upper(),
            'address_InFull': record['address.in_full']
        }
        return new_address
    except TypeError as e:
        return {'address_Country': "UNKNOWN"}
    except AttributeError:
        return {'address_Country': "UNKNOWN"}
    
def officer_details(record):
    "Function to create a relationship details for person"
    details = {}
    details['company_id'] = str(record['company_number'])
    details['officer_kind'] = record['position'].upper()
    details['nationality'] = record['nationality']
    details['ceased_on'] = record['end_date']
    details['source_url'] = record['source_url']
    details['country_of_residence'] = record['country_of_residence']
    details['person_uid'] = str(record['person_uid'])
    dob = record['partial_date_of_birth']
    try:
        year, month = dob.split('-')
        details['DOB'] = "{}/{}".format(str(month).zfill(2), year)
    except (TypeError, ValueError) as e:
        pass
    return details
    

def new_officer_record(record):
    "Function to create a new record that is flat"
    uid = officer_uid(record)
    address = officer_address(record)
    return {**uid, **address, 
            **officer_details(record), 
            **officer_name(record)}

In [9]:
sample_df[sample_df.type == 'Person'].apply(new_officer_record, axis=1).iloc[2]

{'DOB': '08/1966',
 'address_Country': '',
 'address_County': 'WEST GLAMORGAN',
 'address_InFull': 'PENYCASTELL FARM BRYN, PORT TALBOT, WEST GLAMORGAN, SA13 2PY',
 'address_Line1': 'PENYCASTELL FARM BRYN',
 'address_Line2': '',
 'address_POBox': '',
 'address_PostCode': 'SA13 2PY',
 'address_PostTown': 'PORT TALBOT',
 'ceased_on': '',
 'company_id': '05957738',
 'country_of_residence': 'ENGLAND',
 'forename': 'Sally-Ann',
 'middle_name': 'Michelle',
 'name': 'MRS SALLY-ANN MICHELLE BROMLEY',
 'nationality': 'BRITISH',
 'officer_kind': 'DIRECTOR',
 'person_uid': '12148419',
 'source_url': 'https://beta.companieshouse.gov.uk/company/05957738',
 'surname': 'Bromley',
 'title': 'Mrs',
 'uid': 'BROMLEY_michelle_sally-ann:08/1966'}

We will use the reference objects that we have built before to know how to map countries and nationalities

In [10]:
country_code_map = pd.read_pickle('./data/clean_country_code_map.pkl')
combined_map = pd.read_pickle('./data/combined_country_map.pkl')
nationality_map = pd.read_pickle('./data/nation_map.pkl')

Let's wrap up our processing data into a function that can act on an input dataframe object

In [11]:
def officer_etl(original_df):
    original_df = original_df[original_df.type == 'Person']
    original_df = original_df[original_df.end_date.isnull()]
    processed_df = original_df.apply(lambda s: pd.Series(new_officer_record(s)), axis=1)
    processed_df['Citizen_of'] = processed_df.nationality.apply(lambda x: nationality_map.get(x.lower(), ''))
    processed_df['Registered_in'] = processed_df.address_Country.apply(lambda x: combined_map.get(x.lower(), ''))
    processed_df['Resident_in'] = processed_df.country_of_residence.apply(lambda x: combined_map.get(x.lower(), ''))
    input_data = [v for k,v in processed_df.T.to_dict().items()]
    return input_data

Example test of action a sample of the data

In [12]:
test = officer_etl(sample_df.head(20))
print(len(test))
print(test[0])

8
{'DOB': '05/1970', 'address_Country': 'UNITED KINGDOM', 'address_County': 'WEST MIDLANDS', 'address_InFull': '40 LEA MANOR DRIVE\\nPENN, WOLVERHAMPTON, WEST MIDLANDS, WV4 5PJ, UNITED KINGDOM', 'address_Line1': '40 LEA MANOR DRIVE', 'address_Line2': 'PENN', 'address_POBox': '', 'address_PostCode': 'WV4 5PJ', 'address_PostTown': 'WOLVERHAMPTON', 'ceased_on': '', 'company_id': '05957738', 'country_of_residence': 'UNITED KINGDOM', 'forename': 'David', 'middle_name': 'Ian', 'name': 'MR DAVID IAN BROMLEY', 'nationality': 'BRITISH', 'officer_kind': 'DIRECTOR', 'person_uid': '11594785', 'source_url': 'https://beta.companieshouse.gov.uk/company/05957738', 'surname': 'Bromley', 'title': 'Mr', 'uid': 'BROMLEY_ian_david:05/1970', 'Citizen_of': 'GB', 'Registered_in': '', 'Resident_in': ''}


## Looping through all of the data and pushing it into Neo4j

In [14]:
from neo4j.v1 import GraphDatabase
driver = GraphDatabase.driver("bolt://10.0.0.1:7687", auth=("myusername", "mypassword"))

In [15]:
num=0
done=0
chunks = pd.read_csv(master_path+'officers.csv', chunksize=100000)
for chunk in chunks:
    input_data = officer_etl(chunk)
    num+=1
    done+=len(input_data)
    print(num)
    if len(input_data) > 0:
        with driver.session() as session:
            session.run(("UNWIND {list} AS d "
                         "MERGE (c:Person {uid: d.uid}) "
                         "ON CREATE SET c.forename=d.forename, "
                         "c.middle_name=d.middle_name, "
                         "c.surname=d.surname, "
                         "c.title=d.title, "
                         "c.fullname=d.name, "
                         "c.dob=d.DOB, "
                         "c.person_uid=d.person_uid, "
                         "c.nationality=d.nationality, "
                         "c.address_Line1=d.address_Line1, "
                         "c.address_Line2=d.address_Line2, "
                         "c.address_PostTown=d.address_PostTown, "
                         "c.address_County=d.address_County, "
                         "c.address_PostCode=d.address_Postcode, "                 
                         "c.address_Country=d.address_Country, "                 
                         "c.uri=d.source_url;"), {"list": input_data})

        with driver.session() as session:
            session.run(("UNWIND {list} AS d "
                         "MATCH (c:Person {uid: d.uid}) "
                         "MERGE (country:Country {code: d.Registered_in}) "
                         "MERGE (c)-[:REGISTERED_IN]->(country);"), {"list": input_data})

        with driver.session() as session:
            session.run(("UNWIND {list} AS d "
                         "MATCH (c:Person {uid: d.uid}) "
                         "MERGE (country:Country {code: d.Citizen_of}) "
                         "MERGE (c)-[:CITIZEN_OF]->(country);"), {"list": input_data})

        with driver.session() as session:
            session.run(("UNWIND {list} AS d "
                         "MATCH (p:Person {uid: d.uid}) "
                         "SET p:Officer;"), {"list": input_data})    

        with driver.session() as session:
            session.run(("UNWIND {list} AS d "
                         "MATCH (p:Person {uid: d.uid}) "
                         "MERGE (c:Company {uid: d.company_id}) "
                         "CREATE (p)-[r:IS_OFFICER_OF {type: d.officer_kind, ceased_on: d.ceased_on}]->(c);"), {"list": input_data})
        print("Finished chunk...{}%".format(done/40386527))
    else:
        print("No records in this chunk")
print("DONE!")        

1
Finished chunk...0.0010879618344008634%
2
DONE!
