# Let's try and create the company nodes within neo4j

Loop through the active company list and generate the company nodes

In [1]:
import os

import pandas as pd
from neo4j.v1 import GraphDatabase

In [2]:
# Connection settings come from the environment so that real credentials never
# have to be saved inside the notebook.  The fallbacks are the notebook's
# original placeholder values, so behaviour is unchanged when nothing is set.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://10.0.0.1:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "myusername"),
        os.environ.get("NEO4J_PASSWORD", "mypassword"),
    ),
)

Let's take a peek at the data

In [3]:
# Only the first 1,000 rows are read here; this sample is what the cleaning
# steps below are tried out on.
company_sample = pd.read_csv(
    './data/BasicCompanyDataAsOneFile-2017-09-01.csv',
    nrows=1000,
)

In [4]:
# Trim surrounding whitespace from every header and swap '.' for '_' so the
# columns can be reached with attribute access (e.g. company_sample.RegAddress_Country).
cols = [header.strip().replace('.', '_') for header in company_sample.columns]
company_sample.columns = cols

In [5]:
# Show the second-to-last row of the sample to eyeball the renamed columns and their values.
company_sample.iloc[-2]

CompanyName                                                           0131 TACFIT LTD
CompanyNumber                                                                SC540913
RegAddress_CareOf                                                                 NaN
RegAddress_POBox                                                                  NaN
RegAddress_AddressLine1                                  134B5 PORTOBELLO HIGH STREET
RegAddress_AddressLine2                                                           NaN
RegAddress_PostTown                                                         EDINBURGH
RegAddress_County                                                                 NaN
RegAddress_Country                                                           SCOTLAND
RegAddress_PostCode                                                          EH15 1AH
CompanyCategory                                               Private Limited Company
CompanyStatus                                         

## We shall only worry about current company names.

We will need our country references again.

In [6]:
# Load the two country lookups that were saved under ./data as pickles.
# NOTE(review): both objects are only used through `.get(...)` / `.keys()` below, so
# presumably they are dict-like; `combined_map` is looked up with an upper-cased
# country name and `country_code_map` with the resulting code -- confirm against
# the notebook that produced them.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
country_code_map = pd.read_pickle('./data/clean_country_code_map.pkl')
combined_map = pd.read_pickle('./data/combined_country_map.pkl')

In [7]:
import numpy as np

# Swap every NaN for an empty string; later cells test for missing values with == ''.
company_sample = company_sample.replace(to_replace=np.nan, value=u'', regex=True)

In [8]:
def _country_code(raw_name):
    """Look up the code for a raw country name in `combined_map` ('UNKNOWN' if absent)."""
    return combined_map.get(str(raw_name).upper(), 'UNKNOWN')


def _country_name(row, code_col, raw_col):
    """Return the name mapped to the row's code, or the row's raw value when the code is unmapped."""
    code = row[code_col]
    if code in country_code_map.keys():
        return country_code_map.get(code, 'UNKNOWN')
    return row[raw_col]


def _flag_blank(name):
    """Replace an empty country name with the 'NO_COUNTRY_LISTED' marker."""
    return 'NO_COUNTRY_LISTED' if name == '' else name


# Country of the registered address.
company_sample['CountryCode'] = company_sample['RegAddress_Country'].map(_country_code)
company_sample['CleanCountry'] = company_sample.apply(
    lambda row: _country_name(row, 'CountryCode', 'RegAddress_Country'),
    axis=1,
)
company_sample['CleanCountry'] = company_sample['CleanCountry'].map(_flag_blank)

# Country of origin, cleaned the same way.
company_sample['CountryOfOriginCode'] = company_sample['CountryOfOrigin'].map(_country_code)
company_sample['CleanCountryOfOrigin'] = company_sample.apply(
    lambda row: _country_name(row, 'CountryOfOriginCode', 'CountryOfOrigin'),
    axis=1,
)
company_sample['CleanCountryOfOrigin'] = company_sample['CleanCountryOfOrigin'].map(_flag_blank)

In [9]:
# Blank postcodes get the 'UNKNOWN' marker; everything else is kept as-is.
company_sample['CleanPostcode'] = company_sample['RegAddress_PostCode'].map(
    lambda postcode: postcode if postcode != '' else 'UNKNOWN'
)
# Total number of rows that ended up with a CleanPostcode value.
company_sample['CleanPostcode'].value_counts().sum()

1000

In [10]:
# NaNs were already replaced with '' earlier in the notebook, so isnull() on its
# own can no longer flag a missing company number.  Count blank strings as
# missing too, since CompanyNumber becomes the Company node's uid.
missing_number = company_sample.CompanyNumber.isnull() | (company_sample.CompanyNumber == '')
missing_number.value_counts()

False    1000
Name: CompanyNumber, dtype: int64

In [11]:
# One {column: value} dict per row, in row order, ready to be sent as a query parameter.
input_data = list(company_sample.T.to_dict().values())

In [12]:
# Inspect the first row dict to check the keys and value types that the Cypher queries will see.
input_data[0]

{'Accounts_AccountCategory': 'DORMANT',
 'Accounts_AccountRefDay': 30.0,
 'Accounts_AccountRefMonth': 9.0,
 'Accounts_LastMadeUpDate': '30/09/2016',
 'Accounts_NextDueDate': '30/06/2018',
 'CleanCountry': 'NO_COUNTRY_LISTED',
 'CleanCountryOfOrigin': 'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND',
 'CleanPostcode': 'LS10 2RU',
 'CompanyCategory': 'Private Limited Company',
 'CompanyName': '! LTD',
 'CompanyNumber': '08209948',
 'CompanyStatus': 'Active',
 'ConfStmtLastMadeUpDate': '11/09/2016',
 'ConfStmtNextDueDate': '25/09/2019',
 'CountryCode': 'UNKNOWN',
 'CountryOfOrigin': 'United Kingdom',
 'CountryOfOriginCode': 'GB',
 'DissolutionDate': '',
 'IncorporationDate': '11/09/2012',
 'LimitedPartnerships_NumGenPartners': 0,
 'LimitedPartnerships_NumLimPartners': 0,
 'Mortgages_NumMortCharges': 0,
 'Mortgages_NumMortOutstanding': 0,
 'Mortgages_NumMortPartSatisfied': 0,
 'Mortgages_NumMortSatisfied': 0,
 'PreviousName_10_CONDATE': '',
 'PreviousName_10_CompanyName': '',
 'Previ

Company node properties:
- name:
- number:
- Accounts.LastMadeUpDate:
- Returns.LastMadeUpDate: '11/09/2015'
- Returns.NextDueDate: '09/10/2016'
- Address.Line1:
- Address.Line2:
- Address.Country
- Address.PostCode
- Address.PostTown
- Address.POBox
- Address.County
- URI: 'http://business.data.gov.uk/id/company/08209948'

status nodes:
- type: DORMANT, Active

company_category nodes:
- type: PLC ...

Country:
- name: United Kingdom ...

```
UNWIND {list} AS d
MATCH (p:Person {user_id: d.id})
MERGE (a:Artist {artist_name: d.name})
MERGE (p)-[:LIKES {times: d.plays}]->(a)

 'Accounts.LastMadeUpDate': '30/09/2016',
 'Accounts.NextDueDate': '30/06/2018',
Returns.LastMadeUpDate: '11/09/2015'
Returns.NextDueDate:
```

# Looping over all the data and inserting the data into the neo4j Database

Here we will read the input file in chunks of 100,000 records, apply the cleaning steps we tested above to build the node properties, and run batched Cypher queries that create the nodes and connect them with relationships.

In [None]:
# Cypher statements, run once per chunk with the chunk's rows bound to the
# `list` parameter.  Every `d.<key>` must match a cleaned DataFrame column name
# exactly (keys are case-sensitive): a key that is absent from the row map
# evaluates to null, and SET with null leaves the property unset without
# raising an error.
# NOTE(review): `{list}` is the legacy parameter syntax that goes with the
# neo4j.v1 driver used in this notebook; newer servers expect `$list`.
CREATE_COMPANIES = (
    "UNWIND {list} AS d "
    "MERGE (c:Company {uid: d.CompanyNumber}) "
    "ON CREATE SET c.name=d.CompanyName, "
    "c.accounts_LastMadeUpDate=d.Accounts_LastMadeUpDate, "
    "c.accounts_NextDueDate=d.Accounts_NextDueDate, "
    "c.returns_LastMadeUpDate=d.Returns_LastMadeUpDate, "
    "c.returns_NextDueDate=d.Returns_NextDueDate, "
    # Fixed keys: the columns are RegAddress_AddressLine1/2 and RegAddress_PostCode.
    "c.address_Line1=d.RegAddress_AddressLine1, "
    "c.address_Line2=d.RegAddress_AddressLine2, "
    "c.address_PostTown=d.RegAddress_PostTown, "
    "c.address_POBox=d.RegAddress_POBox, "
    "c.address_County=d.RegAddress_County, "
    "c.address_PostCode=d.RegAddress_PostCode, "
    "c.address_Country=d.CleanCountry, "
    "c.uri=d.URI;"
)

LINK_REGISTERED_COUNTRY = (
    "UNWIND {list} AS d "
    "MATCH (c:Company {uid: d.CompanyNumber}) "
    "MERGE (country:Country {code: d.CountryCode}) "
    "MERGE (c)-[:REGISTERED_IN]->(country);"
)

LINK_ORIGIN_COUNTRY = (
    "UNWIND {list} AS d "
    "MATCH (c:Company {uid: d.CompanyNumber}) "
    "MERGE (country:Country {code: d.CountryOfOriginCode}) "
    "MERGE (c)-[:HAS_ORIGIN]->(country);"
)

LINK_POSTCODE = (
    "UNWIND {list} AS d "
    "MATCH (c:Company {uid: d.CompanyNumber}) "
    "MERGE (pc:Postcode {uid: d.CleanPostcode}) "
    "MERGE (c)-[:REGISTERED_IN]->(pc);"
)

# Order matters: the three linking statements MATCH on the Company nodes that
# the first statement creates.
CHUNK_STATEMENTS = (
    CREATE_COMPANIES,
    LINK_REGISTERED_COUNTRY,
    LINK_ORIGIN_COUNTRY,
    LINK_POSTCODE,
)


def _clean_country_columns(raw_names):
    """Return ``(codes, names)`` for a Series of raw country names.

    ``codes`` holds the `combined_map` lookup of each upper-cased raw name
    ('UNKNOWN' when absent).  ``names`` holds the `country_code_map` entry for
    that code, falling back to the raw value when the code is unmapped and to
    'NO_COUNTRY_LISTED' when the result is an empty string.
    """
    codes = raw_names.map(lambda raw: combined_map.get(str(raw).upper(), 'UNKNOWN'))
    # Pairwise pass over (code, raw) instead of a row-wise DataFrame.apply:
    # same result per row, without building a Series for every row.
    names = [
        country_code_map.get(code, 'UNKNOWN') if code in country_code_map.keys() else raw
        for code, raw in zip(codes, raw_names)
    ]
    names = ['NO_COUNTRY_LISTED' if name == '' else name for name in names]
    return codes, names


def _prepare_chunk(chunk):
    """Normalise headers, blank out NaNs and add the derived country/postcode columns."""
    chunk.columns = [col.strip().replace('.', '_') for col in chunk.columns]
    chunk = chunk.replace(np.nan, u'', regex=True)
    chunk['CountryCode'], chunk['CleanCountry'] = _clean_country_columns(
        chunk.RegAddress_Country)
    chunk['CountryOfOriginCode'], chunk['CleanCountryOfOrigin'] = _clean_country_columns(
        chunk.CountryOfOrigin)
    chunk['CleanPostcode'] = chunk.RegAddress_PostCode.map(lambda x: 'UNKNOWN' if x == '' else x)
    return chunk


# NOTE(review): the saved output shows a truncated warning for most chunks --
# presumably pandas' mixed-dtype warning.  Consider passing explicit dtypes
# (CompanyNumber in particular must stay a string to keep leading zeros) -- TODO confirm.
chunks = pd.read_csv('./data/BasicCompanyDataAsOneFile-2017-09-01.csv', chunksize=100000)
for chunk in chunks:
    # One {column: value} dict per row; this list is what UNWIND iterates over.
    records = _prepare_chunk(chunk).to_dict('records')

    print('Starting Insert ....')
    for statement in CHUNK_STATEMENTS:
        # Each statement gets its own session, exactly as before.
        with driver.session() as session:
            session.run(statement, {"list": records})
    print("Finished chunk...")
print("DONE!")

  interactivity=interactivity, compiler=compiler, result=result)


Starting Insert ....
Finished chunk...
Starting Insert ....
Finished chunk...


  interactivity=interactivity, compiler=compiler, result=result)


Starting Insert ....
Finished chunk...


  interactivity=interactivity, compiler=compiler, result=result)


Starting Insert ....
Finished chunk...


  interactivity=interactivity, compiler=compiler, result=result)


Starting Insert ....
Finished chunk...


  interactivity=interactivity, compiler=compiler, result=result)


Starting Insert ....
Finished chunk...


  interactivity=interactivity, compiler=compiler, result=result)


Starting Insert ....


### Quick check of how many companies have been inserted

In [19]:
with driver.session() as session:
    result = session.run("MATCH (c:Company) RETURN COUNT(c);")
    print(result.data())

[{'COUNT(c)': 4077979}]
