In [1]:
## Potentially required installs: ##

# Install a pip package in the current Jupyter kernel
# Code taken from: https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
import sys
!{sys.executable} -m pip install PyPDF2
import sys
!{sys.executable} -m pip install pymysql
import sys
!{sys.executable} -m pip install mysql-connector





In [2]:
## Import required packages: ##
import requests
import json
import math
import numpy
import shutil # For copying files
import time # For waiting x seconds
from datetime import date as dt # get todays date
import pandas as pd
from numpy import nan
#import PyPDF2             # For reading PDFs
#from pathlib import Path  # For writing/saving PDFs from requests
import pymysql
import mysql.connector
import os.path # Checking whether file exists in current directory

## Step 1 - Identify Companies Of Interest ##

**GetLocalActiveCompanies(api_key, location, currentRequestCount, maxCountLimit, waitPeriod, numberOfPages=None)**  
Gives basic identifying information on all active companies in a given
location.

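Input - api_key, location of interest, the rate-limit values (currentRequestCount, maxCountLimit, waitPeriod) that are passed on to APIRateLimitHandler, numberOfPages of results.  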
- If numberOfPages=None then all available results will be provided.
- The current number of results per page is 20, so 2 pages gives 40 results.  


Output - Four lists, indexed equally, followed by the updated request count.  
- listOfCompanyNumbers - Company number.  
- listOfCompanyNames -  Company name.  
- listOfCompanySICCodes - All of the company's SIC codes.  
- listOfCompanyAddresses - The company's full registered office address.
- currentRequestCount - The running count of API requests after this call.

In [3]:
def APIRateLimitHandler(currentCount, maxCountLimit, waitPeriod):
    """Record one API request and pause once the request budget is used up.

    Parameters:
        currentCount  - number of requests counted so far.
        maxCountLimit - request count at which the function pauses.
        waitPeriod    - base pause in seconds; a further 30 seconds are added.

    Returns:
        The updated request count: currentCount + 1, or 0 straight after a pause.
    """
    currentCount = currentCount + 1
    # ">=" rather than "==": if the counter is ever already past the limit
    # (e.g. the limit was lowered between calls) an equality test would never
    # fire again and the pause would be skipped for good.
    if currentCount >= maxCountLimit:
        time.sleep(waitPeriod + 30)
        currentCount = 0
    return currentCount

In [4]:
def GetLocalActiveCompanies(api_key, location, currentRequestCount, maxCountLimit, waitPeriod, numberOfPages=None): 
    """Collect identifying details of the active companies found for a location.

    One request is made to read the total number of hits, then one request per
    results page. Every request is counted through APIRateLimitHandler.

    Parameters:
        api_key             - API key, sent as the basic-auth user name.
        location            - location to search for.
        currentRequestCount - running request count, passed to APIRateLimitHandler.
        maxCountLimit       - request limit, passed to APIRateLimitHandler.
        waitPeriod          - wait period, passed to APIRateLimitHandler.
        numberOfPages       - number of result pages to read (20 results per
                              page); None reads every available page. A value
                              larger than the number of available pages is
                              reduced to that number.

    Returns:
        (listOfCompanyNumbers, listOfCompanyNames, listOfCompanySICCodes,
        listOfCompanyAddresses, currentRequestCount). The four lists are indexed
        equally; a field missing from a result is stored as None.

    Raises:
        requests.HTTPError - if the API answers with an error status.
    """
    url = "https://api.company-information.service.gov.uk/advanced-search/companies"
    # Passing the query as a dict lets requests URL-encode the location, so
    # characters such as "&" or "#" cannot corrupt the query string.
    queryParameters = {"location": location, "company_status": "active"}

    response = requests.get(url, params=queryParameters, auth=(api_key, ''))
    currentRequestCount = APIRateLimitHandler(currentRequestCount, maxCountLimit, waitPeriod)
    response.raise_for_status()
    searchResult = json.loads(response.text)

    hits = searchResult["hits"]
    itemsPerPage = 20
    numberOfPagesAvailable = math.ceil(hits / itemsPerPage)
    # Never request pages beyond the last one that actually holds results.
    if numberOfPages is None or numberOfPages > numberOfPagesAvailable:
        numberOfPages = numberOfPagesAvailable

    listOfCompanyNumbers = []
    listOfCompanyNames = []
    listOfCompanyAddresses = []
    listOfCompanySICCodes = []

    for page in range(numberOfPages):
        pageParameters = dict(queryParameters, start_index=page * itemsPerPage)

        response = requests.get(url, params=pageParameters, auth=(api_key, ''))
        currentRequestCount = APIRateLimitHandler(currentRequestCount, maxCountLimit, waitPeriod)
        response.raise_for_status()
        searchResult = json.loads(response.text)

        for company in searchResult["items"]:
            listOfCompanyNumbers.append(company.get("company_number"))
            listOfCompanyNames.append(company.get("company_name"))
            listOfCompanySICCodes.append(company.get("sic_codes"))

            addressAsDictionary = company.get("registered_office_address")
            if addressAsDictionary is None:
                listOfCompanyAddresses.append(None)
                continue

            # The first line is added without a separator and every later field
            # with ", " in front, even when the first line is absent. This keeps
            # the comma positions stable for GetCompanyPostcodes and
            # GetCompanyStreetAddress, which parse the address by position.
            registeredOfficeAddress = addressAsDictionary.get("address_line_1", "")
            for field in ("address_line_2", "locality", "postal_code", "country"):
                if field in addressAsDictionary:
                    registeredOfficeAddress = registeredOfficeAddress + ", " + addressAsDictionary[field]
            listOfCompanyAddresses.append(registeredOfficeAddress)

    return listOfCompanyNumbers, listOfCompanyNames, listOfCompanySICCodes, listOfCompanyAddresses, currentRequestCount

**Using GetLocalActiveCompanies:**

In [5]:
# The API key is read from the environment so that the credential is not
# stored in (and shared along with) the notebook.
api_key = os.environ.get("COMPANIES_HOUSE_API_KEY")
if not api_key:
    raise RuntimeError("Set the COMPANIES_HOUSE_API_KEY environment variable to your Companies House API key.")
location = "Swansea"
currentRequestCount = 0
maxCountLimit = 599 
waitPeriod = 300
numberOfPages=2
listOfCompanyNumbers, listOfCompanyNames, listOfCompanySICCodes, listOfCompanyAddresses, currentRequestCount = GetLocalActiveCompanies(api_key, location, currentRequestCount, maxCountLimit, waitPeriod, numberOfPages)

print("\n Company numbers were: \n")
print(listOfCompanyNumbers)
print("\n Company names were: \n")
print(listOfCompanyNames)
print("\n Company SIC codes were: \n")
print(listOfCompanySICCodes)
print("\n Company addresses were: \n")
print(listOfCompanyAddresses)

print("\n The number of results was: \n")
print(len(listOfCompanyNumbers))




 Company numbers were: 

['12759132', 'OC307349', '09641796', '09644485', '09674522', '09678896', '12542329', '12543573', '12546295', '12547731', '12562964', '12572217', '12576588', '12045733', '12046076', '12065560', '12166537', '12192465', '12200775', '12212399', '12216547', '11903225', '12385414', '12617191', '12643503', '12328611', '12461501', '10276734', '10677806', '10688778', '10688865', '10707932', '10745290', '10495409', '10802324', '10813861', '10550518', '10331304', '12677620', '12696180']

 Company names were: 

['SWND LTD', 'JOHN COLLINS & PARTNERS LLP', 'TYCHO PICTURES LIMITED', 'KYARA LIMITED', 'GARDEN VILLAGE STORES LIMITED', 'ACE RENTALS LIMITED', 'DSL TECHNICAL ENGINEERING LTD', 'FIRSTLINE RESIN FLOORING LIMITED', 'RDN GROUP LIMITED', 'KARUNA MARINE UK LTD', 'FLENAGER LTD', 'TITAN EQUINE LIMITED', 'SKETTY PIZZA HOUSE LIMITED', 'DRAW CONSTRUCTION (WALES) LIMITED', 'VENTURE WISE CONSULTANCY LTD', 'NATURIOL GRATES LTD', 'DCMEMARKETING LTD', 'CJL CONSTRUCTION SOLUTIONS L

In [6]:
def FormatPostcode(postcode):
    """Normalise the whitespace inside a postcode string.

    The text is split on whitespace; the first token is kept, followed by one
    space and all remaining tokens joined without spaces.

    Parameters:
        postcode - postcode text, possibly with leading, trailing or repeated
                   whitespace.

    Returns:
        The normalised postcode. A single-token input is returned without a
        trailing space, and a blank input gives an empty string.
    """
    tokens = postcode.split()
    if not tokens:
        return ""
    if len(tokens) == 1:
        # Nothing follows the first token, so no separator is added.
        return tokens[0]
    return tokens[0] + " " + "".join(tokens[1:])

In [7]:
def GetCompanyPostcodes(listOfCompanyAddresses):
    """Extract a postcode from each comma-separated address string.

    An address is split on ','. Among the parts from the third one onwards, the
    last part containing a digit is taken as the postcode and passed through
    FormatPostcode.

    Parameters:
        listOfCompanyAddresses - list of address strings; an entry may be None.

    Returns:
        A list indexed like the input. An entry is None when the address is
        None or when no part qualifies as a postcode.
    """
    listOfCompanyPostcodes = []
    for address in listOfCompanyAddresses:
        # Reset for every address, so a company without a recognisable postcode
        # can never inherit the postcode found for the previous company.
        postcode = None
        if address is not None:
            for position, part in enumerate(address.split(',')):
                if position > 1 and any(character.isdigit() for character in part):
                    postcode = part

        if postcode is None:
            listOfCompanyPostcodes.append(None)
        else:
            listOfCompanyPostcodes.append(FormatPostcode(postcode))

    return listOfCompanyPostcodes

In [8]:
def ProcessAddressSecondLine(streetAddress,location):
    """Decide whether an address part looks like a street line worth keeping.

    Returns True when streetAddress contains one of the street keywords (in
    lower-case or capitalised spelling) and is not equal to location; otherwise
    returns False.
    """
    streetKeywords = ('road', 'street', 'park', 'Road', 'Street', 'Park', 'way', 'Way', 'lane', 'Lane')
    containsKeyword = any(keyword in streetAddress for keyword in streetKeywords)
    return containsKeyword and streetAddress != location

In [9]:
def GetCompanyStreetAddress(listOfCompanyAddresses, location):
    """Extract the street part of each comma-separated address string.

    The first comma-separated part is always used. The second part is appended,
    separated by one space, when it contains a digit and
    ProcessAddressSecondLine accepts it.

    Parameters:
        listOfCompanyAddresses - list of address strings; an entry may be None.
        location               - location name, passed to ProcessAddressSecondLine.

    Returns:
        A list indexed like the input; the entry for a None address is None.
    """
    listOfCompanyStreetAddress = []
    for address in listOfCompanyAddresses:
        if address is None:
            # No address is available for this company; keep the lists aligned.
            listOfCompanyStreetAddress.append(None)
            continue

        addressParts = address.split(',')
        streetAddress = addressParts[0]
        if len(addressParts) > 1:
            secondLine = addressParts[1]
            # secondLine is used exactly as split, including any leading space,
            # so the text produced for existing addresses is unchanged.
            if any(character.isdigit() for character in secondLine) and ProcessAddressSecondLine(secondLine, location):
                streetAddress = streetAddress + " " + secondLine

        listOfCompanyStreetAddress.append(streetAddress)

    return listOfCompanyStreetAddress

In [10]:
# Derive the postcode of every company from the address strings gathered earlier,
# then show the postcodes and how many there are.
listOfCompanyPostcodes = GetCompanyPostcodes(listOfCompanyAddresses)
print(listOfCompanyPostcodes)
print(len(listOfCompanyPostcodes))
# Do the same for the street part of each address; location is the search
# location set in an earlier cell.
listOfCompanyStreetAddress = GetCompanyStreetAddress(listOfCompanyAddresses, location)
print(listOfCompanyStreetAddress)
print(len(listOfCompanyStreetAddress))

['SA5 9BP', 'SA6 8QP', 'SA4 9ZJ', 'SA7 9FS', 'SA4 4HQ', 'SA8 4HU', 'SA3 4PQ', 'SA7 9LA', 'SA5 4HP', 'SA3 5AU', 'SA1 2DL', 'SA1 6AT', 'SA2 8JJ', 'SA1 4EH', 'SA1 1QP', 'SA7 0AJ', 'SA6 5AP', 'SA7 9WS', 'SA8 4HA', 'SA4 6BB', 'SA7 0AJ', 'SA1 4DH', 'SA3 3EY', 'SA1 2EF', 'SA2 0AA', 'SA3 5SU', 'SA7 0HH', 'SA3 3DQ', 'SA3 4QR', 'SA2 7LD', 'SA3 4BL', 'SA7 0AJ', 'SA6 7BP', 'SA1 6AT', 'SA4 6RW', 'SA1 3LW', 'SA4 3PD', 'SA5 4AE', 'SA4 6UF', 'SA6 7JZ']
40
['88 Heol Gerrig Treboeth', 'Venture Court Waterside Business Park Valley Way', '77 Fford Ger Y Llyn', 'Oystermouth House Charter Court', '94 Swansea Road', '4 High Street', '32 32 Cambridge Road', 'C/O Bevan Buckland Llp Ground Floor    Cardigan House', 'Unit 4-5 Bell Court Felinfach', '48 Mumbles Road', '80 New Cut Road', '11 Calvert Terrace', '18 Park Way', 'Avc House  21 Northampton Lane', '5 Prospect Place', '8 Axis Court Riverside Business Park', 'Tudor Gables Park Road', '22 Ffordd Melyn Mair', 'Alloy House Tawe Terrace', '31 Dyffryn Road', '8

## Step 2 - Profile Each Company ##

**RequestProfile(api_key, companyNumber, currentRequestCount, maxCountLimit, waitPeriod)**  
Uses the company profile resource (within the public data API) to get basic information on the date of company creation and company type. These pieces of information are then extracted from the returned JSON object using the functions GetCompanyDateOfBirth and GetCompanyType.

Input - api_key, companyNumber. 
- api_key - Use your own.
- companyNumber - The unique ID number for the company of interest. These are obtained using the function GetLocalActiveCompanies.
 
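Output - searchResult, currentRequestCount.  
- searchResult - A JSON object, which is then inspected for the required information by the associated aforementioned functions.
- currentRequestCount - The running count of API requests after this call.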

In [11]:
def RequestProfile(api_key, companyNumber, currentRequestCount, maxCountLimit, waitPeriod):
    """Fetch the profile of one company and decode the JSON response.

    Parameters:
        api_key             - API key, sent as the basic-auth user name.
        companyNumber       - company number inserted into the request URL.
        currentRequestCount - running request count, passed to APIRateLimitHandler.
        maxCountLimit       - request limit, passed to APIRateLimitHandler.
        waitPeriod          - wait period, passed to APIRateLimitHandler.

    Returns:
        (searchResult, currentRequestCount): the decoded response body and the
        updated request count.
    """
    profileUrl = "https://api.company-information.service.gov.uk/company/{}".format(companyNumber)
    profileResponse = requests.get(profileUrl, auth=(api_key, ''))
    # The request is counted before the body is decoded.
    updatedRequestCount = APIRateLimitHandler(currentRequestCount, maxCountLimit, waitPeriod)
    decodedProfile = json.loads(profileResponse.text)
    return decodedProfile, updatedRequestCount

In [12]:
def GetCompanyDateOfBirth(companyProfile):
    """Return the 'date_of_creation' entry of a company profile.

    Raises KeyError when the profile has no such entry.
    """
    return companyProfile['date_of_creation']

In [13]:
def GetCompanyType(companyProfile):
    """Return the 'type' entry of a company profile.

    Raises KeyError when the profile has no such entry.
    """
    return companyProfile['type']

**Using RequestProfile, GetCompanyDateOfBirth and GetCompanyType**

In [14]:
companyNumber = "12141111"
# The API key is read from the environment so that the credential is not
# stored in (and shared along with) the notebook.
api_key = os.environ.get("COMPANIES_HOUSE_API_KEY")
if not api_key:
    raise RuntimeError("Set the COMPANIES_HOUSE_API_KEY environment variable to your Companies House API key.")
currentRequestCount = 0
maxCountLimit = 599 
waitPeriod = 300
companyProfile, currentRequestCount = RequestProfile(api_key, companyNumber, currentRequestCount, maxCountLimit, waitPeriod)

dateOfCompanyCreation = GetCompanyDateOfBirth(companyProfile)
print("\nThe date Of Company Creation was: \n")
print(dateOfCompanyCreation)

companyType = GetCompanyType(companyProfile)
print("\nThe company type was: \n")
print(companyType)

print("\nThe profile info contains: \n")
print(companyProfile)


The date Of Company Creation was: 

2019-08-06

The company type was: 

ltd

The profile info contains: 

{'links': {'filing_history': '/company/12141111/filing-history', 'persons_with_significant_control': '/company/12141111/persons-with-significant-control', 'self': '/company/12141111', 'officers': '/company/12141111/officers'}, 'etag': '0bc04c21c9af64c87c623093cd2762b6553207cb', 'company_name': 'LATEST MARKETING SOLUTIONS LTD', 'jurisdiction': 'england-wales', 'registered_office_is_in_dispute': False, 'company_status': 'active', 'has_insolvency_history': False, 'accounts': {'next_made_up_to': '2022-08-31', 'overdue': False, 'next_accounts': {'period_start_on': '2021-09-01', 'overdue': False, 'period_end_on': '2022-08-31', 'due_on': '2023-05-31'}, 'next_due': '2023-05-31', 'accounting_reference_date': {'month': '08', 'day': '31'}, 'last_accounts': {'period_start_on': '2020-09-01', 'made_up_to': '2021-08-31', 'period_end_on': '2021-08-31', 'type': 'dormant'}}, 'has_charges': False, '

**GetIDsForAnyDocumentType(api_key, companyNumber, docType, currentRequestCount, maxCountLimit, waitPeriod, numberOfDocuments=None)**  
Returns the transaction IDs for filed company documents, along with the corresponding document IDs. These can then be used with the Filing History API to download those documents for inspection and further analysis.

Input - api_key, companyNumber, docType, numberOfDocuments in your results.  
- The docType can be accounts, confirmation statement, etc. The documentation on these types can be found at: https://developer-specs.company-information.service.gov.uk/companies-house-public-data-api/resources/filinghistorylist?v=latest .
- If numberOfDocuments=None then all available results will be provided.
- The results are chronologically ordered with the most recent results first. Thus if you request 3 results of type "confirmation-statement", they will be the 3 most recent confirmation statements, i.e. those from the previous 3 years.
 


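Output - Three lists, indexed equally, followed by the updated request count.  
- listOfTransactionIDs - Transaction IDs for the requested number of the (most recent) document type of interest.
- listOfDocumentIDs - The corresponding document IDs.
- listOfDocumentDates - The corresponding action dates of the documents.
- currentRequestCount - The running count of API requests after this call.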

In [15]:
def GetIDsForAnyDocumentType(api_key, companyNumber, docType, currentRequestCount, maxCountLimit, waitPeriod, numberOfDocuments=None): 
    """Collect the IDs and dates of a company's filed documents of one category.

    The filing history is read page by page, in the order the API returns it,
    and every entry whose "category" equals docType is recorded until
    numberOfDocuments entries have been collected. Every request is counted
    through APIRateLimitHandler.

    Parameters:
        api_key             - API key, sent as the basic-auth user name.
        companyNumber       - company number (string) placed in the request URL.
        docType             - filing category to keep, compared with each
                              entry's "category" field.
        currentRequestCount - running request count, passed to APIRateLimitHandler.
        maxCountLimit       - request limit, passed to APIRateLimitHandler.
        waitPeriod          - wait period, passed to APIRateLimitHandler.
        numberOfDocuments   - maximum number of documents to return; None means
                              no limit, and 0 returns empty lists.

    Returns:
        (listOfTransactionIDs, listOfDocumentIDs, listOfDocumentDates,
        currentRequestCount). The three lists are indexed equally.

    Raises:
        KeyError - if a matching entry lacks "transaction_id", "action_date" or
                   the "links" / "document_metadata" fields.
    """
    baseUrl = "https://api.company-information.service.gov.uk/company/" + companyNumber + "/filing-history"

    response = requests.get(baseUrl, auth=(api_key, ''))
    currentRequestCount = APIRateLimitHandler(currentRequestCount, maxCountLimit, waitPeriod)
    searchResult = json.loads(response.text)

    numberOfDocumentsAvailable = searchResult["total_count"]
    if numberOfDocuments is None:
        numberOfDocuments = numberOfDocumentsAvailable
    itemsPerPage = searchResult["items_per_page"]
    numberOfPages = math.ceil(numberOfDocumentsAvailable / itemsPerPage)

    listOfTransactionIDs = []
    listOfDocumentIDs = []
    listOfDocumentDates = []

    for page in range(numberOfPages):
        # Checked before any work so that a limit of 0 yields nothing and no
        # page is requested once enough documents have been collected.
        if len(listOfTransactionIDs) >= numberOfDocuments:
            break

        if page > 0:
            # Page 0 is the response already fetched above; requesting it again
            # would spend a rate-limited request on identical data.
            url = baseUrl + "?start_index=" + str(page * itemsPerPage)
            response = requests.get(url, auth=(api_key, ''))
            currentRequestCount = APIRateLimitHandler(currentRequestCount, maxCountLimit, waitPeriod)
            searchResult = json.loads(response.text)

        for document in searchResult["items"]:
            if len(listOfTransactionIDs) >= numberOfDocuments:
                break
            if document["category"] != docType:
                continue

            # Read all three values before appending any of them, so the lists
            # cannot end up with different lengths if a field is missing.
            transactionID = document["transaction_id"]
            documentDate = document["action_date"]
            # The document ID is the last path segment of the metadata link.
            documentID = document["links"]["document_metadata"].split("/")[-1]

            listOfTransactionIDs.append(transactionID)
            listOfDocumentDates.append(documentDate)
            listOfDocumentIDs.append(documentID)

    return listOfTransactionIDs, listOfDocumentIDs, listOfDocumentDates, currentRequestCount


**Using GetIDsForAnyDocumentType:**

In [16]:
# The API key is read from the environment so that the credential is not
# stored in (and shared along with) the notebook.
api_key = os.environ.get("COMPANIES_HOUSE_API_KEY")
if not api_key:
    raise RuntimeError("Set the COMPANIES_HOUSE_API_KEY environment variable to your Companies House API key.")
companyNumber = "07804652"
docType = "accounts"
numberOfDocuments = 2
currentRequestCount = 0
maxCountLimit = 599 
waitPeriod = 300

listOfTransactionIDs, listOfDocumentIDs, listOfDocumentDates, currentRequestCount= GetIDsForAnyDocumentType(api_key, companyNumber, docType, currentRequestCount, maxCountLimit, waitPeriod, numberOfDocuments) 

print("\n The transaction IDs were: \n")
print(listOfTransactionIDs)
print(len(listOfTransactionIDs))

print("\n The document IDs were: \n")
print(listOfDocumentIDs)
print(len(listOfDocumentIDs))

print("\n The document dates were: \n")
print(listOfDocumentDates)
print(len(listOfDocumentDates))


 The transaction IDs were: 

['MzM1Nzg0ODQ3M2FkaXF6a2N4', 'MzMwNDk4OTg2NWFkaXF6a2N4']
2

 The document IDs were: 

['9zs99kJx0a5dK9-76io44DV3JYbNuvLZLEBzbWR6em0', 'rLk31M6HV8LGjkooYqMxRpUwQd_daYuUfe-kCoPCgRI']
2

 The document dates were: 

['2022-03-31', '2021-03-31']
2


**GetXMLFile(api_key, documentID, documentName, currentRequestCount, maxCountLimit, waitPeriod)**  
Requests and saves the document associated with documentID into a file named documentName.

Input - api_key, documentID, documentName. 
- api_key - Use your own. 
- documentID - The document ID for the document of interest, obtained using the function GetIDsForAnyDocumentType.

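Output - currentRequestCount 
- The running count of API requests after this call; the document itself is not returned.
- The file is saved into the same directory as the code is running from, i.e. the current working directory.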

In [17]:
def GetXMLFile(api_key, documentID, documentName, currentRequestCount, maxCountLimit, waitPeriod):
    """Download a document's content as XHTML and write it to documentName.

    The response body is written in binary mode exactly as received. The
    request is counted through APIRateLimitHandler.

    Returns:
        The updated request count.
    """
    contentUrl = "/".join((
        "https://document-api.company-information.service.gov.uk/document",
        documentID,
        "content",
    ))
    documentResponse = requests.get(
        contentUrl,
        auth=(api_key, ''),
        headers={'Accept': 'application/xhtml+xml'},
    )
    updatedRequestCount = APIRateLimitHandler(currentRequestCount, maxCountLimit, waitPeriod)

    with open(documentName, 'wb') as outputFile:
        outputFile.write(documentResponse.content)
    return updatedRequestCount

In [18]:
def GetPDFFile(api_key, documentID, documentName, currentRequestCount, maxCountLimit, waitPeriod):
    """Download a document's content as PDF and write it to documentName.

    The response body is written in binary mode exactly as received. The
    request is counted through APIRateLimitHandler.

    Returns:
        The updated request count.
    """
    contentUrl = "/".join((
        "https://document-api.company-information.service.gov.uk/document",
        documentID,
        "content",
    ))
    documentResponse = requests.get(
        contentUrl,
        auth=(api_key, ''),
        headers={'Accept': 'application/pdf'},
    )
    updatedRequestCount = APIRateLimitHandler(currentRequestCount, maxCountLimit, waitPeriod)

    with open(documentName, 'wb') as outputFile:
        outputFile.write(documentResponse.content)
    return updatedRequestCount

**Using GetXMLFile:**  
(Using the document ID to get the document file from the filing history API.)

In [19]:
# The API key is read from the environment so that the credential is not
# stored in (and shared along with) the notebook.
api_key = os.environ.get("COMPANIES_HOUSE_API_KEY")
if not api_key:
    raise RuntimeError("Set the COMPANIES_HOUSE_API_KEY environment variable to your Companies House API key.")
# Uses the first document ID returned by the GetIDsForAnyDocumentType cell.
documentID = listOfDocumentIDs[0]
documentName = "Accounts.txt"
currentRequestCount = 0
maxCountLimit = 599 
waitPeriod = 300

currentRequestCount = GetXMLFile(api_key, documentID, documentName, currentRequestCount, maxCountLimit, waitPeriod)

**GetStringFromFile(filePath, targetString1, targetString2)**  
Returns the string of interest within the specified file. Specify the file, specify the point where the data is stored within the file, i.e. "cash in bank", and the subsequent line to then extract.

Input - filePath, targetString1[], targetString2.  
- filePath - the file of interest, e.g. Accounts.txt (XML file)
- targetString1[] - the part of the text from which to begin the specific search relating to targetString2. e.g. targetString1 = ">Cash at bank and in hand". At this point the function will then search for the first line containing an instance of targetString2 and then returns that line.
- targetString1 is an array and takes several search terms, since different size companies use different formats, such as with accounts, and so different search terms will be required. It will try each of them in turn.
- targetString2 - the pattern to search for the first instance of, starting from the location of the first instance of the pattern specified by targetString1.
 


Output - targetLine.  
- targetLine - The first line of text from the file that matches both the patterns specified by targetString1 and targetString2.
- This string can then be processed to extract the information of interest.  


- This function is for general purpose, whenever a single line is to be extracted from a text file and can be accurately specified by the above method of using targetString1 and targetString2.

In [20]:
def GetStringFromFile(filePath, targetString1, targetString2):
    """Return the first line near a section marker that contains targetString2.

    The file must start with "<?xml". The markers in targetString1 are tried in
    order. For each line containing the current marker, that line and the nine
    lines after it are searched for targetString2, and the first hit is
    returned.

    Parameters:
        filePath      - path of the UTF-8 text file to search.
        targetString1 - sequence of section markers, tried in the given order.
        targetString2 - text that the wanted line must contain.

    Returns:
        The matching line exactly as read from the file, or one of:
        "No File"          - the file is empty or does not start with "<?xml";
        "No Line Found"    - a marker was found, but targetString2 never
                             appeared within ten lines of any marker;
        "No Section Found" - none of the markers appears in the file.
    """
    linesToSearch = 10

    with open(filePath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # An empty file has no header line and is treated like a non-XML file.
    if not lines or lines[0].split(' ')[0] != "<?xml":
        return "No File"

    targetLine = "No Section Found"
    for sectionMarker in targetString1:
        # enumerate gives the true position of each marker line, even when the
        # same text occurs on several lines of the file.
        for lineIndex, line in enumerate(lines):
            if sectionMarker not in line:
                continue
            # Slicing stops at the end of the file, so a marker close to the
            # last line cannot run the search out of range.
            for candidate in lines[lineIndex:lineIndex + linesToSearch]:
                if targetString2 in candidate:
                    return candidate
            targetLine = "No Line Found"

    return targetLine

**ProcessLineCashAtBank(targetLine)**  
Returns the financial data for the cash in bank and in hand from the input string.

Input - targetLine.  
- targetLine - The string containing the financial information, obtained using the function GetStringFromFile.
 


Output - targetValue.  
- targetValue - The financial data for cash in bank and in hand, returned as an integer in units of GBP.

In [21]:
def ProcessLineCashAtBank(targetLine):
    """Turn a line returned by GetStringFromFile into an integer amount.

    The status strings "No File", "No Line Found" and "No Section Found" are
    returned unchanged. For any other line, the text between the last relevant
    '>' and the following '<' is taken, its commas are removed, and the result
    is converted with int().
    """
    if targetLine in ("No File", "No Line Found", "No Section Found"):
        return targetLine

    fragments = targetLine.split(">")
    # Drop the newline fragment left behind when the line ends with ">\n".
    if fragments[-1] == "\n":
        fragments.remove("\n")
    # Drop a closing "</div" fragment that follows the value's own closing tag.
    if "</div" in fragments[-1]:
        fragments.pop()

    amountText = fragments[-1].split("<")[0]
    return int(amountText.replace(",", ""))

**Using GetStringFromFile and ProcessLineCashAtBank**

In [22]:
# File written by the GetXMLFile cell above.
filePath = "Accounts.txt"
# Section markers, tried in the order listed.
targetString1 = [">Cash at bank and in hand<",">CAPITAL AND RESERVES<", "reserves"]
# Text that the line holding the value must contain.
targetString2 = "</ix:nonFraction>" 

# Locate the line, then convert it to an integer (or pass a status string through).
targetLine = GetStringFromFile(filePath, targetString1, targetString2)
targetValue = ProcessLineCashAtBank(targetLine)   

print("\nThe cash in bank and in hand value in GBP is: \n")
print(targetValue)
# Shows whether a number or a status string came back.
print(type(targetValue))


The cash in bank and in hand value in GBP is: 

21635
<class 'int'>
