# Scraping the Gazette for insolvency notices (sequential)

Search results are at https://m.thegazette.co.uk/insolvency?categorycode=G205010000&results-page=1 for company insolvencies. Personal insolvencies can be searched separately.

Each notice has a number like so: https://m.thegazette.co.uk/notice/3552846

Simply cycling through the notice numbers catches all types of notice, including both business and personal ones.

## Install the libraries

Firstly we install scraperwiki and some other libraries we might need.

In [None]:
#Install libraries
!pip install scraperwiki
import scraperwiki
import lxml.html
!pip install cssselect
import cssselect

In [None]:
#import pandas
import pandas as pd

## Scrape a single page

Next we see if we can scrape a single page.

In [None]:
#The notice URL we want to work back from (used by the single-page test below)
latesturl = 'https://m.thegazette.co.uk/notice/3391189'

In [None]:
print("scraping",latesturl)
#Fetch the raw HTML for that url
html = scraperwiki.scrape(latesturl)
#Parse it into an lxml object, making it easier to drill into
root = lxml.html.fromstring(html)

#The basic info is all in <div class="notice-data"> and then <dd>
noticedata = root.cssselect('div.notice-data dd')
#The first three values are the notice category, the notice type and the publication date
noticecategory = noticedata[0].text_content()
noticetype = noticedata[1].text_content()
#The publication date is scraped here but not added to the record in this cell
pubdate = noticedata[2].text_content()
print(noticecategory, noticetype)

#The notice codes provide further detail
#See https://www.thegazette.co.uk/noticecodes
#We need to identify which <dt> has 'Notice code:' to scrape the matching <dd>
noticedlabels = root.cssselect('div.notice-data dt')
#Set a default value first in case no label matches
noticecode = "NO DATA"
#Pair each label with the value at the same position. zip stops at the
#shorter list, so a label without a matching value cannot raise an IndexError
for label, value in zip(noticedlabels, noticedata):
  if label.text_content() == 'Notice code:':
    noticecode = value.text_content()

#Other data is in helpful data-gazettes attributes like so:
#(an empty list of matches means the page does not have that field)
companynames = root.cssselect('[data-gazettes="CompanyName"]')
companyname = companynames[0].text_content() if companynames else "NO DATA"

compnums = root.cssselect('[data-gazettes="CompanyNumber"]')
compnum = compnums[0].text_content() if compnums else "NO DATA"

addresses = root.cssselect('[data-gazettes="CompanyRegisteredOffice"]')
address = addresses[0].text_content() if addresses else "NO DATA"

typesofliq = root.cssselect('[data-gazettes="TypeOfLiquidation"]')
typeofliq = typesofliq[0].text_content() if typesofliq else "NO DATA"

#Create a dictionary, with each field stored exactly once
record = {
  'latesturl': latesturl,
  'noticecategory': noticecategory,
  'noticetype': noticetype,
  'noticecode': noticecode,
  'compnum': compnum,
  'companyname': companyname,
  'address': address,
  'typeofliq': typeofliq,
}

print(record)

scraping https://m.thegazette.co.uk/notice/3391189
Corporate Insolvency Appointment of Liquidators
{'latesturl': 'https://m.thegazette.co.uk/notice/3391189', 'noticecategory': 'Corporate Insolvency', 'noticetype': 'Appointment of Liquidators', 'noticecode': '2432', 'compnum': '07534757', 'companyname': 'KERR ADVISORY LIMITED', 'address': 'Registered office: Westminster Business Centre, 10 Great North Way, Nether Poppleton, York, YO26 6RB', 'typeofliq': "Type of Liquidation: Members' Voluntary Liquidation"}


## Create a function

That works, so let's store it in a function. We also add some lines for personal insolvency.

In [None]:
def _firsttext(root, selector, default="NO DATA"):
  """Return the text of the first element matching selector, or default if none match."""
  matches = root.cssselect(selector)
  if matches:
    return matches[0].text_content()
  return default


def scrapedetail(url):
  """Scrape one Gazette notice page and return its details as a dictionary.

  Args:
    url: Address of the notice page to fetch.

  Returns:
    A dictionary with the keys 'url', 'noticecategory', 'noticetype',
    'noticecode', 'pubdate', 'compnum', 'companyname', 'address', 'dob',
    'persdetail' and 'typeofliq'. Any field from 'noticecode' onwards that
    is not found on the page, other than 'pubdate', is set to "NO DATA".

  Raises:
    IndexError: If the page has fewer than three <dd> values inside
      <div class="notice-data">.
    Errors raised by scraperwiki.scrape or lxml are not caught here.
  """
  print("scraping",url)
  #Fetch the raw HTML and parse it into an lxml object, making it easier to drill into
  html = scraperwiki.scrape(url)
  root = lxml.html.fromstring(html)
  #The basic info is all in <div class="notice-data"> and then <dd>
  noticedata = root.cssselect('div.notice-data dd')
  #The first, second and third items contain data we store
  noticecategory = noticedata[0].text_content()
  noticetype = noticedata[1].text_content()
  pubdate = noticedata[2].text_content()
  #The notice codes provide further detail
  #See https://www.thegazette.co.uk/noticecodes
  #We need to identify which <dt> has 'Notice code:' to scrape the matching <dd>
  noticedlabels = root.cssselect('div.notice-data dt')
  #Set a default value first in case no label matches
  noticecode = "NO DATA"
  #Pair each label with the value at the same position. zip stops at the
  #shorter list, so a label without a matching value cannot raise an IndexError
  for label, value in zip(noticedlabels, noticedata):
    if label.text_content() == 'Notice code:':
      noticecode = value.text_content()
  #Create a dictionary
  record = {
    'url': url,
    'noticecategory': noticecategory,
    'noticetype': noticetype,
    'noticecode': noticecode,
    'pubdate': pubdate,
  }
  #Other data is in helpful data-gazettes attributes.
  #The last three selectors cover personal insolvency details and the liquidation type
  fieldselectors = (
    ('compnum', '[data-gazettes="CompanyNumber"]'),
    ('companyname', '[data-gazettes="CompanyName"]'),
    ('address', '[data-gazettes="CompanyRegisteredOffice"]'),
    ('dob', '[data-gazettes="BirthDetails"]'),
    ('persdetail', '[data-gazettes="PersonDetails"]'),
    ('typeofliq', '[data-gazettes="TypeOfLiquidation"]'),
  )
  for key, selector in fieldselectors:
    record[key] = _firsttext(root, selector)
  return record

Now to run that.

In [None]:
testrecord = scrapedetail("https://m.thegazette.co.uk/notice/3391189")
print(testrecord)

scraping https://m.thegazette.co.uk/notice/3391189
{'url': 'https://m.thegazette.co.uk/notice/3391189', 'noticecategory': 'Corporate Insolvency', 'noticetype': 'Appointment of Liquidators', 'noticecode': '2432', 'pubdate': '23 September 2019', 'compnum': '07534757', 'companyname': 'KERR ADVISORY LIMITED', 'address': 'Registered office: Westminster Business Centre, 10 Great North Way, Nether Poppleton, York, YO26 6RB', 'dob': 'NO DATA', 'persdetail': 'NO DATA', 'typeofliq': "Type of Liquidation: Members' Voluntary Liquidation"}


So that works. Now to create a loop and prepare to save the results.

## Create a loop

The last notice number is 3391187, so we need to start from there and go backwards. We will try going back 10 notices to begin with.

The latest notice is 3581867, so we'll go back from there too; the next two code cells use that number as their starting point.

In [None]:
#Preview the notice URLs we are about to scrape, counting down from the starting number
startnum = 3581867
endnum = startnum-10
#A step of -1 makes range count backwards; endnum itself is not included
for noticenum in range(startnum, endnum, -1):
  #format() converts the number to text as it builds the URL
  print('https://m.thegazette.co.uk/notice/{}'.format(noticenum))

Now to run the function on each URL:

In [None]:
startnum = 3581867
endnum = startnum-5000
#When going backwards you have to add a negative 'step' argument
for i in range(startnum, endnum, -1):
  #We have to convert to string to add to URL
  noticeurl = 'https://m.thegazette.co.uk/notice/'+str(i)
  #Some URLs are broken so to stop those breaking the scraper we use try/except.
  #Catching Exception rather than using a bare except means that
  #interrupting the kernel still stops the loop
  try:
    scraperesult = scrapedetail(noticeurl)
  except Exception as err:
    #Report which notice failed and why, then carry on with the next one
    print("problem", noticeurl, repr(err))
  else:
    print(scraperesult)

## Store the results

Earlier we imported pandas - now we use it to store the results.

In [None]:
#Import the Colab module used to download files
from google.colab import files

In [None]:
#Collect each scraped record in a list and build the dataframe once at the end.
#DataFrame.append was removed in pandas 2.0, and growing a dataframe row by row is slow
records = []

startnum = 3391187-50000
endnum = startnum-10000
#When going backwards you have to add a negative 'step' argument
for i in range(startnum, endnum, -1):
  #We have to convert to string to add to URL
  noticeurl = 'https://m.thegazette.co.uk/notice/'+str(i)
  #Some URLs are broken so to stop those breaking the scraper we use try/except.
  #Catching Exception rather than using a bare except means that
  #interrupting the kernel still stops the loop
  try:
    records.append(scrapedetail(noticeurl))
  except Exception as err:
    #Report which notice failed and why, then carry on with the next one
    print("problem", noticeurl, repr(err))

#Create a dataframe to store data: one row per notice, one column per record key.
#The empty "notices" placeholder column is no longer created
df = pd.DataFrame(records)
#Print the dataframe to check
#print(df)
#Export to csv
df.to_csv("notices.csv")
#Download it automatically in case the runtime disconnects before we check and do it manually
files.download('notices.csv')