# Scraping the Courts and Tribunals Judiciary website to collect Prevention of Future Deaths (PFD) reports

In [None]:
from requests import get
from requests import ConnectionError
from bs4 import BeautifulSoup
import re
from time import sleep
from time import time
import csv
import pandas as pd
from tqdm.auto import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def get_url(url):
    """Download ``url`` and return its body parsed into a BeautifulSoup tree.

    The request is made with ``verify=False``, i.e. without TLS certificate
    verification. The raw response bytes are handed to the "html.parser" backend.
    """
    page = get(url, verify=False)
    return BeautifulSoup(page.content, "html.parser")
def retries(record_url, tries=3):
    """Fetch and parse ``record_url``, retrying when the connection fails.

    Parameters
    ----------
    record_url : str
        Address of the page to download.
    tries : int, default 3
        Maximum number of attempts. A 2 second pause separates attempts.

    Returns
    -------
    The parsed page returned by ``get_url`` on the first successful attempt,
    or the sentinel string ``'Con error'`` when every attempt failed
    (also when ``tries`` is less than 1).
    """
    for i in range(tries):
        try:
            return get_url(record_url)
        # Only ConnectionError is caught: requests' SSLError is a subclass of its
        # ConnectionError, so SSL failures are covered too. The previous clause also
        # named SSLError, which was never imported, so any failure surfaced as
        # "name 'SSLError' is not defined" instead of being retried.
        except ConnectionError:
            if i < tries - 1:
                sleep(2)
    # Every attempt failed, or no attempt was allowed
    return 'Con error'

# Qingyang
import numpy as np

# Scraper starts here - last run started on Wednesday 22nd June 9:34am

To save time, simply upload record_urls from documents/Summer internship 22/ instead of actually scraping

In [None]:
# Load the previously saved record URLs; they sit in the second column of the file
record_urls_df = pd.read_csv("record_urls")
record_urls = record_urls_df.iloc[:, 1].tolist()

print(record_urls)


['https://www.judiciary.uk/publications/saifur-rahman-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/raymond-gillespie-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/michael-wysockyj-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/pauline-keen-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/sangeerth-girirathan-prevention-of-future-deaths-reports/', 'https://www.judiciary.uk/publications/hassan-zubair-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/aliny-godinho-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/matthew-evans-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/susan-carling-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/marjorie-grayson-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/connor-wellsted-prevention-of-fut

In [None]:
# max_page is the number of listing pages that exist on the website;
# range() stops one short of its second value, hence the +1 below

# This takes 3m -6m to run
max_page = 401
pages = range(1, max_page + 1)

# Visit every listing page and collect the link of each individual record on it
page_string = 'https://www.judiciary.uk/subject/prevention-of-future-deaths/page/{}/'
record_urls = []
for page in tqdm(pages):
    soup = get_url(page_string.format(str(page)))
    record_urls.extend(
        h5.a.get('href') for h5 in soup.find_all('h5', {'class': 'entry-title'})
    )

  0%|          | 0/401 [00:00<?, ?it/s]

Here we check how many records (i.e. cases) were pulled from the urls & the first and last case

In [None]:
# Number of record URLs currently held in record_urls
len(record_urls)
#4010 on 21st June 17:25 2022

4010

In [None]:
# First URL in the list
record_urls[0]

'https://www.judiciary.uk/publications/saifur-rahman-prevention-of-future-deaths-report/'

In [None]:
# Last URL in the list
record_urls[-1]

'https://www.judiciary.uk/publications/phillip-pratt/'

Here is my second loop. It goes through the list of URLs I just created above, visits each individual record, and pulls out and stores the text data (info on the deceased/case) and the PDF URL I will use later.

In [None]:
def error_details(e_dict, record_count, record_url, details):
    """Fill ``e_dict`` in place with one error entry and return it.

    The entry stores ``record_count`` under 'index', ``record_url`` under 'url'
    and ``details`` under 'reason'.
    """
    e_dict.update(index=record_count, url=record_url, reason=details)
    return e_dict

In [None]:
# Matches possessive/plural endings ("’s ", "s ", "'s ") so they can be replaced when tidying field names
reg_exp = re.compile(r"’s\s|s\s|'s\s")

# Field labels expected on a record page; text_cats later becomes column titles for df
text_cats = [
    'Date of report',
    'Ref',
    'Deceased name',
    'Coroner name',
    'Coroner Area',
    'Category',
    "This report is being sent to",
]

# Containers filled while looping over the record URLs:
# the text data, the PDF links, the refs seen so far and the errors encountered
record_text, pdf_urls, ref_list, error_catching = [], [], [], []

The code in the cell below takes 25-50m to run. 

In [None]:
# Visit every record URL, turn the "Field: value" paragraphs of the page into a dict
# (appended to record_text) and collect the page's PDF links (appended to pdf_urls).
# Anything that goes wrong for a record is logged in error_catching and the loop moves on.
record_count = 0
for record_url in tqdm(record_urls):
    # Per-record state is reset before the try so the except handler can always refer to
    # error_dict and p, even when the failure happens before any paragraph has been read
    error_dict = {}
    p = None
    try:
        #Calling the retries function
        soup = retries(record_url, tries=5)

        if soup == 'Con error':
            print(f"{record_url} could not connect")
            error_catching.append(error_details(error_dict, record_count, record_url, 'Connection Error'))
            continue

        #This gets all the text fields from the website to work with
        #A page without the 'entry-content' container is handled like a page without paragraphs
        entry_content = soup.find('div', {'class':'entry-content'})
        death_info = entry_content.find_all('p') if entry_content is not None else []

        if not death_info:
            print(f"{record_url} produced no data")
            error_catching.append(error_details(error_dict, record_count, record_url, 'No Text Loaded'))
            continue

        #Our dictionary that will hold all of the text information that we will eventually append to "record_text"
        blankdict = {}

        #This is to handle 1 annoying record with messed up html tags
        if record_url == 'https://www.judiciary.uk/publications/roadsafety/':
            strong = death_info[0].find_all('strong')
            heads = ['date_of_report', 'ref', 'deceased_name', 'coroner_name', 'coroner_area', 'category']
            for st, h in zip(strong,heads):
                blankdict[h] = st.next_sibling.replace(':','').replace('Ref','').strip()
        #And another record with wonky html: its values are entered by hand.
        #An earlier parsing branch for this same URL came first and made this manual
        #correction unreachable, so the manual values were never applied.
        elif record_url == 'https://www.judiciary.uk/publications/helen-sheath/':
            blankdict = {'date_of_report': '27 January 2020', 'ref': '2020-0107', 'deceased_name': 'Helen Sheath', 'coroner_name': 'Emma Whitting', 'coroner_area': 'Bedfordshire and Luton',
                         'category': 'Emergency Services related deaths; Mental Health related deaths; Other related deaths',
                         'this_report_is_being_sent_to': "Emergency Call Prioritisation Advisory Group (ECPAG); Association of Ambulance Chief Executives; National Association of Ambulance Medical Directors",
                         'url': record_url}
        else:
            #looping through all of the text categories for handling
            p_previous = ''

            for p in death_info:
                # Qingyang: concatenate p from the previous iteration with the current one.
                # .text never contains the <p> tags themselves, so plain whitespace stripping is
                # enough. (str.strip('<p>') removes the *characters* '<', 'p', '>' from both ends
                # and used to cut the final "p" off values such as "... Partnership".)
                p = str(p_previous) + p.text.strip()

                #This checks for blank fields and if there is nothing, it skips it
                if not p.strip():
                    pass

                # Qingyang: Need to account for a syntax error for five PFDs,
                # where items in their categories start on a new line so are mistaken as new headings

                elif p[-1]=='|': # if a string ends in "|" it's meant to be connected to the next line, so we need to merge two paragraphs

                    p_previous = p
                    print("|</p> accounted for")
                    continue
                #This checks for our "Normal" case in which a colon exists and the category is one of the ones we
                #pre-specified above in the "text_cats" list
                #We also need to account here for one strange record for "Rebecca Evans" which has a weird text error
                #That we manually correct for
                elif ':' in p and p.split(':')[0] in text_cats and not 'Rebecca-EvansR.pdf' in p:
                    #Simply assigning the key and value from strings on either side of the colon, making everything
                    #lower case and replacing spaces with underscores and also removing any stray semi-colons
                    text_list = p.split(':')
                    blankdict[text_list[0].strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','').replace('\xa0','')

                elif 'Rebecca-EvansR.pdf' in p:
                    #This deals with that singular odd record that currently exists as of 8 Nov 2019
                    blankdict['category'] = p.split(':')[1].strip().replace('\n','')

                elif ':' not in p:
                    #If the string doesn't have a colon, we can't split on it so have to get it into dictionary format
                    #Using an alternate method that counts the length of the thing
                    if any(x in p for x in text_cats):
                        t = [x for x in text_cats if x in p][0]
                        l = len(t)
                        blankdict[t.replace(' ','_').lower()] = p[l+1:].replace('\n','').replace('\xa0','')
                    elif 'Coroners Area' in p:
                        blankdict['coroner_area'] = p[13:].strip().replace('\n','').replace('\xa0','')
                    else:
                        print("Something we haven't accounted for has happened in " + str(record_count) + '. URL: ' + record_url)
                        # Qingyang: to better understand what that "Something" is, print the url out and the position in the list of urls

                elif p.strip().count(":") > 1:
                    #This corrects for one odd record in which there are 2 colons but should generalize to fix it for
                    #any time this could happen, so long as it happens in the same way
                    #Qingyang: changed p.text.strip().count(":")==2 to p.text.strip().count(":") >1
                    text_list = p.split(':')
                    new_string = text_list[0] + text_list[1]
                    new_name = re.sub(reg_exp, ' ', new_string).strip()
                    blankdict[new_name.replace(' ','_').lower()] = text_list[2].strip().replace('\n','').replace('\xa0','')

                elif ':' in p and p.split(':')[0] not in text_cats: # field names are not conventional and there are colons
                    #Some field names are in the form of "name_of_deceased" or "name_of_coroner" or are plural/
                    #possessive so this smashes those into our preferred naming formats
                    if 'Name of' in p:
                        all_text = p.split(':')
                        key_name = all_text[0].split(' ')
                        blankdict[key_name[2].strip() + '_name'] = all_text[-1].strip()
                    else:
                        text_list = p.split(':')
                        #Only the field name is normalised. Running the regex over the whole
                        #paragraph also rewrote the value (e.g. "James Bennett" -> "Jame Bennett")
                        field_name = re.sub(reg_exp, ' ', text_list[0])
                        blankdict[field_name.strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','').replace('\xa0','')
                p_previous = ''
        blankdict['url'] = record_url

        #A small little check for duplicated ref names
        try:
            if not blankdict['ref']:
                pass
            else:
                #Keep adding 'A' until the ref is unique, so a third duplicate does not collide with the second
                while blankdict['ref'] in ref_list:
                    blankdict['ref'] = blankdict['ref'] + 'A'
            ref_list.append(blankdict['ref'])
        except KeyError:
            blankdict['ref'] = ''

        #This appends the final dict to the list
        record_text.append(blankdict)

        #this is a seperate process to get the PDF URLs (no matter how many there are) and adds them to their own list
        urls = soup.find_all('li', {'class':'pdf'})
        for url in urls:
            pdf_urls.append(url.findNext('a').get('href'))

    except Exception as e:
        error_desc = f"{str(e)} occurred for {record_url} when trying to work with {p}"
        print(error_desc)
        error_catching.append(error_details(error_dict, record_count, record_url, error_desc))

        #Saving this in case we don't like the error catching.
        #import sys
        #raise type(e)(str(e) + '\n' + 'Error for Record: {}, Field: {}'.format(record_url, p)).with_traceback(sys.exc_info()[2])
    finally:
        #Runs after success, after `continue` and after an error, so record_count always
        #matches the position of record_url in record_urls (it used to lag after an exception)
        record_count += 1

  0%|          | 0/4010 [00:00<?, ?it/s]

|</p> accounted for
|</p> accounted for
|</p> accounted for
|</p> accounted for
|</p> accounted for
https://www.judiciary.uk/publications/railwayrelateddeaths/ produced no data
https://www.judiciary.uk/publications/service-personnel-deaths/ produced no data
https://www.judiciary.uk/publications/product/ produced no data
https://www.judiciary.uk/publications/policerelateddeaths/ produced no data
https://www.judiciary.uk/publications/carehomehealth/ produced no data
https://www.judiciary.uk/publications/statecustodydeath/ produced no data
https://www.judiciary.uk/publications/hospitaldeath/ produced no data
https://www.judiciary.uk/publications/drugsalcoholmedication/ produced no data
https://www.judiciary.uk/publications/commhealthcareemergencyservices/ produced no data


Qingyang: There are five new unaccounted-for errors (184, 284, 319, 513, 554). The error is the same in all five: the list of categories is so long that it continues in a new paragraph, and the code mistakes that continuation for a new header. Because this supposed header has no colon, it ends up in the "Something we haven't accounted for" case.

An "SSLError is not defined" error occurred for two links:

https://www.judiciary.uk/publications/ben-king-prevention-of-future-deaths-report/


https://www.judiciary.uk/publications/steven-allen/


5 "Something we haven't accounted for has happened"

In record_urls[184] [284] [319] [515] [555] - now fixed

9 "Produced no data"

# Problem: With hindsight, there is an entry (coroner was Emma Whitting) that actually was able to pull the date data, but was not able to pull any other data. Date 27th January 2020: https://www.judiciary.uk/publications/helen-sheath/

In [None]:
#Any errors should print out above, but you can also check the error_catching list
#(one dict per failed record, holding its index, url and reason)
#Here we just turn it into a dataframe quickly to easily view

error_df = pd.DataFrame(error_catching)
error_df

Unnamed: 0,index,url,reason
0,3939,https://www.judiciary.uk/publications/railwayr...,No Text Loaded
1,3940,https://www.judiciary.uk/publications/service-...,No Text Loaded
2,3941,https://www.judiciary.uk/publications/product/,No Text Loaded
3,3942,https://www.judiciary.uk/publications/policere...,No Text Loaded
4,3943,https://www.judiciary.uk/publications/carehome...,No Text Loaded
5,3944,https://www.judiciary.uk/publications/statecus...,No Text Loaded
6,3947,https://www.judiciary.uk/publications/hospital...,No Text Loaded
7,4005,https://www.judiciary.uk/publications/drugsalc...,No Text Loaded
8,4006,https://www.judiciary.uk/publications/commheal...,No Text Loaded


This is my final step that puts the text data (info on the deceased/case) into a csv file & adds the date it was pulled

In [None]:
from datetime import date

# Expected columns of the output file, in output order
headers = ['date_of_report', 'date_of_reports', 'ref', 'deceased_name', 'deceased_names', 'coroner_name', 'coroner_area', 'category', 'this_report_is_being_sent_to', 'these_report_are_being_sent_to', 'these_report_have_been_sent_to', 'url']

# csv.DictWriter raises ValueError when a row holds a key that is not in fieldnames,
# which would abort the export after the long scrape. Any field name the parser
# produced that is not listed above is therefore written as an extra column at the end.
extra_headers = sorted({key for record in record_text for key in record} - set(headers))
if extra_headers:
    print(f"Unexpected fields written as extra columns: {extra_headers}")

# The file name carries the date of the run, e.g. death_info_2022-06-22.csv
with open('death_info_{}.csv'.format(date.today()), 'w', newline='', encoding='utf-8') as deaths_csv:
    writer = csv.DictWriter(deaths_csv, fieldnames=headers + extra_headers)
    writer.writeheader()
    # Empty records are skipped
    writer.writerows(record for record in record_text if record)

In [None]:
from datetime import date

# Read back the file written by the export step. The name is built the same way as
# there (death_info_<today>.csv) instead of a hard-coded date, which only worked on
# the day of the original run. To inspect an older export, put its file name here.
df = pd.read_csv('death_info_{}.csv'.format(date.today()))
display(df.head())
print(df.shape)

Unnamed: 0,date_of_report,date_of_reports,ref,deceased_name,deceased_names,coroner_name,coroner_area,category,this_report_is_being_sent_to,these_report_are_being_sent_to,these_report_have_been_sent_to,url
0,26 May 2022,,2022-0155,Saifur Rahman,,James Bennett,Birmingham and Solihull,State Custody related deaths | Mental Health r...,Ministry of Justice and Birmingham and Solihul...,,,https://www.judiciary.uk/publications/saifur-r...
1,25 May 2022,,2022-0154,Raymond Gillespie,,Kate Sutherland,North Wales (East & Central),Emergency services related deaths (2019 onward...,Welsh Ambulance NHS Foundation Trust and Betsi...,,,https://www.judiciary.uk/publications/raymond-...
2,24 May 2022,,2022-0153,Michael Wysockyj,,Jacqueline Lake,Norfolk,Hospital Death (Clinical Procedures and medica...,Queen Elizabeth Hospital King’s Lynn NHS Found...,,,https://www.judiciary.uk/publications/michael-...
3,12 May 2022,,2022-0152,Pauline Keen,,Joanne Andrews,North East Kent,Hospital Death (Clinical Procedures and medica...,Kent and Medway NHS Social Care Partnership Tr...,,,https://www.judiciary.uk/publications/pauline-...
4,,19 May 2022,2022-0151,Sangeerth Girirathan,,Tom Osborne,Milton Keynes,Hospital Death (Clinical Procedures and medica...,,,Milton Keyne University Hospital and Secretary...,https://www.judiciary.uk/publications/sangeert...


(4001, 12)


# Scraping the Courts and Tribunals Judiciary website to collect Prevention of Future Deaths (PFD) reports

In [None]:
from requests import get
from requests import ConnectionError
from bs4 import BeautifulSoup
import re
from time import sleep
from time import time
import csv
import pandas as pd
from tqdm.auto import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def get_url(url):
    """Download ``url`` and return its body parsed into a BeautifulSoup tree.

    The request is made with ``verify=False``, i.e. without TLS certificate
    verification. The raw response bytes are handed to the "html.parser" backend.
    """
    page = get(url, verify=False)
    return BeautifulSoup(page.content, "html.parser")
def retries(record_url, tries=3):
    """Fetch and parse ``record_url``, retrying when the connection fails.

    Parameters
    ----------
    record_url : str
        Address of the page to download.
    tries : int, default 3
        Maximum number of attempts. A 2 second pause separates attempts.

    Returns
    -------
    The parsed page returned by ``get_url`` on the first successful attempt,
    or the sentinel string ``'Con error'`` when every attempt failed
    (also when ``tries`` is less than 1).
    """
    for i in range(tries):
        try:
            return get_url(record_url)
        # Only ConnectionError is caught: requests' SSLError is a subclass of its
        # ConnectionError, so SSL failures are covered too. The previous clause also
        # named SSLError, which was never imported, so any failure surfaced as
        # "name 'SSLError' is not defined" instead of being retried.
        except ConnectionError:
            if i < tries - 1:
                sleep(2)
    # Every attempt failed, or no attempt was allowed
    return 'Con error'

# Qingyang
import numpy as np

# Scraper starts here - last run started on Wednesday 22nd June 9:34am

To save time, simply upload record_urls from documents/Summer internship 22/ instead of actually scraping

In [None]:
# Load the previously saved record URLs; they sit in the second column of the file
record_urls_df = pd.read_csv("record_urls")
record_urls = record_urls_df.iloc[:, 1].tolist()

print(record_urls)


['https://www.judiciary.uk/publications/saifur-rahman-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/raymond-gillespie-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/michael-wysockyj-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/pauline-keen-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/sangeerth-girirathan-prevention-of-future-deaths-reports/', 'https://www.judiciary.uk/publications/hassan-zubair-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/aliny-godinho-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/matthew-evans-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/susan-carling-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/marjorie-grayson-prevention-of-future-deaths-report/', 'https://www.judiciary.uk/publications/connor-wellsted-prevention-of-fut

In [None]:
# max_page is the number of listing pages that exist on the website;
# range() stops one short of its second value, hence the +1 below

# This takes 3m -6m to run
max_page = 401
pages = range(1, max_page + 1)

# Visit every listing page and collect the link of each individual record on it
page_string = 'https://www.judiciary.uk/subject/prevention-of-future-deaths/page/{}/'
record_urls = []
for page in tqdm(pages):
    soup = get_url(page_string.format(str(page)))
    record_urls.extend(
        h5.a.get('href') for h5 in soup.find_all('h5', {'class': 'entry-title'})
    )

  0%|          | 0/401 [00:00<?, ?it/s]

Here we check how many records (i.e. cases) were pulled from the urls & the first and last case

In [None]:
# Number of record URLs currently held in record_urls
len(record_urls)
#4010 on 21st June 17:25 2022

4010

In [None]:
# First URL in the list
record_urls[0]

'https://www.judiciary.uk/publications/saifur-rahman-prevention-of-future-deaths-report/'

In [None]:
# Last URL in the list
record_urls[-1]

'https://www.judiciary.uk/publications/phillip-pratt/'

Here is my second loop. It goes through the list of URLs I just created above, visits each individual record, and pulls out and stores the text data (info on the deceased/case) and the PDF URL I will use later.

In [None]:
def error_details(e_dict, record_count, record_url, details):
    """Fill ``e_dict`` in place with one error entry and return it.

    The entry stores ``record_count`` under 'index', ``record_url`` under 'url'
    and ``details`` under 'reason'.
    """
    e_dict.update(index=record_count, url=record_url, reason=details)
    return e_dict

In [None]:
# Matches possessive/plural endings ("’s ", "s ", "'s ") so they can be replaced when tidying field names
reg_exp = re.compile(r"’s\s|s\s|'s\s")

# Field labels expected on a record page; text_cats later becomes column titles for df
text_cats = [
    'Date of report',
    'Ref',
    'Deceased name',
    'Coroner name',
    'Coroner Area',
    'Category',
    "This report is being sent to",
]

# Containers filled while looping over the record URLs:
# the text data, the PDF links, the refs seen so far and the errors encountered
record_text, pdf_urls, ref_list, error_catching = [], [], [], []

The code in the cell below takes 25-50m to run. 

In [None]:
# Visit every record URL, turn the "Field: value" paragraphs of the page into a dict
# (appended to record_text) and collect the page's PDF links (appended to pdf_urls).
# Anything that goes wrong for a record is logged in error_catching and the loop moves on.
record_count = 0
for record_url in tqdm(record_urls):
    # Per-record state is reset before the try so the except handler can always refer to
    # error_dict and p, even when the failure happens before any paragraph has been read
    error_dict = {}
    p = None
    try:
        #Calling the retries function
        soup = retries(record_url, tries=5)

        if soup == 'Con error':
            print(f"{record_url} could not connect")
            error_catching.append(error_details(error_dict, record_count, record_url, 'Connection Error'))
            continue

        #This gets all the text fields from the website to work with
        #A page without the 'entry-content' container is handled like a page without paragraphs
        entry_content = soup.find('div', {'class':'entry-content'})
        death_info = entry_content.find_all('p') if entry_content is not None else []

        if not death_info:
            print(f"{record_url} produced no data")
            error_catching.append(error_details(error_dict, record_count, record_url, 'No Text Loaded'))
            continue

        #Our dictionary that will hold all of the text information that we will eventually append to "record_text"
        blankdict = {}

        #This is to handle 1 annoying record with messed up html tags
        if record_url == 'https://www.judiciary.uk/publications/roadsafety/':
            strong = death_info[0].find_all('strong')
            heads = ['date_of_report', 'ref', 'deceased_name', 'coroner_name', 'coroner_area', 'category']
            for st, h in zip(strong,heads):
                blankdict[h] = st.next_sibling.replace(':','').replace('Ref','').strip()
        #And another record with wonky html: its values are entered by hand.
        #An earlier parsing branch for this same URL came first and made this manual
        #correction unreachable, so the manual values were never applied.
        elif record_url == 'https://www.judiciary.uk/publications/helen-sheath/':
            blankdict = {'date_of_report': '27 January 2020', 'ref': '2020-0107', 'deceased_name': 'Helen Sheath', 'coroner_name': 'Emma Whitting', 'coroner_area': 'Bedfordshire and Luton',
                         'category': 'Emergency Services related deaths; Mental Health related deaths; Other related deaths',
                         'this_report_is_being_sent_to': "Emergency Call Prioritisation Advisory Group (ECPAG); Association of Ambulance Chief Executives; National Association of Ambulance Medical Directors",
                         'url': record_url}
        else:
            #looping through all of the text categories for handling
            p_previous = ''

            for p in death_info:
                # Qingyang: concatenate p from the previous iteration with the current one.
                # .text never contains the <p> tags themselves, so plain whitespace stripping is
                # enough. (str.strip('<p>') removes the *characters* '<', 'p', '>' from both ends
                # and used to cut the final "p" off values such as "... Partnership".)
                p = str(p_previous) + p.text.strip()

                #This checks for blank fields and if there is nothing, it skips it
                if not p.strip():
                    pass

                # Qingyang: Need to account for a syntax error for five PFDs,
                # where items in their categories start on a new line so are mistaken as new headings

                elif p[-1]=='|': # if a string ends in "|" it's meant to be connected to the next line, so we need to merge two paragraphs

                    p_previous = p
                    print("|</p> accounted for")
                    continue
                #This checks for our "Normal" case in which a colon exists and the category is one of the ones we
                #pre-specified above in the "text_cats" list
                #We also need to account here for one strange record for "Rebecca Evans" which has a weird text error
                #That we manually correct for
                elif ':' in p and p.split(':')[0] in text_cats and not 'Rebecca-EvansR.pdf' in p:
                    #Simply assigning the key and value from strings on either side of the colon, making everything
                    #lower case and replacing spaces with underscores and also removing any stray semi-colons
                    text_list = p.split(':')
                    blankdict[text_list[0].strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','').replace('\xa0','')

                elif 'Rebecca-EvansR.pdf' in p:
                    #This deals with that singular odd record that currently exists as of 8 Nov 2019
                    blankdict['category'] = p.split(':')[1].strip().replace('\n','')

                elif ':' not in p:
                    #If the string doesn't have a colon, we can't split on it so have to get it into dictionary format
                    #Using an alternate method that counts the length of the thing
                    if any(x in p for x in text_cats):
                        t = [x for x in text_cats if x in p][0]
                        l = len(t)
                        blankdict[t.replace(' ','_').lower()] = p[l+1:].replace('\n','').replace('\xa0','')
                    elif 'Coroners Area' in p:
                        blankdict['coroner_area'] = p[13:].strip().replace('\n','').replace('\xa0','')
                    else:
                        print("Something we haven't accounted for has happened in " + str(record_count) + '. URL: ' + record_url)
                        # Qingyang: to better understand what that "Something" is, print the url out and the position in the list of urls

                elif p.strip().count(":") > 1:
                    #This corrects for one odd record in which there are 2 colons but should generalize to fix it for
                    #any time this could happen, so long as it happens in the same way
                    #Qingyang: changed p.text.strip().count(":")==2 to p.text.strip().count(":") >1
                    text_list = p.split(':')
                    new_string = text_list[0] + text_list[1]
                    new_name = re.sub(reg_exp, ' ', new_string).strip()
                    blankdict[new_name.replace(' ','_').lower()] = text_list[2].strip().replace('\n','').replace('\xa0','')

                elif ':' in p and p.split(':')[0] not in text_cats: # field names are not conventional and there are colons
                    #Some field names are in the form of "name_of_deceased" or "name_of_coroner" or are plural/
                    #possessive so this smashes those into our preferred naming formats
                    if 'Name of' in p:
                        all_text = p.split(':')
                        key_name = all_text[0].split(' ')
                        blankdict[key_name[2].strip() + '_name'] = all_text[-1].strip()
                    else:
                        text_list = p.split(':')
                        #Only the field name is normalised. Running the regex over the whole
                        #paragraph also rewrote the value (e.g. "James Bennett" -> "Jame Bennett")
                        field_name = re.sub(reg_exp, ' ', text_list[0])
                        blankdict[field_name.strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','').replace('\xa0','')
                p_previous = ''
        blankdict['url'] = record_url

        #A small little check for duplicated ref names
        try:
            if not blankdict['ref']:
                pass
            else:
                #Keep adding 'A' until the ref is unique, so a third duplicate does not collide with the second
                while blankdict['ref'] in ref_list:
                    blankdict['ref'] = blankdict['ref'] + 'A'
            ref_list.append(blankdict['ref'])
        except KeyError:
            blankdict['ref'] = ''

        #This appends the final dict to the list
        record_text.append(blankdict)

        #this is a seperate process to get the PDF URLs (no matter how many there are) and adds them to their own list
        urls = soup.find_all('li', {'class':'pdf'})
        for url in urls:
            pdf_urls.append(url.findNext('a').get('href'))

    except Exception as e:
        error_desc = f"{str(e)} occurred for {record_url} when trying to work with {p}"
        print(error_desc)
        error_catching.append(error_details(error_dict, record_count, record_url, error_desc))

        #Saving this in case we don't like the error catching.
        #import sys
        #raise type(e)(str(e) + '\n' + 'Error for Record: {}, Field: {}'.format(record_url, p)).with_traceback(sys.exc_info()[2])
    finally:
        #Runs after success, after `continue` and after an error, so record_count always
        #matches the position of record_url in record_urls (it used to lag after an exception)
        record_count += 1

  0%|          | 0/4010 [00:00<?, ?it/s]

|</p> accounted for
|</p> accounted for
|</p> accounted for
|</p> accounted for
|</p> accounted for
https://www.judiciary.uk/publications/railwayrelateddeaths/ produced no data
https://www.judiciary.uk/publications/service-personnel-deaths/ produced no data
https://www.judiciary.uk/publications/product/ produced no data
https://www.judiciary.uk/publications/policerelateddeaths/ produced no data
https://www.judiciary.uk/publications/carehomehealth/ produced no data
https://www.judiciary.uk/publications/statecustodydeath/ produced no data
https://www.judiciary.uk/publications/hospitaldeath/ produced no data
https://www.judiciary.uk/publications/drugsalcoholmedication/ produced no data
https://www.judiciary.uk/publications/commhealthcareemergencyservices/ produced no data


Qingyang: There are five new unaccounted-for errors (184, 284, 319, 513, 554). The error is the same in all five: the list of categories is so long that it continues in a new paragraph, and the code mistakes that continuation for a new header. Because this supposed header has no colon, it ends up in the "Something we haven't accounted for" case.

An "SSLError is not defined" error occurred for two links:

https://www.judiciary.uk/publications/ben-king-prevention-of-future-deaths-report/


https://www.judiciary.uk/publications/steven-allen/


5 "Something we haven't accounted for has happened"

In record_urls[184] [284] [319] [515] [555] - now fixed

9 "Produced no data": these are just empty pages for 9 categories of deaths

In [None]:
#Any errors should print out above, but you can also check the error_catching list
#(one dict per failed record, holding its index, url and reason)
#Here we just turn it into a dataframe quickly to easily view

error_df = pd.DataFrame(error_catching)
error_df

Unnamed: 0,index,url,reason
0,3939,https://www.judiciary.uk/publications/railwayr...,No Text Loaded
1,3940,https://www.judiciary.uk/publications/service-...,No Text Loaded
2,3941,https://www.judiciary.uk/publications/product/,No Text Loaded
3,3942,https://www.judiciary.uk/publications/policere...,No Text Loaded
4,3943,https://www.judiciary.uk/publications/carehome...,No Text Loaded
5,3944,https://www.judiciary.uk/publications/statecus...,No Text Loaded
6,3947,https://www.judiciary.uk/publications/hospital...,No Text Loaded
7,4005,https://www.judiciary.uk/publications/drugsalc...,No Text Loaded
8,4006,https://www.judiciary.uk/publications/commheal...,No Text Loaded


This is my final step that puts the text data (info on the deceased/case) into a csv file & adds the date it was pulled

In [None]:
from datetime import date

# Expected columns of the output file, in output order
headers = ['date_of_report', 'date_of_reports', 'ref', 'deceased_name', 'deceased_names', 'coroner_name', 'coroner_area', 'category', 'this_report_is_being_sent_to', 'these_report_are_being_sent_to', 'these_report_have_been_sent_to', 'url']

# csv.DictWriter raises ValueError when a row holds a key that is not in fieldnames,
# which would abort the export after the long scrape. Any field name the parser
# produced that is not listed above is therefore written as an extra column at the end.
extra_headers = sorted({key for record in record_text for key in record} - set(headers))
if extra_headers:
    print(f"Unexpected fields written as extra columns: {extra_headers}")

# The file name carries the date of the run, e.g. death_info_2022-06-22.csv
with open('death_info_{}.csv'.format(date.today()), 'w', newline='', encoding='utf-8') as deaths_csv:
    writer = csv.DictWriter(deaths_csv, fieldnames=headers + extra_headers)
    writer.writeheader()
    # Empty records are skipped
    writer.writerows(record for record in record_text if record)

In [None]:
from datetime import date

# Read back the file written by the export step. The name is built the same way as
# there (death_info_<today>.csv) instead of a hard-coded date, which only worked on
# the day of the original run. To inspect an older export, put its file name here.
df = pd.read_csv('death_info_{}.csv'.format(date.today()))
display(df.head())
print(df.shape)

Unnamed: 0,date_of_report,date_of_reports,ref,deceased_name,deceased_names,coroner_name,coroner_area,category,this_report_is_being_sent_to,these_report_are_being_sent_to,these_report_have_been_sent_to,url
0,26 May 2022,,2022-0155,Saifur Rahman,,James Bennett,Birmingham and Solihull,State Custody related deaths | Mental Health r...,Ministry of Justice and Birmingham and Solihul...,,,https://www.judiciary.uk/publications/saifur-r...
1,25 May 2022,,2022-0154,Raymond Gillespie,,Kate Sutherland,North Wales (East & Central),Emergency services related deaths (2019 onward...,Welsh Ambulance NHS Foundation Trust and Betsi...,,,https://www.judiciary.uk/publications/raymond-...
2,24 May 2022,,2022-0153,Michael Wysockyj,,Jacqueline Lake,Norfolk,Hospital Death (Clinical Procedures and medica...,Queen Elizabeth Hospital King’s Lynn NHS Found...,,,https://www.judiciary.uk/publications/michael-...
3,12 May 2022,,2022-0152,Pauline Keen,,Joanne Andrews,North East Kent,Hospital Death (Clinical Procedures and medica...,Kent and Medway NHS Social Care Partnership Tr...,,,https://www.judiciary.uk/publications/pauline-...
4,,19 May 2022,2022-0151,Sangeerth Girirathan,,Tom Osborne,Milton Keynes,Hospital Death (Clinical Procedures and medica...,,,Milton Keyne University Hospital and Secretary...,https://www.judiciary.uk/publications/sangeert...


(4001, 12)
