In [1]:
import requests
import urllib.request
import time
from pathlib import Path
from bs4 import BeautifulSoup
import re
import csv
from datetime import datetime, timedelta

### Functions


In [2]:
# open a file and read the last line, so we can calculate changing values 
def get_last_line (filename):
    data = []
    data_directory = Path("data/processed/")
    file_to_open = data_directory / filename

    with open(file_to_open) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            data.append(row)
    return data[-1]


In [3]:
# open a file for writing and add a line to the end of it
def write_to_file(filename, values):
    #print (filename)
    #print (values)

    #hack to deal with empty strings
    out_list = []
    for x in values:
        if x == " ":
            out_list.append("")
        else:
            out_list.append (x)
    
    data_directory = Path("data/processed/")
    file_to_write = data_directory / filename
    
    with open(file_to_write, 'a') as f:
        writer = csv.writer(f)
        #writer.writerow(values)
        writer.writerow(out_list)
    

### Get the current date and yesterday as strings

In [4]:
#set two global variables for later (re)use
now = datetime.now()
cur_dat = now.strftime("%d-%b-%Y")

#today_date = datetime.strptime(cur_dat, '%d-%b-%Y')
yd = datetime.today() - timedelta(days=1)
yest_dat = yd.strftime("%d-%b-%Y")


### Get the external data for today
Get the text of our target page

In [5]:
url = 'https://www.gov.scot/publications/coronavirus-covid-19-tests-and-cases-in-scotland/'
response = requests.get(url)

Where will we store the file, giving it today's date

In [6]:
data_folder = Path("data/webpages/")

filename = cur_dat + ".html"
file_to_open = data_folder / filename


Write the file for safe keeping

In [7]:
with open(file_to_open, 'w') as my_data_file:
    my_data_file.write(response.text)

### Scraping the content of what we've grabbed 
Now we use Beautiful Soup to hunt through the text of the source webpage (still in memory)

In [8]:
soup = BeautifulSoup(response.text, 'html.parser')

In [9]:
results = soup.find(id='preamble')

In [10]:
# Uncomment the next line if you want to see the HTML of the section we are interested in

#print(results.prettify())

### Find all the non-tabular text entries for data

In [11]:
mybody = results.find("div", {"class": "body-content publication-body"})
#print (mybody)
first_txt = mybody.find('p')
subject = str(first_txt)

#find the total number tested
total_tested = int("".join (re.findall(r"([\d*])",subject)))

#Now find all the pieces of text buried in lists
firstH3 = mybody.find('h3') # Start here
uls = []

for nextSibling in firstH3.findNextSiblings():
    if nextSibling.name == 'h2':
        break
    if nextSibling.name == 'ul':
        uls.append(nextSibling)
        
lis = []

for ul in uls:
    for li in ul.findAll('li'):
        if li.find('ul'):
            break
        lis.append(li)

total_neg = "".join (re.findall(r"([\d*])", lis[0].text))
total_pos = "".join (re.findall(r"([\d*])", lis[1].text))
#total_dec = "".join (re.findall(r"([\d*])", lis[2].text)) #broken on 19 Apr

#fix added 19 Apr
dec_list = re.findall('[0-9]+', lis[2].text)
tot_dec = dec_list[0]


icus = re.findall('[0-9]+', lis[3].text)
icu_tot = icus[0]
icu_pos = icus[1]

hosp = re.findall('[0-9]+', lis[4].text.replace(",",""))
hosp_tot = hosp[0]

calls = re.findall('[0-9]+', lis[5].text.replace(",",""))
one_tot = calls[0]
cv_tot = calls[2]

ambs = re.findall('[0-9]+', lis[6].text.replace(",",""))
sas_tot = ambs[0]
cv_amb_tot = ambs[1]
sas_uplifts = ambs[3]

#Give up on this for now - this figure is sometimes in, sometimes out of the list
#---------------------------------------------------
#staff_ab_list = re.findall('[0-9]+', lis[8].text.replace(",",""))
#staff_abs = staff_ab_list[1]


#### Uncomment this block to show values found. 

In [62]:
'''
print (f"Total Tested: {total_tested}")
print (f"Total negative tests: {total_neg}")
print (f"Total positive tests: {total_pos}")
print (f"Total deceased: {total_dec}")
print (f"Total in ICU: {icu_tot}")
print (f"ICU positives: {icu_pos}")
print (f"Hospitalised: {hosp_tot}")
print (f"Calls to 111: {one_tot}")
print (f"Calls to CV Hotline: {cv_tot}")
print (f"SAS Attendances: {sas_tot}")
print (f"SAS Attendances at suspected CV19: {cv_amb_tot}")
print (f"SAS CV19 uplifts: {sas_uplifts}")
#print (f"Staff absences: {staff_abs}")
'''


'\nprint (f"Total Tested: {total_tested}")\nprint (f"Total negative tests: {total_neg}")\nprint (f"Total positive tests: {total_pos}")\nprint (f"Total deceased: {total_dec}")\nprint (f"Total in ICU: {icu_tot}")\nprint (f"ICU positives: {icu_pos}")\nprint (f"Hospitalised: {hosp_tot}")\nprint (f"Calls to 111: {one_tot}")\nprint (f"Calls to CV Hotline: {cv_tot}")\nprint (f"SAS Attendances: {sas_tot}")\nprint (f"SAS Attendances at suspected CV19: {cv_amb_tot}")\nprint (f"SAS CV19 uplifts: {sas_uplifts}")\n#print (f"Staff absences: {staff_abs}")\n'

### Do the calculations and write all of the above to the right files

In [12]:
#What were the last data written to scot_tests.csv ?
cur_data = get_last_line ("scot_tests.csv")
#print(cur_data)

# prepare values to write
cur_tot_tests = int(cur_data[6])
cur_neg = int(cur_data[5])
cur_pos = int(cur_data[4])

new_tests = int(total_tested) -  cur_tot_tests
new_neg = int(total_neg) - cur_neg
new_pos = int(total_pos) - cur_pos
#print (new_tests,new_neg, new_pos)

#write values to our file scot_tests.csv
out_list = []
out_list.extend ([str(cur_dat), new_tests, new_pos, new_neg,total_pos, total_neg, total_tested])
write_to_file ("scot_tests.csv", out_list )

#===========================================

#write values to our file regional_deaths.csv
out_list = [cur_dat]
out_list.extend (["x","x","x","x","x","x","x","x","x","x","x","x","x","x"])
out_list.extend([total_dec])
write_to_file ("regional_deaths.csv", out_list )

#============================================
#write values to our file intensive_care.csv
out_list = [cur_dat]
out_list.extend ([icu_tot])
write_to_file ("intensive_care.csv", out_list )

#============================================
#write values to our file new_daily_cases.csv
out_list = [cur_dat]
out_list.extend ([new_pos])
write_to_file ("new_daily_cases.csv", out_list )

#============================================
#Update our file scot_test_positive_deceased
cur_data = get_last_line ("scot_test_positive_deceased.csv")
day_no = int(cur_data[4]) +1

#write values to our file scot_test_positive_deceased.csv
out_list = [cur_dat]
out_list.extend ([int(total_pos),int(total_dec),int(total_tested)])
out_list.extend ([day_no])
write_to_file ("scot_test_positive_deceased.csv", out_list )

NameError: name 'total_dec' is not defined

### Next find the data in tables


In [13]:
#create a list _tables_ of all tables we find (normally two)
tables = mybody.findAll("table")


### Health Board Cases

In [14]:
#Work on our first table (i.e. _tables[0]_)
hb_list = [] #a list for all health boards

for tr in tables[0].find_all('tr')[1:]:
    in_list = []
    tds = tr.find_all('td')
    #print (f" Cases: {tds[1].text.strip()}, PPl in H:{tds[2].text.strip()}, PPL in ICU: {tds[3].text.strip()}") 
    if re.search('[0-9]+', tds[0].text.strip()):
        in_list.append(re.findall('[0-9]+', tds[0].text.strip()))
    else:
        in_list.append("")
    
    if re.search('[0-9]+', tds[1].text.strip()):    
        in_list.append(re.findall('[0-9]+', tds[1].text.strip()))
    elif re.search('/*', tds[1].text.strip()):
        in_list.append(" ")
    else:
        in_list.append(" ")
    
    if re.search('[0-9]+', tds[2].text.strip()):
        in_list.append(re.findall('[0-9]+', tds[2].text.strip()))
    elif re.search('/*', tds[2].text.strip()):
        in_list.append(" ")
    else:
        in_list.append(" ")
    
    if re.search('[0-9]+', tds[3].text.strip()):
        in_list.append(re.findall('[0-9]+', tds[3].text.strip()))
    elif re.search('/*', tds[3].text.strip()):
        in_list.append(" ")
    else:
        in_list.append(" ")
        
    hb_list.append(in_list)

#print (hb_list)    

#Get data ready to update regional_cases.csv
out_list = [cur_dat]
for l in hb_list:
    out_list.extend (l[1])

out_list = out_list[:15] #dump Golden Jubilee, which was a late addition, as we don't use it
out_list.extend([total_pos])

# Write data to file
write_to_file ("regional_cases.csv", out_list)


### Staff Absences


### Hospitalisations and ICU cases

In [15]:
#We already have all of this in hb_list, created above. We've not used it yet. 

#Get data ready to update regional_hospitalisations.csv
out_list = [cur_dat]
for l in hb_list:
    out_list.extend (l[2])

out_list = out_list[:15] #dump Golden Jubilee as we don't use it
out_list.extend([hosp_tot])

# Write data to file
write_to_file ("regional_hospitalisations.csv", out_list)


#Get data ready to update regional_icu.csv
out_list = [cur_dat]
for l in hb_list:
    out_list.extend (l[3])

out_list = out_list[:15] #dump Golden Jubilee as we don't use it
out_list.extend([icu_tot])

# Write data to file
write_to_file ("regional_icu.csv", out_list)

In [16]:
#use the same methods as Health Boards above
absence_list = []
for tr in tables[1].find_all('tr')[1:]:
    in_list = []
    tds = tr.find_all('td')
    #print (f"Health Board: {tds[0].text.strip()},  Yesterday_count: {tds[3].text.strip()}") 
    if re.search('Nursing', tds[0].text.strip()):
        in_list.append('NMA')
    elif re.search('Medical', tds[0].text.strip()):
        in_list.append('MDSA')
    elif re.search('Other', tds[0].text.strip()):
        in_list.append('OSA')
    elif re.search('All', tds[0].text.strip()):
        in_list.append('ASA')
    in_list.append (tds[7].text.strip().replace(",",""))
    absence_list.append(in_list)
#print (absence_list)

########
# Write process to read and update file(s)
########

out_list = [yest_dat]
for inner in absence_list:
    out_list.extend([inner[1]])

# Write data to file
write_to_file ("staff_absences.csv", out_list)

IndexError: list index out of range

### To be done
Currently we dont capture the data elements below. 

We need to create a couple of data files, add the previous data, and then create write routines, as we do above, as we do for each days other data items.

In [None]:

# ICU positives: {icu_pos}
# Calls to 111: {one_tot}
# Calls to CV Hotline: {cv_tot}
# SAS Attendances: {sas_tot}
# SAS Attendances at suspected CV19: {cv_amb_tot}
# SAS CV19 uplifts: {sas_uplifts}
