This repository has been archived by the owner on Jul 8, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
67 lines (54 loc) · 1.92 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import csv
import datetime
import json
import os
import re

import requests
from commitment import GitHubClient, GitHubCredentials
# Download formats requested for every register.
VALID_FORMATS = ("csv",)

# Target repository plus the username, email and API key, all three read
# from the environment when this module is loaded (a missing variable
# raises KeyError at import time).
credentials = GitHubCredentials(
    repo="DemocracyClub/GDSRegisters",
    name=os.environ["MORPH_GITHUB_USERNAME"],
    email=os.environ["MORPH_GITHUB_EMAIL"],
    api_key=os.environ["MORPH_GITHUB_API_KEY"],
)

# Module-wide client built from the credentials above; save_register_data
# pushes files through it.
client = GitHubClient(credentials)
def make_url(name, domain=None, data_format="csv"):
    """Build the download URL for a register.

    Args:
        name: Register name inserted into the URL path.
        domain: Host to use; any falsy value selects
            "www.registers.service.gov.uk".
        data_format: Download format; asserted to be in VALID_FORMATS.

    Returns:
        The download URL as a string.
    """
    assert data_format in VALID_FORMATS
    host = domain or "www.registers.service.gov.uk"
    template = "https://{domain}/registers/{name}/download-{format}"
    return template.format(domain=host, name=name, format=data_format)
def get_all_register_names():
    """Collect the names of every register this scraper should download.

    Names are gathered from two sources: the fourth column of each row in
    the CSV download of the "register" register, and every
    ``/registers/<name>`` link found in the registers-in-progress page.

    Returns:
        A sorted list of the unique names found.
    """
    all_names = set()

    # Check registers listed in the register register.  Rows are parsed
    # with the csv module rather than split on "," so that a quoted field
    # containing a comma cannot shift the column index.
    # NOTE(review): if the download starts with a header row, that row's
    # fourth column is collected as a name too -- confirm whether it
    # should be skipped.
    req = requests.get(make_url("register"))
    for row in csv.reader(req.text.splitlines()):
        # Blank or truncated lines have no fourth column; skip them
        # instead of failing with IndexError.
        if len(row) > 3:
            all_names.add(row[3])

    # Check "upcoming" registers
    req = requests.get(
        "https://www.registers.service.gov.uk/registers-in-progress"
    )
    all_names.update(re.findall("/registers/([^\"\' ]+)", req.text))

    # Sorted so registers are processed in the same order on every run
    # (set iteration order is not stable between runs).
    return sorted(all_names)
def save_register_data(register_name):
    """Download one register in each valid format and push the result.

    Creates the local directory ``registers/<register_name>/`` if it does
    not exist. For every format in VALID_FORMATS the register's download
    URL is requested; when the response status is 200 its text is passed
    to ``client.push_file`` together with the path
    ``registers/<register_name>/<register_name>.<format>`` and an
    "Updated on <timestamp>" message. Any other status is skipped.

    Args:
        register_name: Name of the register to fetch.
    """
    target_dir = "registers/{}/".format(register_name)
    os.makedirs(target_dir, exist_ok=True)

    for fmt in VALID_FORMATS:
        response = requests.get(make_url(register_name, data_format=fmt))
        if response.status_code != 200:
            # Nothing to save for this format; move on to the next one.
            continue

        body = response.text
        destination = os.path.join(
            target_dir, "{}.{}".format(register_name, fmt)
        )
        message = "Updated on {}".format(datetime.datetime.now().isoformat())
        client.push_file(body, destination, message)
if __name__ == "__main__":
    # Script entry point: look up every register name, then fetch and
    # save each one in turn.
    for register_name in get_all_register_names():
        save_register_data(register_name)