# Gov.uk DEFRA Actions scraper

We want to recover the full text of all the available actions on the DEFRA finder, so that we can analyse them and build up a taxonomy to help us derive a data model.

In [21]:
# Root of the GOV.UK site; site-relative links are appended to this.
gov_base_url = "https://www.gov.uk"
# First page of the funding finder.
finder_base_url = gov_base_url + "/find-funding-for-land-or-farms"
# Query strings appended to the finder URL to reach the later result pages.
page2, page3 = "?page=2", "?page=3"

There are three pages of links, so rather than spend time building a clever scraper we'll extract the links from all three pages, save them to files, manually trim the unnecessary ones and then concatenate. Then we can run through that list and pull the text of each page.

In [109]:
import requests
from bs4 import BeautifulSoup

def extract_hyperlinks(url, timeout=30):
    """Return the ``href`` of every anchor tag on the page at *url*.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive
            server.

    Returns:
        A list of the non-empty ``href`` attribute values found on the page.
        An empty list is returned (after printing the status code) when the
        response status is anything other than 200.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...); these are not caught here.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than a plain 200 is treated as a failed retrieval
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Anchors with no href (or an empty one) are skipped
    hrefs = (a.get('href') for a in soup.find_all('a'))
    return [href for href in hrefs if href]


Now we'll get the three pages of links.

In [None]:
# Fetch the links on the first finder page and save them one per line,
# ready to be trimmed by hand.
links_page1 = extract_hyperlinks(finder_base_url)

with open('output/actions_links_page1.txt', 'w') as links_file:
    links_file.writelines(f"{link}\n" for link in links_page1)

In [None]:
# Fetch the links on the second finder page and save them one per line,
# ready to be trimmed by hand.
links_page2 = extract_hyperlinks(finder_base_url+page2)

with open('output/actions_links_page2.txt', 'w') as links_file:
    links_file.writelines(f"{link}\n" for link in links_page2)

In [None]:
# Fetch the links on the third finder page and save them one per line,
# ready to be trimmed by hand.
links_page3 = extract_hyperlinks(finder_base_url+page3)

with open('output/actions_links_page3.txt', 'w') as links_file:
    links_file.writelines(f"{link}\n" for link in links_page3)

At this point we edit the files manually. Do that and then come back here : )

In [None]:
# load the three (hand-edited) link files back in

with open('output/actions_links_page1.txt') as page1_file:
    links1 = page1_file.readlines()
with open('output/actions_links_page2.txt') as page2_file:
    links2 = page2_file.readlines()
with open('output/actions_links_page3.txt') as page3_file:
    links3 = page3_file.readlines()

# join the three lists and drop trailing whitespace (the newline included)
# from every entry

all_links_relative = list(map(str.rstrip, links1 + links2 + links3))

Now we need to iterate over the list of links and retrieve each page. We'll store them in individual files and, if it looks easy enough, extract the codes and use them as keys in a dictionary to index into filenames.

NB: Currently the extraction code is stripping out the tables. We don't think we need them.

In [26]:
def get_text(url, timeout=30):
    """Fetch *url* and return the text of its headings and paragraphs.

    Only ``h1``-``h6`` and ``p`` elements are read; text held in any other
    element is not included.

    Args:
        url: Absolute address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive
            server.

    Returns:
        The text of all headings joined by spaces, followed by a space and
        the text of all paragraphs joined by spaces. ``None`` is returned if
        the request failed; the error is printed rather than raised, so
        callers must check for ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    # The specific subclasses are listed before RequestException so that
    # each gets its own message.
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("Oops! Something went wrong:", err)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')

        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        headers = ' '.join(h.get_text() for h in soup.find_all(heading_tags))
        paragraphs = ' '.join(p.get_text() for p in soup.find_all('p'))

        return headers + ' ' + paragraphs

    # Reached only when one of the handlers above ran
    return None

In [103]:
def extract_good_stuff(text):
    """Return the body of a scraped page without the shared header and footer.

    Everything up to and including the last occurrence of the standard
    introductory sentence is dropped, as is everything from the first
    occurrence of 'To help us improve GOV.UK' onwards. If a marker is absent,
    that end of the text is left as it is. Surrounding whitespace is removed
    from the result.
    """
    intro_marker = 'What you must do to get paid for this action and advice on how to do it.'
    footer_marker = 'To help us improve GOV.UK'

    _, _, after_intro = text.rpartition(intro_marker)
    body, _, _ = after_intro.partition(footer_marker)
    return body.strip()

# Try the fetch-and-trim steps on the first link only and display the result.
# Note: get_text has no return on its error paths, so `text` is None if the
# request failed and the next line would then raise.
text = get_text(gov_base_url + all_links_relative[0])
t2 = extract_good_stuff(text)
t2

'This is an action in the Sustainable Farming Incentive (SFI) scheme: expanded offer for 2024. You must read the SFI scheme information to understand the scheme rules and how to apply. 3 years £4 per 100m for both sides This action’s aim is that there are ditches with: The purpose of this is to: An eligible ditch for this action must: You can only do this action on both sides of the entire length of an eligible ditch. This means you must have management control of both sides of the ditch. You can choose what length to enter. You can enter both sides of a ditch that borders a neighbour’s land if you meet both of the following conditions: You cannot do this action on ditches managed by third parties, such as Internal Drainage Boards. The following features are not eligible for this action: Not applicable, as this is a linear action. This action is static. This means you must do it at the same location each year of this action’s duration. You must manage the ditch in a way that can reason

Now we're ready to iterate the links in the list, retrieve the document, clean it up and write it to file.

In [104]:
# def get_filename(relative_link):
#     """Derive a filename from the helpful url."""
#     return relative_link.split('/')[-1] + '.txt'

# def get_code(relative_link):
#     """Recover the action's code from the url also"""
#     output_filename = get_filename(relative_link)
#     return output_filename.split('-')[0].upper()

from utils import get_code, get_filename

def scrape(relative_link, outputdir='output/actions/'):
    """Fetch one action page, trim it and save it as a text file.

    Args:
        relative_link: Site-relative path of the action page; it is appended
            to ``https://www.gov.uk`` to form the URL.
        outputdir: Directory the text file is written to. It is created if
            it does not exist, and a trailing slash is optional.

    Returns:
        A ``(code, output_filename, trimmed_text)`` tuple, where the code and
        filename are whatever ``get_code`` and ``get_filename`` derive from
        *relative_link*.

    Raises:
        RuntimeError: If the page could not be retrieved (``get_text``
            returned ``None``).
    """
    import os  # local import so this cell does not depend on a later one

    gov_base_url = "https://www.gov.uk"
    url = gov_base_url + relative_link
    output_filename = get_filename(relative_link)
    code = get_code(relative_link)

    text = get_text(url)
    if text is None:
        # Fail with a message that names the page instead of an
        # AttributeError from calling .split on None further down.
        raise RuntimeError(f"Could not retrieve {url}")
    trimmed_text = extract_good_stuff(text)

    # write the file out
    if outputdir:
        os.makedirs(outputdir, exist_ok=True)
    with open(os.path.join(outputdir, output_filename), 'w') as f:
        f.write(trimmed_text)
    return (code, output_filename, trimmed_text)

# test the above string extractions
# for link in all_links_relative:
#     print(f"{get_code(link)}: {get_filename(link)}")


In [105]:
# Lookups from action code to the saved filename and to the trimmed text
codes_to_filenames, codes_to_text = {}, {}

for relative_link in all_links_relative:
    # fetch, trim and save the page, then index the results by action code
    code, filename, text = scrape(relative_link)
    codes_to_filenames[code], codes_to_text[code] = filename, text
    print(f"Retrieved {get_code(relative_link)}: {get_filename(relative_link)}")


Retrieved WBD2: wbd2-manage-ditches.txt
Retrieved WBD1: wbd1-manage-ponds.txt
Retrieved OFA3: ofa3-supplementary-winter-bird-food-organic-land.txt
Retrieved AGF1: agf1-maintain-very-low-density-in-field-agroforestry-on-less-sensitive-land.txt
Retrieved AGF2: agf2-maintain-low-density-in-field-agroforestry-on-less-sensitive-land.txt
Retrieved CHRW1: chrw1-assess-and-record-hedgerow-condition.txt
Retrieved CHRW2: chrw2-manage-hedgerows.txt
Retrieved CHRW3: chrw3-maintain-or-establish-hedgerow-trees.txt
Retrieved BND1: bnd1-maintain-dry-stone-walls.txt
Retrieved BND2: bnd2-maintain-earth-banks-or-stone-faced-hedgebanks.txt
Retrieved CAHL4: cahl4-4m-to-12m-grass-buffer-strip-on-arable-and-horticultural-land.txt
Retrieved CIGL3: cigl3-4m-to-12m-grass-buffer-strip-on-improved-grassland.txt
Retrieved BFS1: bfs1-12m-to-24m-watercourse-buffer-strip-on-cultivated-land.txt
Retrieved BFS2: bfs2-buffer-in-field-ponds-on-arable-land.txt
Retrieved BFS3: bfs3-buffer-in-field-ponds-on-improved-grasslan

In [106]:
# Spot-check: display the stored text for action AHW2
codes_to_text['AHW2']

'This is an action in the Sustainable Farming Incentive (SFI) scheme: expanded offer for 2024. You must read the SFI scheme information to understand the scheme rules and how to apply. 3 years £732 per tonne per year – maximum of 1 tonne of supplementary winter bird food (action AHW2) for every 2 hectares (ha) of winter bird food on arable and horticultural land (action CAHL2). This action’s aim is that over the winter until mid-spring there’s a mix of seeds spread on the ground at multiple feeding areas which are: The purpose of this is to provide seed-eating farmland birds with supplementary food when: For winter supplementary feeding, you can only apply for this action if CAHL2 (winter bird food) is included in your agreement. You must spread a winter supplementary feeding mix in a way that can reasonably be expected to achieve this action’s aim, including: You must use a winter supplementary feeding seed mix that contains both: The small non-cereal seeds element must contain at lea

Note: The text is pretty clean but the most useful part is all in the section titled 'What you must do to get paid for this action and advice on how to do it.'

Consider further cleaning the text before NER.

## Named Entity Recognition

This will make a call to OpenAI, so you'll need an API key for that. Create a file called 'openai.yaml' with the contents 

`openai_key: your_key_here`

(You may need to quote the key if it starts with a leading zero).

That file is excluded from git in the .gitignore file, but be careful anyway : )

In [107]:
from openai import OpenAI
import os
import yaml

# Read the local YAML config file that holds the OpenAI API key
with open('openai.yaml') as config_file:
    config = yaml.safe_load(config_file)


def gpt4_ner(prompt, text, model="gpt-3.5-turbo"):
    """Send *prompt* followed by *text* to an OpenAI chat model and return the reply.

    Args:
        prompt: Instructions for the model.
        text: Document to analyse; appended to *prompt* after a blank line.
        model: Name of the chat model to use. The default is
            ``gpt-3.5-turbo`` (despite the function's name), which keeps
            existing calls unchanged; pass another model name to override.

    Returns:
        The message content of the first choice in the API response.
    """
    # The key comes from the module-level `config` loaded from openai.yaml
    client = OpenAI(api_key=config['openai_key'])

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt + '\n\n' + text,
            }
        ],
        model=model,
    )

    return chat_completion.choices[0].message.content

In [119]:
# Load the NER prompt; the context manager closes the file once it is read
with open('prompts/chatgpt4_NER.txt') as prompt_file:
    prompt = prompt_file.read()
text = codes_to_text['SOH1']

# Perform NER
result = gpt4_ner(prompt, text)

print(result)

{
    "Key Features and Concepts": ["Sustainable Farming Incentive", "SFI", "Scheme", "Expanded Offer", "2024", "Rules", "Apply", "3 years", "£73", "Hectare", "No-tillage farming techniques", "Soil disturbance minimization", "Agricultural land", "Moorland line", "Strip tillage", "Land Parcel", "Static Location", "No-till direct drill", "Broadcasting equipment", "Cash crops", "Cover crops", "Temporary grassland", "Cash crop definition", "Direct drills types", "Broadcasting equipment requirements", "Conventional cultivation machinery prohibition", "Low disturbance subsoiling", "Soil compaction reduction", "Crop rotation", "Evidence retention", "Crop at action start", "Requirements", "Advice", "Documentation", "Evidence", "Compliance", "Boundaries", "Applications", "Summer", "Personal information", "Financial information", "National Insurance number", "Credit Card Details"],
    "Payments": "£73 per hectare (ha) per year",
    "Duration": "3 years",
    "Eligibility Rules": ["The action m

In [120]:
import json

# test the output is valid JSON: json.loads raises if the reply does not parse

json.loads(result)

{'Key Features and Concepts': ['Sustainable Farming Incentive',
  'SFI',
  'Scheme',
  'Expanded Offer',
  '2024',
  'Rules',
  'Apply',
  '3 years',
  '£73',
  'Hectare',
  'No-tillage farming techniques',
  'Soil disturbance minimization',
  'Agricultural land',
  'Moorland line',
  'Strip tillage',
  'Land Parcel',
  'Static Location',
  'No-till direct drill',
  'Broadcasting equipment',
  'Cash crops',
  'Cover crops',
  'Temporary grassland',
  'Cash crop definition',
  'Direct drills types',
  'Broadcasting equipment requirements',
  'Conventional cultivation machinery prohibition',
  'Low disturbance subsoiling',
  'Soil compaction reduction',
  'Crop rotation',
  'Evidence retention',
  'Crop at action start',
  'Requirements',
  'Advice',
  'Documentation',
  'Evidence',
  'Compliance',
  'Boundaries',
  'Applications',
  'Summer',
  'Personal information',
  'Financial information',
  'National Insurance number',
  'Credit Card Details'],
 'Payments': '£73 per hectare (ha) p

TODO:

- collect all the eligibility rules from the corpus of actions
- choose a collection of rules - say 10 - that we can represent in a machine-readable form
- 

Now let's do them all

We'll collect the results in a dict alongside the code, that way we can save the whole lot out to a json document

In [141]:
# Parsed replies keyed by action code (only the replies that were valid JSON)
llm_results = {}

with open('prompts/chatgpt4_NER.txt') as prompt_file:
    prompt = prompt_file.read()

for code, text in codes_to_text.items():
    print(f"Doing {code}")
    result = gpt4_ner(prompt, text)
    # we want to make sure it's valid json so we deserialise it and then serialise it again
    # if this fails we store the string as text instead to be fixed manually
    #
    # Parse BEFORE opening the .json file: opening it for writing first would
    # leave an empty, unparseable .json behind whenever the reply is invalid.
    try:
        parsed = json.loads(result)
    except json.JSONDecodeError as e:
        print(e)
        with open(f"output/results/{code}.txt", 'w') as f:
            f.write(result)
    else:
        llm_results[code] = parsed
        with open(f"output/results/{code}.json", 'w') as f:
            f.write(json.dumps(parsed))

Doing WBD2
Doing WBD1
Doing OFA3
Doing AGF1
Doing AGF2
Doing CHRW1
Doing CHRW2
Doing CHRW3
Doing BND1
Doing BND2
Doing CAHL4
Doing CIGL3
Doing BFS1
Doing BFS2
Doing BFS3
Doing BFS4
Doing BFS5
Doing BFS6
Doing CAHL1
Doing CAHL2
Expecting property name enclosed in double quotes: line 16 column 1 (char 1773)
Doing CAHL3
Doing AHW1
Doing AHW2
Doing AHW3
Doing AHW4
Doing AHW5
Doing AHW6
Doing AHW7
Doing AHW8
Doing AHW9
Doing AHW10
Doing AHW11
Doing AHW12
Doing CIGL1
Doing CIGL2
Doing CLIG3
Doing GRH1
Doing GRH7
Doing GRH8
Doing GRH10
Doing GRH11
Doing SCR1
Doing SCR2
Doing HEF1
Doing HEF2
Doing HEF5
Doing HEF6
Doing HEF8
Doing CIPM1
Doing CIPM2
Doing CIPM3
Doing CIPM4
Doing CMOR1
Doing UPL1
Doing UPL2
Doing UPL3
Doing UPL4
Doing UPL5
Expecting property name enclosed in double quotes: line 12 column 1 (char 982)
Doing UPL6
Doing UPL7
Doing UPL8
Doing UPL9
Doing UPL10
Doing CNUM1
Doing CNUM2
Doing CNUM3
Doing OFC1
Doing OFC2
Doing OFC3
Doing OFC4
Doing OFC5
Doing OFM1
Doing OFM2
Doing OFM3
Do

In [147]:
# fix the broken ones
# in most cases you can copy-paste the text in from the .txt file, it should then parse as a dict
# (this cell holds the hand-corrected result for action UPL5)

text = {
  "Key Features and Concepts": ["Sustainable Farming Incentive",
    "SFI", "Scheme", "Expanded Offer", "2024", "Scheme Rules", "Apply", "Supplemental Action", "Grazing Livestock Units (GLU)", "Moorland", "Cattle", "Ponies", "Eligible Land", "Base Actions", "UPL1", "UPL2", "UPL3", "Stocking Calendar", "Duration", "Advice", "Applications", "Summer 2024", "Requirements", "Written Evidence"],
  "Payments": "£18 per hectare (ha) per year",
  "Duration": "3 years",
  "Eligibility Rules": ["Supplemental action can only be done on eligible land entered into specific base actions.",
    "At least 70% of GLU on UPL1, UPL2, or UPL3 must be cattle or ponies.",
    "Written stocking calendar required for UPL1, UPL2, and UPL3.",
    "Action must be conducted throughout each year of its 3-year duration.",
    "Documentation and evidence of stocking calendar must be kept and provided if requested.",
    "Various actions or options can be done on the same area in a land parcel."],
}
# write the corrected dict out as JSON to UPL5's .json result file
with open('output/results/UPL5.json', 'w') as f:
    f.write(json.dumps(text))

# Read them all back in

So we can aggregate up the rules etc.

In [154]:
# collect all the eligibility rules

eligibility_rules = {}

for code in codes_to_text:
    # the context manager closes each result file as soon as it is parsed
    with open(f'output/results/{code}.json') as results_file:
        res = json.load(results_file)
    # a result with no 'Eligibility Rules' entry contributes an empty list
    eligibility_rules[code] = res.get('Eligibility Rules', [])

In [161]:
# flatten the per-action rule lists into one list, as a first step towards a
# de-duplicated set of rules (this might not work, but let's try)
all_rules = [rule for rulelist in eligibility_rules.values() for rule in rulelist]

In [164]:
# sort the rules in place
all_rules.sort()

In [167]:
# number of distinct rule strings (exact matches only)
len(set(all_rules))

687

## Key Concepts

The rules list is informative, but because we used the LLM to summarise them they aren't written the same way each time. I suspect this means that rules that would otherwise use the exact same language don't match in this case. Given that the HTML has standard formatting it's probably a better idea to scrape them directly from the tables.

So let's do the same but for key concepts, which we can't get from the HTML in a structured manner.

In [168]:
# collect all the key concepts

key_concepts = {}

for code in codes_to_text:
    # the context manager closes each result file as soon as it is parsed
    with open(f'output/results/{code}.json') as results_file:
        res = json.load(results_file)
    # a result with no 'Key Features and Concepts' entry contributes an empty list
    key_concepts[code] = res.get('Key Features and Concepts', [])

In [175]:
# one flat list of every concept from every action, lower-cased so that
# differently-capitalised spellings compare equal
all_concepts = [kc.lower() for kcs in key_concepts.values() for kc in kcs]
len(all_concepts)

2944

In [176]:
# number of distinct concepts
len(set(all_concepts))

689

In [178]:
# so there are ~3k concepts altogether and 689 uniques (per the two counts above). Let's count up the numbers!

from collections import Counter

# tally how many times each concept occurs across all actions
kc_counts = Counter(all_concepts)
kc_counts

Counter({'sustainable farming incentive': 101,
         'sfi': 101,
         'scheme': 101,
         'expanded offer': 101,
         '2024': 99,
         'rules': 91,
         'apply': 91,
         'advice': 90,
         'applications': 83,
         'personal information': 75,
         'financial information': 75,
         'national insurance number': 74,
         'credit card details': 74,
         '3 years': 71,
         'hectare': 68,
         'documentation': 63,
         'summer': 63,
         'moorland line': 57,
         'agricultural land': 57,
         'evidence': 54,
         'static location': 46,
         'requirements': 38,
         'boundaries': 34,
         'land parcel': 32,
         'prohibited actions': 26,
         'rotational': 20,
         'static': 19,
         'duration': 14,
         'eligible land': 12,
         'supplemental action': 10,
         'grazing livestock units (glu)': 10,
         'soil erosion': 9,
         'soil management plan': 9,
         'seed

In [181]:
# now if we list them all and save to a file we can manually remove the stopwords
#list(kc_counts.keys())

In [184]:
# read back in the cleaned up list of concepts

with open('output/key_concepts.txt') as concepts_file:
    # one concept per line; drop the newline and any other trailing whitespace
    concepts = [line.rstrip() for line in concepts_file]

In [188]:
# now we can recount, keeping only the concepts that survived the manual clean-up
# build the set once so each membership test is O(1) instead of a list scan
_kept_concepts = set(concepts)
trimmed_concepts = [c for c in all_concepts if c in _kept_concepts]
new_counts = Counter(trimmed_concepts)
new_counts

Counter({'hectare': 68,
         'documentation': 63,
         'summer': 63,
         'moorland line': 57,
         'agricultural land': 57,
         'evidence': 54,
         'static location': 46,
         'requirements': 38,
         'boundaries': 34,
         'land parcel': 32,
         'prohibited actions': 26,
         'rotational': 20,
         'static': 19,
         'duration': 14,
         'eligible land': 12,
         'supplemental action': 10,
         'grazing livestock units (glu)': 10,
         'soil erosion': 9,
         'soil management plan': 9,
         'seed mix': 9,
         'organic management': 9,
         'maintenance': 8,
         'surface runoff': 8,
         'limited area': 8,
         'moorland': 8,
         'supplementary feeding': 7,
         'cash crop': 7,
         'management control': 6,
         'companion crop': 6,
         'maintain': 6,
         'livestock grazing': 6,
         'mineral licks': 6,
         'stocking calendar': 6,
         'grazing un

In [200]:
# put the counts into a table for anyone who prefers a spreadsheet

import pandas

# one row per concept, ordered by number of occurrences, highest first
concept_table = pandas.DataFrame(new_counts.items(), columns=['Concept', 'Occurrences'])
df = concept_table.sort_values(by=['Occurrences'], ascending=False)
df

Unnamed: 0,Concept,Occurrences
22,hectare,68
26,summer,63
14,documentation,63
97,agricultural land,57
49,moorland line,57
...,...,...
234,blocks,1
233,grassland field corners,1
232,woodland strip,1
231,small mammals,1


In [201]:
# save the table as CSV, leaving out the DataFrame index
df.to_csv('output/key_concepts_counts.csv', index=False)