In [1]:
from bs4 import BeautifulSoup
from dataclasses import dataclass, field
import pandas as pd
import requests
from tqdm.notebook import tqdm
from typing import List

In [2]:
def get_soup(url: str, timeout: float = 30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

In [3]:
def get_cases(soup, into=None, base_url="https://www.gov.uk"):
    """Collect the case links of one listing page.

    Looks for the ``<ul class="gem-c-document-list">`` element and, for every
    ``<li>`` whose first anchor has a non-empty ``href``, appends a
    ``{"title": ..., "link": ...}`` dict to the target list.

    Args:
        soup: Parsed listing page.
        into: List that receives the records. When omitted, the module-level
            ``cases`` list is used, which is how the notebook cells call it.
        base_url: Base against which relative hrefs are resolved.

    Returns:
        None. The result is delivered by mutating the target list; nothing is
        returned so that calling this as the last expression of a notebook
        cell does not echo the whole list.
    """
    from urllib.parse import urljoin

    cases_list = soup.find('ul', class_='gem-c-document-list')
    if cases_list is None:
        # Page without a document list: nothing to collect.
        return

    target = cases if into is None else into
    for li in cases_list.find_all('li'):
        a_tag = li.find('a')
        href = a_tag.get('href') if a_tag is not None else None
        if not href:
            # No anchor, or an anchor without a usable href: skip instead of
            # raising KeyError half-way through a page.
            continue
        target.append({
            "title": a_tag.get_text(strip=True),
            # urljoin yields the same URL as plain concatenation for
            # root-relative hrefs, and also copes with absolute ones.
            "link": urljoin(base_url, href),
        })

In [4]:
@dataclass
class Case:
    """One case page, scraped as soon as the instance is created.

    ``__post_init__`` downloads ``url`` and fills ``meta`` and ``files`` from
    the page, so any values passed for those two fields are overwritten.
    Constructing an instance therefore performs a network request and
    propagates whatever ``requests`` raises (including ``HTTPError`` from
    ``raise_for_status``).
    """
    url: str = ""
    # One {"title": ..., "link": ...} dict per PDF link found on the page.
    files: List[dict] = field(default_factory=list)
    # Key/value pairs read from the page's "important metadata" list.
    meta: dict = field(default_factory=dict)

    # Seconds to wait for the server. Deliberately left unannotated so the
    # dataclass machinery treats it as a plain class attribute, not a field.
    _REQUEST_TIMEOUT = 30

    def __post_init__(self):
        soup = self._get_soup()
        self._extract_attributes(soup)
        self._extract_files(soup)

    def _get_soup(self) -> "BeautifulSoup":
        """Fetch ``self.url`` and return the parsed HTML tree."""
        # Without a timeout a single stalled connection would freeze a crawl
        # that visits many case pages.
        response = requests.get(self.url, timeout=self._REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    @staticmethod
    def _is_pdf_link(href: str) -> bool:
        """Tell whether ``href`` points at a PDF, ignoring case, query and fragment."""
        path = href.split('#', 1)[0].split('?', 1)[0]
        return path.lower().endswith('.pdf')

    def _extract_files(self, soup) -> None:
        """Store every PDF link on the page in ``self.files``.

        Each entry keeps the anchor text as ``title`` and the unmodified
        ``href`` as ``link``.
        """
        self.files = [
            {
                "title": a_tag.get_text(strip=True),
                "link": a_tag['href'],
            }
            for a_tag in soup.find_all('a', href=True)
            if self._is_pdf_link(a_tag['href'])
        ]

    def _extract_attributes(self, soup) -> None:
        """Store the page's metadata list in ``self.meta``.

        Terms (``<dt>``) and definitions (``<dd>``) are paired by position.
        Each term is normalised to a snake_case key: trailing colon removed,
        lower-cased, spaces replaced by underscores. ``self.meta`` is an empty
        dict when the page has no metadata list.
        """
        metadata_dict = {}
        dl_element = soup.find('dl', class_='app-c-important-metadata__list')
        if dl_element is not None:
            dt_elements = dl_element.find_all('dt', class_='app-c-important-metadata__term')
            dd_elements = dl_element.find_all('dd', class_='app-c-important-metadata__definition')
            for dt, dd in zip(dt_elements, dd_elements):
                key = dt.get_text(strip=True).rstrip(':').lower().replace(' ', '_')
                metadata_dict[key] = dd.get_text(strip=True)
        self.meta = metadata_dict

## Parameters

In [5]:
# Listing URL with two query filters: case_state[]=closed and case_type[]=mergers
# (`%5B%5D` is the URL-encoded form of `[]`). Later cells append `&page=N` to it.
url = "https://www.gov.uk/cma-cases?case_state%5B%5D=closed&case_type%5B%5D=mergers"

## Get number of pages

In [6]:
# Fetch the first listing page once; it is used below both to read the page
# count and to collect the page-1 cases.
soup = get_soup(url)

In [7]:
# The pagination label is expected to read "<x> of <total>"; the part after
# " of " is taken as the number of listing pages.
span_element = soup.find('span', class_='govuk-pagination__link-label')
if span_element:
    text = span_element.get_text(strip=True)
    parts = text.split(' of ')
    if len(parts) != 2:
        # Raise a real exception: `raise("...")` with a bare string fails
        # with an unrelated TypeError instead of reporting the problem.
        raise ValueError("Text format is not as expected.")
    num_pages = int(parts[1])
else:
    # No pagination label on the page; presumably the results fit on a single
    # page. NOTE(review): a renamed CSS class would also land here, so check
    # that the displayed page count looks plausible.
    num_pages = 1
num_pages

38

## Get cases

In [8]:
# Module-level accumulator: get_cases() appends one {"title", "link"} dict per
# case to this list. Re-run this cell before re-running the collection cells
# below, otherwise the same cases are appended a second time.
cases = []

### Page 1

In [9]:
# Page 1 was already downloaded into `soup` for the page-count step, so reuse
# it here instead of fetching it again.
get_cases(soup)

### Other pages

In [10]:
# Page 1 has been handled separately; walk the remaining listing pages.
for i in range(2, num_pages + 1):
    url_i = f"{url}&page={i}"
    # Use a dedicated name rather than rebinding `soup`: `soup` keeps pointing
    # at page 1, so re-running the "Page 1" cell cannot silently collect the
    # last page instead.
    page_soup = get_soup(url_i)
    get_cases(page_soup)

In [11]:
# Sanity check: number of case records collected across all listing pages.
len(cases)

1900

In [12]:
# Give each case record an "id" equal to its position in `cases`; the crawl
# below relies on that id to index back into the list.
for index, item in enumerate(cases):
    item.update(id=index)

## Get case info

In [13]:
# Visit every case page once: merge its metadata into the matching record of
# `cases` and gather its PDF links, tagged with the owning case id, in `files`.
files = []

for c in tqdm(cases):
    case = Case(c['link'])  # constructing a Case downloads and parses the page
    cases[c["id"]].update(case.meta)
    # Stamp the whole batch with the case id first, then add it in one go.
    for f in case.files:
        f["id"] = c["id"]
    files.extend(case.files)

  0%|          | 0/1900 [00:00<?, ?it/s]

In [14]:
# One row per case: title, link, id, plus whichever metadata keys the case
# pages exposed (a key missing from a case leaves that cell empty).
df_cases = pd.DataFrame(cases)
df_cases.head()

Unnamed: 0,title,link,id,case_type,case_state,market_sector,opened,closed,outcome
0,Scanpole / C&G merger inquiry,https://www.gov.uk/cma-cases/scanpole-slash-c-...,0,Mergers,Closed,Building and construction,25 June 2024,14 August 2024,
1,Eurofins / Cellmark merger inquiry,https://www.gov.uk/cma-cases/eurofins-slash-ce...,1,Mergers,Closed,"Distribution and service industries,Fire, poli...",18 July 2024,2 August 2024,Mergers - phase 1 clearance
2,Thermo Fisher Scientific / Olink merger inquiry,https://www.gov.uk/cma-cases/thermo-fisher-sci...,2,Mergers,Closed,Healthcare and medical equipment,2 February 2024,8 July 2024,Mergers - phase 1 clearance
3,AlphaTheta / Serato merger inquiry,https://www.gov.uk/cma-cases/alphatheta-slash-...,3,Mergers,Closed,Recreation and leisure,23 January 2024,22 July 2024,Mergers - phase 2 cancellation
4,TGS/PGS UK merger inquiry,https://www.gov.uk/cma-cases/tgs-slash-pgs-uk-...,4,Mergers,Closed,Oil and gas refining and petrochemicals,12 April 2024,9 July 2024,


In [15]:
# Should equal len(cases): the frame is built with one row per case record.
len(df_cases)

1900

In [16]:
# One row per PDF link; `id` is the id of the case the link was found on, so
# it can be joined against `df_cases["id"]`.
df_files = pd.DataFrame(files)
df_files.head()

Unnamed: 0,title,link,id
0,"Commencement notice (PDF, 89KB)",https://assets.publishing.service.gov.uk/media...,0
1,"Commencement notice (PDF, 80KB)",https://assets.publishing.service.gov.uk/media...,1
2,"Full text decision (PDF, 469KB)",https://assets.publishing.service.gov.uk/media...,2
3,"Commencement notice (PDF, 90KB)",https://assets.publishing.service.gov.uk/media...,2
4,"Administrative timetable (PDF, 121KB)",https://assets.publishing.service.gov.uk/media...,3


In [17]:
# Total number of PDF links collected over all case pages.
len(df_files)

7562

## Save data

In [18]:
# Persist the case table. The path is relative to the notebook's working
# directory; NOTE(review): assumes ../data already exists and a parquet engine
# is installed - confirm before a fresh run.
df_cases.to_parquet("../data/cases.parquet")

In [19]:
# Persist the PDF-link table next to the case table (same relative directory).
df_files.to_parquet("../data/files.parquet")