<a href="https://colab.research.google.com/github/Danny2173/RAGproject/blob/main/1_Scraping_NHS_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installs

In [None]:
%pip install -q playwright transformers tqdm faiss-cpu datasets nltk

!playwright install chromium


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Chromium 138.0.7204.23 (playwright build v1179)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1179/chromium-linux.zip[22m
[1G171.6 MiB [] 0% 0.0s[0K[1G171.6 MiB [] 0% 25.0s[0K[1G171.6 MiB [] 0% 9.7s[0K[1G171.6 MiB [] 0% 7.6s[0K[1G171.6 MiB [] 1% 6.6s[0K[1G171.6 MiB [] 1% 5.8s[0K[1G171.6 MiB [] 1% 5.1s[0K[1G171.6 MiB [] 2% 4.6s[0K[1G171.6 MiB [] 3% 4.1s[0K[1G171.6 MiB [] 3% 3.8s[0K[1G171.6 MiB [] 4% 3.5s[0K[1G171.6 MiB [] 5% 3.4s[0K[1G171.6 MiB [] 5% 3.3s[0K[1G171.6 MiB [] 5% 3.5s[0K[1G171.6 MiB [] 6% 3.5s[0K[1G171.6 MiB [] 6% 3.4s[0K[1G171.6 MiB [] 7% 3.3s[0K[1G171.6 MiB [] 8% 3.2s[0K[1G171.6 MiB [] 8% 3.1s[0K[1G171.6 MiB [] 9% 3.0s[0K[1G171.6 MiB [] 10% 3.0s

## Imports

In [None]:
import os
import json
import re
import asyncio
import hashlib
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import nest_asyncio
from playwright.async_api import async_playwright

import numpy as np
import faiss
from tqdm import tqdm

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import torch
import torch.nn.functional as F

from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    RagTokenizer, RagRetriever, RagSequenceForGeneration
)

from sentence_transformers.util import cos_sim

from google.colab import drive
from collections import Counter, defaultdict


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Scraping NHS Website for condition data

In [None]:
# Patch asyncio so coroutines can be run from inside an already-running
# event loop (presumably needed because the notebook kernel runs its own loop)
nest_asyncio.apply()

# Root of the NHS website; relative links found while scraping are joined onto it
base_url = "https://www.nhs.uk"

# Multiple links lead to the same page - removing these duplicates
def deduplicate_passages(passages):
    """Drop passages whose stripped text has already been seen.

    Args:
        passages: Iterable of dicts that each carry a "text" key.

    Returns:
        A new list holding the first passage for every distinct text, in the
        original order.
    """
    digests = set()
    kept = []
    for item in passages:
        # An MD5 digest of the stripped text is used as a compact identity key
        digest = hashlib.md5(item["text"].strip().encode()).hexdigest()
        if digest in digests:
            continue
        digests.add(digest)
        kept.append(item)
    return kept
#####
def clean_text(text):
    """Strip NHS page boilerplate from scraped text.

    The removals run in a fixed order: review date stamp, table of contents,
    modal-window text, previous/next page flags, then footer lines that
    contain a URL. The result is returned with outer whitespace trimmed.
    """
    removals = (
        # Data stamp
        (r"Page last reviewed: .*?Next review due: .*?(?=\n|$)", re.DOTALL),
        # Table of contents
        (r"(?s)Chapters.*?(?=Previous|Next|$)", 0),
        (r"(?s)This is a modal window\..*?(?=Previous|Next|$)", 0),
        # Next/previous page flags
        (r"Previous\s*:.*(\n|$)", 0),
        (r"Next\s*:.*(\n|$)", 0),
        # Footer url
        (r"\n{2,}[^\n]*?(https?://\S+)", 0),
    )
    for pattern, flags in removals:
        text = re.sub(pattern, "", text, flags=flags)
    return text.strip()
#####

# Extract all links from the conditions page
def get_all_condition_links():
    """Collect the title and absolute URL of every condition on the NHS index page.

    Returns:
        A list of {"title": ..., "url": ...} dicts, one per link whose href
        starts with "/conditions/".

    Raises:
        requests.RequestException: if the index page cannot be fetched in
            time or the server answers with an error status.
    """
    conditions_url = f"{base_url}/conditions/"
    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout stops a stalled connection from hanging the notebook forever
    response = requests.get(conditions_url, headers=headers, timeout=30)
    # Fail loudly on an error page instead of silently reporting 0 conditions
    response.raise_for_status()

    # Building parser
    soup = BeautifulSoup(response.text, "html.parser")

    condition_links = []
    # Extracting urls and titles from <ul> -> <li> -> <a> blocks
    for ul in soup.find_all("ul", class_="nhsuk-list--links"):
        for li in ul.find_all("li"):
            a = li.find("a")
            # .get() tolerates anchors that carry no href attribute
            href = a.get("href") if a else None
            if href and href.startswith("/conditions/"):
                full_url = urljoin(base_url, href)
                title = a.get_text(" ", strip=True)
                condition_links.append({"title": title, "url": full_url})

    print(f"{len(condition_links)} condition pages found")
    return condition_links
#####

# Scraping condition mainpage and its subpages
async def scrape_condition_with_subpages(entry):
    """Scrape one condition page and every sub-page found beneath its URL path.

    Args:
        entry: Dict with "title" and "url" keys for the condition's main page.

    Returns:
        A (title, data) tuple, where data holds the de-duplicated "passages"
        and a "metadata" dict with the main URL and the first review info
        found. Returns (None, None) when no text could be scraped, so callers
        can always unpack two values.
    """
    title = entry["title"]
    main_url = entry["url"]
    # Only links under this path count as sub-pages of the condition
    root_path = urlparse(main_url).path.rstrip("/") + "/"
    text_chunks = []
    visited = set()

    # Initiate async playwright session
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Scraping text
            async def scrape_text(url):
                """Return (cleaned text, {"review_info": ...}, sub-page links) for url."""
                try:
                    await page.goto(url, timeout=90000)
                    await page.wait_for_selector("main, article, div.nhsuk-card", timeout=15000)
                    html = await page.content()
                    soup = BeautifulSoup(html, "html.parser")

                    # Selecting all content for full context
                    content_elements = soup.select(
                        "article p, article li, article h1, article h2, article h3, article h4, article figcaption, "
                        "div.nhsuk-grid-column-two-thirds p, div.nhsuk-grid-column-two-thirds li, "
                        "div.nhsuk-grid-column-two-thirds h1, div.nhsuk-grid-column-two-thirds h2, "
                        "div.nhsuk-grid-column-two-thirds h3, div.nhsuk-grid-column-two-thirds h4, "
                        "div.nhsuk-grid-column-two-thirds figcaption, "
                        "div.nhsuk-grid-column-full p, div.nhsuk-grid-column-full li, "
                        "div.nhsuk-grid-column-full h1, div.nhsuk-grid-column-full h2, "
                        "div.nhsuk-grid-column-full h3, div.nhsuk-grid-column-full h4, "
                        "div.nhsuk-grid-column-full figcaption, "
                        "h1 span[role='text']"
                    )

                    result = []
                    seen_lines = set()
                    for el in content_elements:
                        # Adding element label
                        tag = el.name
                        # Avoiding contents list
                        if el.find_parent("nav", class_="nhsuk-contents-list"):
                            continue
                        # Extracting text and formatting
                        text = el.get_text(" ", strip=True)
                        text = text.replace('\u00a0', ' ').replace('–', '-').replace('—', '-')
                        # Adding a key to remove duplicate lines
                        key = re.sub(r'\s+', ' ', text.lower().strip())
                        if not text or key in seen_lines:
                            continue
                        seen_lines.add(key)

                        # Appending headline tags for context
                        if tag in ["h1", "h2", "h3", "h4", "figcaption"]:
                            result.append(f"{tag} {text}")
                        else:
                            result.append(text)

                    # Finding potential sub-pages
                    new_links = set()
                    for a in soup.select("ul.nhsuk-hub-key-links a[href], nav.nhsuk-contents-list a[href], a.nhsuk-card__link[href]"):
                        href = a["href"]
                        full_url = urljoin(base_url, href)
                        if urlparse(full_url).path.startswith(root_path) and full_url != url:
                            new_links.add(full_url)

                    # Storing review data information
                    review_p = soup.select_one("p.nhsuk-body-s.nhsuk-u-margin-top-7.nhsuk-u-secondary-text-color")
                    review_text = review_p.get_text(separator=" ", strip=True) if review_p else None

                    # Joining and cleaning text
                    full_text = "\n\n".join(result)
                    cleaned_text = clean_text(full_text)
                    return cleaned_text, {"review_info": review_text}, new_links

                # Best effort: a page that fails is reported and contributes nothing
                except Exception as e:
                    print(f"Error for {url} - {e}")
                    return "", {"review_info": None}, set()

            # Recursively visit subpages to collect all relevant content
            async def recursive_scrape(url):
                if url in visited:
                    return
                visited.add(url)
                text, meta, more_links = await scrape_text(url)
                if text:
                    word_count = len(text.split())
                    # Merge smaller chunks into the main page
                    if text_chunks and word_count < 100:
                        print(f"Merged {url} into main page")
                        mainpage_lines = text_chunks[0]["text"].splitlines()
                        subpage_lines = text.splitlines()
                        # Start from second line if repeat
                        if subpage_lines and subpage_lines[0] in mainpage_lines:
                            subpage_lines = subpage_lines[1:]
                        text_chunks[0]["text"] += "\n\n" + "\n".join(subpage_lines)
                    else:
                        text_chunks.append({"url": url, "text": text, "review_info": meta.get("review_info")})
                for link in more_links:
                    await recursive_scrape(link)

            await recursive_scrape(main_url)
        finally:
            # Close the browser even if scraping raised part-way through
            await browser.close()

    # Nothing scraped: return an explicit pair rather than an implicit None
    if not text_chunks:
        return None, None

    total_words = 0
    raw_texts = []

    # Storing first review_info from condition as back-up
    fallback_review_info = next(
        (chunk.get("review_info") for chunk in text_chunks if chunk.get("review_info")),
        None
    )

    for chunk in text_chunks:
        url = chunk["url"]
        passage = chunk["text"].strip()
        word_count_text = len(passage.split())
        review_info = chunk["review_info"]
        # Only the main page inherits the back-up review info when it has none
        if review_info is None and url == main_url:
            review_info = fallback_review_info

        print(f"{url} - {word_count_text} words")
        # Add to total word count
        total_words += word_count_text
        raw_texts.append({
            "text": passage,
            "url": url,
            "review_info": review_info
        })

    print(f"{title}: {total_words} words across {len(raw_texts)} pages.")
    return title, {
        "passages": deduplicate_passages(raw_texts),
        "metadata": {
            "source": main_url,
            "review_info": fallback_review_info
        }
    }

# Run multiple condition scrapes at the same time
async def scrape_all_conditions(max_parallel=5):
    """Scrape every condition listed on the index page, a few at a time.

    Args:
        max_parallel: Maximum number of condition scrapes allowed to run at
            the same time.

    Returns:
        Dict mapping condition title to its scraped content. Conditions that
        failed or produced no content are left out.
    """
    all_links = get_all_condition_links()
    semaphore = asyncio.Semaphore(max_parallel)

    async def safe_scrape(entry):
        """Scrape one condition; always hand back a (title, content) pair."""
        async with semaphore:
            try:
                result = await scrape_condition_with_subpages(entry)
            except Exception as e:
                print(f"Error for {entry['title']} - {e}")
                return None, None
        # A scrape that found nothing may return None; normalise it so the
        # unpacking below cannot fail
        return result if result is not None else (None, None)

    results = await asyncio.gather(*(safe_scrape(entry) for entry in all_links))

    data_dict = {key: content for key, content in results if key and content}

    print(f"Completed: {len(data_dict)} conditions.")
    return data_dict

# Run the full scrape (top-level await: this only works in a notebook/IPython cell)
all_condition_data = await scrape_all_conditions(max_parallel=5)

In [None]:
# Print every scraped passage for "Acne" as a quick sanity check
for passage in all_condition_data["Acne"]["passages"]:
    print(f"URL: {passage['url']}")
    print(f"Review Info: {passage['review_info']}")
    print(passage["text"])
    print("=" * 80)


URL: https://www.nhs.uk/conditions/acne/
Review Info: Page last reviewed: 03 January 2023 Next review due: 03 January 2026
h1 Overview - Acne

Acne is a common skin condition that affects most people at some point. It causes spots, oily skin and sometimes skin that's hot or painful to touch.

h2 Symptoms of acne

Acne most commonly develops on the:

face - this affects almost everyone with acne

back - this affects more than half of people with acne

chest - this affects about 15% of people with acne

h2 Types of spots

There are 6 main types of spot caused by acne:

blackheads - small black or yellowish bumps that develop on the skin; they're not filled with dirt, but are black because the inner lining of the hair follicle produces colour

whiteheads - have a similar appearance to blackheads, but may be firmer and will not empty when squeezed

papules - small red bumps that may feel tender or sore

pustules - similar to papules, but have a white tip in the centre, caused by a build-up

## Saving/Loading

In [None]:
# drive.mount('/content/drive')

# with open("/content/drive/MyDrive/nhs_conditions_data.json", "w", encoding="utf-8") as f:
#     json.dump(all_condition_data, f, ensure_ascii=False, indent=2)


In [None]:
# Mount Google Drive so the previously saved scrape can be read back
drive.mount('/content/drive')

# Reload the scraped conditions from JSON; this replaces all_condition_data
with open("/content/drive/MyDrive/nhs_conditions_data.json", "r", encoding="utf-8") as f:
    all_condition_data = json.load(f)


Mounted at /content/drive


## Splitting passages by word count, using headings for reference

In [None]:
_HEADING_TAGS = ("h1 ", "h2 ", "h3 ", "h4 ")


def _segment_by_headings(passage):
    """Cut a passage into (segment_text, h1, h2, h3) tuples at each heading line."""
    top = mid = low = ""
    found = []
    pending = []

    def flush():
        # Keep the pending lines only when they hold some non-heading text
        has_body = any(
            ln.strip() for ln in pending if not ln.startswith(_HEADING_TAGS)
        )
        if pending and has_body:
            found.append(("\n".join(pending), top, mid, low))

    for ln in passage.splitlines():
        level = ln[:3]
        if level not in _HEADING_TAGS:
            pending.append(ln)
            continue

        # A heading closes the running segment and starts a new one that is
        # seeded with the headings currently in force
        flush()
        if level == "h1 ":
            top, mid, low = ln, "", ""
            pending = [top]
        elif level == "h2 ":
            mid, low = ln, ""
            pending = [top, mid] if top else [mid]
        elif level == "h3 ":
            low = ln
            if mid:
                pending = [top, mid, low]
            elif top:
                pending = [top, low]
            else:
                pending = [low]
        else:
            # h4 is not tracked as state; it only seeds the new segment
            if low:
                pending = [top, mid, low, ln]
            elif mid:
                pending = [top, mid, ln]
            elif top:
                pending = [top, ln]
            else:
                pending = [ln]

    flush()
    return found


def _split_segment(segment, h1, h2, h3, max_words):
    """Halve a segment at sentence boundaries until each piece fits max_words."""
    header_block = "\n".join(h for h in (h1, h2, h3) if h)
    # Heading lines are dropped from the body; header_block is re-attached instead
    body_text = " ".join(
        ln for ln in segment.splitlines() if not ln.startswith(_HEADING_TAGS)
    ).strip()
    whole = f"{header_block}\n{body_text}".strip()

    if len(body_text.split()) <= max_words:
        return [whole]

    sentences = re.split(r'(?<=[.!?]) +', body_text)
    # At least two sentences are needed to split
    if len(sentences) < 2:
        return [whole]

    # Find the first sentence at which the running word count reaches half
    lengths = [len(s.split()) for s in sentences]
    mid_point = sum(lengths) // 2
    running = 0
    idx = 0
    for pos, n_words in enumerate(lengths):
        running += n_words
        if running >= mid_point:
            idx = pos
            break

    part1 = " ".join(sentences[:idx + 1]).strip()
    part2 = " ".join(sentences[idx + 1:]).strip()
    # Give up when the cut would leave one side empty or unchanged
    if not part1 or not part2 or body_text in (part1, part2):
        return [whole]

    return (
        _split_segment(part1, h1, h2, h3, max_words)
        + _split_segment(part2, h1, h2, h3, max_words)
    )


def _pack_greedy(pieces, max_words):
    """Join consecutive pieces into chunks, starting a new chunk on overflow."""
    packed = []
    group = []
    group_words = 0
    for piece in pieces:
        n_words = len(piece.split())
        if group and group_words + n_words > max_words:
            packed.append("\n\n".join(group))
            group = [piece]
            group_words = n_words
        else:
            group.append(piece)
            group_words += n_words
    if group:
        packed.append("\n\n".join(group))
    return packed


def _absorb_short_chunks(merged, max_words, min_words):
    """Merge each chunk under min_words into the partner giving the smallest total."""
    i = 0
    while i < len(merged):
        wc = len(merged[i].split())
        if wc >= min_words:
            i += 1
            continue

        best_j = None
        best_combined_wc = float('inf')
        for j, other in enumerate(merged):
            if j == i:
                continue
            combined_wc = wc + len(other.split())
            if combined_wc <= max_words and combined_wc < best_combined_wc:
                best_j = j
                best_combined_wc = combined_wc

        if best_j is None:
            i += 1
            continue

        if best_j < i:
            # Partner sits earlier: append to it, then step back because
            # the list shrinks by one
            merged[best_j] += "\n\n" + merged[i]
            del merged[i]
            i -= 1
        else:
            # Partner sits later: prepend to it; index i now holds the next chunk
            merged[best_j] = merged[i] + "\n\n" + merged[best_j]
            del merged[i]
    return merged


def split_passages(data_dict, max_words=250, min_words=100):
    """Split each condition's passages into chunks bounded by word count.

    Passages of at most max_words words are kept untouched. Longer ones are
    cut at "h1 "-"h4 " heading lines, over-long segments are halved at
    sentence boundaries, the pieces are packed greedily up to max_words, and
    chunks under min_words are merged into another chunk when the total still
    fits.

    Args:
        data_dict: Mapping of condition name to a dict with a "passages" list
            (items carry "text", "url" and optionally "review_info") and
            optional "metadata".
        max_words: Word limit used when packing and merging chunks.
        min_words: Chunks below this size are candidates for merging.

    Returns:
        Dict mapping condition name to {"chunks": [...], "metadata": {...}},
        where each chunk has "text", "source" and "review_info".
    """
    new_data_dict = {}

    for condition, condition_data in data_dict.items():
        final_chunks = []

        for passage_data in condition_data["passages"]:
            passage = passage_data["text"]
            source_url = passage_data["url"]
            review_info = passage_data.get("review_info", None)

            if len(passage.split()) <= max_words:
                texts = [passage]
            else:
                pieces = []
                for segment, h1, h2, h3 in _segment_by_headings(passage):
                    pieces.extend(_split_segment(segment, h1, h2, h3, max_words))
                texts = _absorb_short_chunks(
                    _pack_greedy(pieces, max_words), max_words, min_words
                )

            final_chunks.extend(
                {"text": chunk_text, "source": source_url, "review_info": review_info}
                for chunk_text in texts
            )

        new_data_dict[condition] = {
            "chunks": final_chunks,
            "metadata": condition_data.get("metadata", {})
        }

    print("Split into chunks using headings")
    return new_data_dict

# Chunk the scraped passages; min_words=80 overrides the function default of 100
final_data = split_passages(all_condition_data, max_words=250, min_words=80)

Split into chunks using headings


In [None]:
# Count and display every chunk longer than 300 words
overmax_count = 0
max_words = 300

for condition, condition_data in final_data.items():
    for chunk in condition_data["chunks"]:
        wc = len(chunk["text"].split())
        if wc <= max_words:
            continue
        overmax_count += 1
        print(f"{condition} over the max word count, word count: {wc}")
        print(chunk["text"])
        print("-" * 100)

print(f"Total over max word count : {overmax_count}")


Noonan syndrome over the max word count, word count: 304
h1 Characteristics - Noonan syndrome
h2 Other characteristics
Other less common characteristics of Noonan syndrome can include:  learning disability - children with Noonan syndrome tend to have a slightly lower-than-average IQ and a small number have learning disabilities , although these are often mild  feeding problems - babies with Noonan syndrome may have problems sucking and chewing, and may vomit soon after eating  behavioural problems - some children with Noonan syndrome may be fussy eaters, behave immaturely compared to children of a similar age, have problems with attention and have difficulty recognising or describing their or other people's emotions  increased bruising or bleeding - sometimes the blood doesn't clot properly, which can make children with Noonan syndrome more vulnerable to bruising and heavy bleeding from cuts or medical procedures  eye conditions - including a squint (where the eyes point in different d

## Second check for passages < min_words (Passages not passed through heading split code)

In [None]:
def merge_too_short_chunks(data_dict, merge_below=80, target_below=245, max_words=300):
    """Fold chunks shorter than merge_below into a larger chunk of the same condition.

    Each short chunk is appended to the eligible chunk with the highest word
    count: one that is still under target_below words and whose merged total
    stays within max_words. A short chunk with no eligible partner is kept
    and placed after the others.

    Args:
        data_dict: Mapping of condition name to {"chunks": [...], "metadata": {...}};
            every chunk dict carries a "text" key.
        merge_below: Chunks with fewer words than this are merge candidates.
        target_below: Only chunks under this word count may receive a merge.
        max_words: Upper bound on the word count of a merged chunk.

    Returns:
        A new dict with the same layout. The chunk dicts are copies, so
        data_dict and its chunks are left unmodified.
    """
    updated_data = {}

    for condition, condition_data in data_dict.items():
        short_chunks = []
        valid_chunks = []

        # Identifying short chunks
        for original in condition_data["chunks"]:
            # Work on a copy so a merge never rewrites the caller's dicts
            chunk = dict(original)
            if len(chunk["text"].split()) < merge_below:
                short_chunks.append(chunk)
            else:
                valid_chunks.append(chunk)

        # Try to merge each short chunk into a suitable valid chunk
        for short in short_chunks:
            short_wc = len(short["text"].split())
            best_fit_index = None
            min_gap = float("inf")

            for i, candidate in enumerate(valid_chunks):
                candidate_wc = len(candidate["text"].split())
                if candidate_wc < target_below and candidate_wc + short_wc <= max_words:
                    # Smallest gap to the target means the fullest eligible chunk
                    gap = target_below - candidate_wc
                    if gap < min_gap:
                        min_gap = gap
                        best_fit_index = i

            if best_fit_index is None:
                valid_chunks.append(short)
            else:
                target = valid_chunks[best_fit_index]
                target["text"] = target["text"].strip() + "\n\n" + short["text"].strip()

        updated_data[condition] = {
            "chunks": valid_chunks,
            "metadata": condition_data.get("metadata", {})
        }

    print("Merging Complete")
    return updated_data


# Second pass: fold chunks under 80 words into larger chunks, allowing up to 300 words
final_data = merge_too_short_chunks(final_data, merge_below=80, target_below=245, max_words=300)


Merging Complete


In [None]:
# Count and display every chunk that is still shorter than 80 words
undermin_count = 0
min_words = 80

for condition, condition_data in final_data.items():
    for chunk in condition_data["chunks"]:
        wc = len(chunk["text"].split())
        if wc < min_words:
            undermin_count += 1
            print(f"{condition} under the min word count, word count: {wc}")
            print(chunk["text"])
            print("-" * 100)

# The summary reports the under-minimum total (it previously said "over max")
print(f"Total under min word count : {undermin_count}")


Bedbugs under the min word count, word count: 75
h1 Bedbugs
h2 How to get rid of bedbugs
h3 Don’t
do not keep clutter around your bed  do not bring secondhand furniture indoors without carefully checking it first  do not take luggage or clothing indoors without checking it carefully if you have come from somewhere where you know there were bedbugs

h1 Bedbugs
h2 How to get rid of bedbugs
h3 Find your local council
You can find your local council on GOV.UK .
----------------------------------------------------------------------------------------------------
Total over max word count : 1


In [None]:
# Checks

# 1)
# # List all condition names with their index
# for i, name in enumerate(final_data.keys()):
#     print(f"{i}: {name}")

# 2)
# condition = "Liver cancer"  # Preview condition

# if condition in final_data:
#     data = final_data[condition]
#     chunks = data["chunks"]
#     metadata = data["metadata"]

#     print(f"{condition} — {len(chunks)} chunks")
#     print(f"Primary Source: {metadata.get('source', 'None')}")
#     print(f"Review Info:{metadata.get('review_info', 'None')}")

#     for i, chunk in enumerate(chunks):
#         text = chunk["text"]
#         source = chunk["source"]
#         wc = len(text.split())
#         print(f"Chunk {i+1} {wc} words")
#         print(f"Source: {source}")
#         print(text)

# # 3)
# # Check first entry
# if all_condition_data:
#     first_key = list(all_condition_data.keys())[0]
#     print(f"title: {first_key}")
#     print(all_condition_data[first_key])


title: AAA, see Abdominal aortic aneurysm
{'passages': [{'text': "h1 Abdominal aortic aneurysm\n\nAn abdominal aortic aneurysm (AAA) is a swelling in the aorta, the artery that carries blood from the heart to the tummy (abdomen). Most aneurysms do not cause any problems, but they can be serious because there's a risk they could burst (rupture).\n\nh2 Symptoms of abdominal aortic aneurysm\n\nAbdominal aortic aneurysm often has no symptoms.\n\nYou usually only find out you have one during an abdominal aortic aneurysm screening test or during tests for another condition.\n\nIf an aneurysm gets bigger, you might sometimes notice:\n\ntummy or back pain\n\na pulsing feeling in your tummy\n\nh2 Non-urgent advice: See a GP if:\n\nyou have tummy or back pain that does not go away or keeps coming back\n\nyou feel a lump in your tummy\n\nThese symptoms can be caused by lots of things and do not mean you have an abdominal aortic aneurysm, but it's best to get them checked.\n\nh2 Immediate action r

In [None]:
# Take the first condition name in final_data (dict iteration order) and
# print its chunked content as a spot check
first_key = next(iter(final_data))
print(f"Condition name: {first_key}")
print(final_data[first_key])


Condition name: AAA, see Abdominal aortic aneurysm
{'chunks': [{'text': "h1 Abdominal aortic aneurysm\nAn abdominal aortic aneurysm (AAA) is a swelling in the aorta, the artery that carries blood from the heart to the tummy (abdomen). Most aneurysms do not cause any problems, but they can be serious because there's a risk they could burst (rupture).\n\nh1 Abdominal aortic aneurysm\nh2 Symptoms of abdominal aortic aneurysm\nAbdominal aortic aneurysm often has no symptoms.  You usually only find out you have one during an abdominal aortic aneurysm screening test or during tests for another condition.  If an aneurysm gets bigger, you might sometimes notice:  tummy or back pain  a pulsing feeling in your tummy\n\nh1 Abdominal aortic aneurysm\nh2 Non-urgent advice: See a GP if:\nyou have tummy or back pain that does not go away or keeps coming back  you feel a lump in your tummy  These symptoms can be caused by lots of things and do not mean you have an abdominal aortic aneurysm, but it's b

## Removing duplicates

In [None]:
# Group chunks by text, remembering which condition each copy came from.
# The owner is stored alongside the chunk rather than written into it, so no
# temporary key is left behind on the chunks held by final_data.
grouped = defaultdict(list)

for condition, condition_data in final_data.items():
    for chunk in condition_data["chunks"]:
        grouped[chunk["text"]].append((condition, chunk))

# Deduplicate: every distinct text is kept once, under a single condition
deduped_chunks = defaultdict(list)

for duplicates in grouped.values():
    if len(duplicates) == 1:
        # If no duplicate
        owner, selected = duplicates[0]
    else:
        # Same text under several entries: keep the copy whose condition
        # name is longest (the first such copy on a tie)
        owner, selected = max(duplicates, key=lambda pair: len(pair[0]))
        print(f"condition: {owner}")

    # Add to deduped chunks pile
    deduped_chunks[owner].append(selected)

# Recreate final data; a condition that lost all its chunks keeps an empty list
final_data_dedup = {
    condition: {
        "chunks": deduped_chunks.get(condition, []),
        "metadata": condition_data.get("metadata", {})
    }
    for condition, condition_data in final_data.items()
}

In [None]:
# from collections import defaultdict

# text_map = defaultdict(list)

# for condition, data in final_data.items():
#     for chunk in data["chunks"]:
#         text_map[chunk["text"]].append((condition, chunk["source"]))


## Building and Exporting Corpus

In [None]:
# Flatten every condition's chunks into one list of documents; each id is
# "<condition>_<index of the chunk within that condition>"
corpus = [
    {
        "id": f"{condition}_{i}",
        "text": chunk["text"],
        "source": chunk["source"],
        "review_info": chunk["review_info"]
    }
    for condition, data in final_data_dedup.items()
    for i, chunk in enumerate(data["chunks"])
]

# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default
with open("corpus.json", "w", encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False, indent=2)


In [None]:
def format_document(text):
    """Turn tagged chunk text into labelled, more readable text.

    Blank lines are dropped. Lines starting with "h1 " to "h4 " become
    "Section:", "Subsection:", "Subsubsection:" and "Subsubsubsection:"
    lines. Any other line that starts with a lowercase letter and does not
    end with a full stop is treated as a list item and prefixed with "- ".
    Finally, whitespace directly before . , ! ? ; : is removed.

    Args:
        text: Chunk text whose headings carry an "hN " prefix.

    Returns:
        The reformatted text.
    """
    # h4 is labelled as well; otherwise an "h4 ..." line starts with a
    # lowercase letter and would be emitted as a bullet ("- h4 ...")
    heading_labels = {
        "h1 ": "Section",
        "h2 ": "Subsection",
        "h3 ": "Subsubsection",
        "h4 ": "Subsubsubsection",
    }
    output = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        label = heading_labels.get(line[:3])
        if label is not None:
            # Formatting headings
            output.append(f"{label}: {line[3:].strip()}")
        elif not line.endswith(".") and line[0].islower():
            # Formatting lists (items end without a full stop)
            output.append(f"- {line}")
        else:
            output.append(line)

    formatted = "\n".join(output)

    # Removing spaces before punctuation (\s also matches a line break)
    return re.sub(r'\s+([.,!?;:])', r'\1', formatted)

# Reformat the text of every corpus entry in place
for doc in corpus:
    doc["text"] = format_document(doc["text"])

In [None]:
# Preview the formatted text of the first 7 corpus documents
for i, doc in enumerate(corpus[:7]):
    print(f"Document {i+1}")
    print(doc["text"])
    print("-" * 100)

Document 1
Section: Acanthosis nigricans
Acanthosis nigricans is the name for dry, dark patches of skin that usually appear in the armpits, neck or groin. It could be a sign of an underlying condition, so it needs to be checked by a GP.
Section: Acanthosis nigricans
Subsection: Symptoms of acanthosis nigricans
The main symptom of acanthosis nigricans is patches of skin that are darker and thicker than usual.  They can appear anywhere on the body.  figcaption The patches are dry and feel similar to velvet.  figcaption They're most common in skin folds, such as the armpits, neck or groin.  figcaption Some people also have tiny growths (skin tags) on the patches.  The patches often appear gradually without any other symptoms.  Sometimes the skin may be itchy.
Section: Acanthosis nigricans
Subsection: Non-urgent advice: See a GP if:
- you have new dark patches on your skin  you have any skin changes you're unsure about
Section: Acanthosis nigricans
Subsection: What happens at your appointm

In [None]:
# Persist the formatted corpus to Google Drive
save_path = '/content/drive/MyDrive/corpus.json'

# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default
with open(save_path, "w", encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False, indent=2)

print(f"Corpus saved to {save_path}")

Corpus saved to /content/drive/MyDrive/corpus.json
