# I. Scraping & Semantic chunking

In [9]:
import os
import json
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import fitz  
import requests
from urllib.parse import urlparse

In [103]:
def WHO_scrapping(save_path, disease, desired_sections):
    """Scrape selected sections of a WHO fact sheet and save them as JSON.

    The page ``https://www.who.int/news-room/fact-sheets/detail/<disease>`` is
    downloaded, and for every ``<h2>`` heading inside ``<section id="content">``
    whose text contains one of ``desired_sections`` (case-insensitive), the
    ``<p>`` and ``<ul>/<li>`` siblings up to the next ``<h2>`` are collected.

    Args:
        save_path: Directory in which ``who_0.json`` is written.
        disease: Last URL segment of the WHO fact sheet (e.g. ``"asthma"``).
        desired_sections: Substrings looked for in each heading; the first one
            that matches a heading becomes (capitalized) the JSON key.

    Returns:
        None. The result is written to ``<save_path>/who_0.json`` with one key
        per matched section plus a ``"url"`` key. If the file already exists
        nothing is downloaded. Any error is reported with ``print`` rather than
        raised.
    """
    url = "https://www.who.int/news-room/fact-sheets/detail/" + disease
    file_name = "who_0.json"
    file_path = Path(save_path) / file_name

    try:
        # The saved file doubles as a cache: never re-download an existing one.
        if file_path.exists():
            print(f"Already exists, skipping: {file_name}")
            return

        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract specific h2 sections
        article = soup.find("section", {"id": "content"})
        extracted_sections = {}

        headers = article.find_all("h2") if article else []
        for h2 in headers:
            title = h2.get_text(strip=True).lower()
            # First desired key contained in the heading; compared in lower
            # case on both sides so mixed-case keys still match.
            key = next((k for k in desired_sections if k.lower() in title), None)
            if key is None:
                continue

            content = []
            for sibling in h2.find_next_siblings():
                if sibling.name == "h2":
                    break  # next section starts here
                if sibling.name == "p":
                    content.append(sibling.get_text(separator=" ", strip=True))
                elif sibling.name == "ul":
                    # A trailing comma keeps list items separated once
                    # everything is flattened into one string.
                    content.extend(
                        item.get_text(separator=" ", strip=True) + ","
                        for item in sibling.find_all("li")
                    )

            # Collapse every whitespace run (tabs, NBSP, newlines, repeated
            # spaces) to ONE space. Replacing double spaces with "" would glue
            # neighbouring words together.
            combined = " ".join(" ".join(content).split())
            cleaned = combined.replace(" ,", ",").replace(".,", ",").strip()
            extracted_sections[key.capitalize()] = cleaned

        extracted_sections["url"] = url
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(extracted_sections, f, ensure_ascii=False, indent=2)

        print(f"Scraped and saved: {file_path}")

    except Exception as e:
        # Best-effort scraper: report the failure and let the caller continue.
        print(f"Failed to download or parse: {e}")

In [152]:
def CDC_scrapping(save_path, disease, desired_pages):
    """Scrape selected sections from several CDC pages, one JSON file per page.

    Args:
        save_path: Directory that receives ``cdc_<n>.json`` files.
        disease: Path appended to ``https://www.cdc.gov/`` (e.g. ``"asthma/"``).
        desired_pages: Mapping of sub-page path -> list of heading substrings.
            The n-th entry (in iteration order) is written to ``cdc_<n>.json``.

    Returns:
        None. For each page, every ``<div class="dfe-section">`` whose ``<h2>``
        contains one of the requested substrings (case-insensitive) has the
        text of its ``<h3>``, ``<p>`` and ``<li>`` tags joined with ``", "`` and
        stored under the requested substring as key; a ``"url"`` key is added.
        Existing files are skipped, and errors are printed, not raised.
    """
    root_url = "https://www.cdc.gov/" + disease

    # Applied in order; the sequence matters because later rules see the
    # output of earlier ones.
    substitutions = (
        ("\t", " "),
        ("\xa0", " "),
        (" .", "."),
        (" ,", ","),
        ("  ", " "),
        (".,", "."),
        (":,", ":"),
        ("\"", ""),
        ("”", ""),
        ("“", ""),
    )

    # The position of a page in the mapping is its file number, whether the
    # page is skipped, scraped, or fails.
    for page_no, (sub_path, sections) in enumerate(desired_pages.items()):
        url = root_url + sub_path
        file_name = f"cdc_{page_no}.json"
        file_path = Path(save_path) / file_name

        try:
            if file_path.exists():
                print(f"Already exists, skipping: {file_name}")
                continue

            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            extracted_sections = {}

            for block in soup.find_all("div", class_="dfe-section"):
                heading = block.find("h2")
                if not heading:
                    continue

                heading_text = heading.get_text(strip=True).lower()
                # Only the first requested name found in the heading is used.
                wanted = next(
                    (name for name in sections if name.lower() in heading_text),
                    None,
                )
                if wanted is None:
                    continue

                # Sub-headings, paragraphs and list items, empty ones dropped.
                raw_texts = (
                    tag.get_text(separator=" ", strip=True)
                    for tag in block.find_all(["h3", "p", "li"])
                )
                text = ", ".join(piece for piece in raw_texts if piece)

                for old, new in substitutions:
                    text = text.replace(old, new)

                extracted_sections[wanted] = text.strip()

            extracted_sections["url"] = url
            with open(file_path, "w", encoding="utf-8") as out:
                json.dump(extracted_sections, out, ensure_ascii=False, indent=2)

            print(f"Scraped and saved: {file_path}")

        except Exception as e:
            print(f"Failed to download or parse {url}: {e}")

In [230]:
def NIH_scrapping(save_path, disease, desired_pages):
    """Scrape NHLBI (NIH) health-topic pages, one JSON file per page.

    Args:
        save_path: Directory that receives ``nih_<n>.json`` files.
        disease: Path appended to ``https://www.nhlbi.nih.gov/health/``
            (e.g. ``"asthma/"``).
        desired_pages: Sequence of sub-page paths; ``""`` requests the base
            URL itself. The n-th entry is written to ``nih_<n>.json``.

    Returns:
        None. Each saved file holds a ``"url"`` key plus one key per component
        section found in ``div.field--name-field-component-sections``. A
        section is keyed by its ``<h2>`` title, or by the page's ``<h1>`` text
        (``"No Title"`` if there is none) when it has no title of its own.
        Sections that end up with the same key are concatenated rather than
        overwritten. Existing files are skipped; pages without a section
        container are reported and not saved; errors are printed, not raised.
    """
    base_url = "https://www.nhlbi.nih.gov/health/" + disease

    # enumerate() keeps the file number tied to the page's position on every
    # path (skip, missing container, success, failure).
    for page_id, sub_path in enumerate(desired_pages):
        url = base_url + sub_path
        file_name = f"nih_{page_id}.json"
        file_path = Path(save_path) / file_name

        try:
            if file_path.exists():
                print(f"Already exists, skipping: {file_name}")
                continue

            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            extracted_sections = {"url": url}

            main_title_tag = soup.find("h1")
            if main_title_tag:
                # Remove all <span> tags so only the bare title text remains.
                for span in main_title_tag.find_all("span"):
                    span.decompose()
                main_title = main_title_tag.get_text(strip=True)
            else:
                main_title = "No Title"

            container = soup.find("div", class_="field--name-field-component-sections")
            if not container:
                print(f"No component sections found in {url}")
                continue

            components = container.find_all("div", class_="paragraph--type--component-section")
            for comp in components:
                title_tag = comp.find("h2", class_="component-section-section-title")
                content_tag = comp.find("div", class_="field--name-field-component-section-content")
                if not content_tag:
                    continue

                raw_text = content_tag.get_text(separator=" ", strip=True)
                # Collapse every whitespace run (tabs, NBSP, newlines, repeated
                # spaces) to ONE space. Replacing double spaces with "" would
                # glue neighbouring words together.
                text = (
                    " ".join(raw_text.split())
                    .replace(" ,", ",")
                    .replace(".,", ",")
                    .strip()
                )

                key = title_tag.get_text(strip=True) if title_tag else main_title
                if key in extracted_sections and key != "url":
                    # Several untitled (or identically titled) components:
                    # keep all of their text instead of only the last one.
                    extracted_sections[key] = f"{extracted_sections[key]} {text}"
                else:
                    extracted_sections[key] = text

            # Save to JSON
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(extracted_sections, f, ensure_ascii=False, indent=2)

            print(f"Scraped and saved: {file_path}")

        except Exception as e:
            # Best-effort scraper: report and move on to the next page.
            print(f"Failed to download or parse {url}: {e}")

## 1. Asthma

In [13]:
# Directory holding the raw scraped JSON files for asthma; created on demand.
ASTHMA_RAW_DIR = Path("raw_files/asthma")
ASTHMA_RAW_DIR.mkdir(parents=True, exist_ok=True)

In [232]:
# WHO fact sheet: keep only the headings containing one of these names.
WHO_scrapping(
    save_path=ASTHMA_RAW_DIR,
    disease="asthma",
    desired_sections=["overview", "impact", "symptoms", "causes", "treatment", "self-care"],
)

# CDC: one cdc_<n>.json per sub-page, numbered in the order listed here.
CDC_scrapping(
    save_path=ASTHMA_RAW_DIR,
    disease="asthma/",
    desired_pages={
        "about": ["symptoms", "diagnosis", "symptom management"],
        "control": ["Common asthma triggers"],
        "emergency": ["First steps", "Facing challenges"],
    },
)

# NIH (NHLBI): the empty string requests the topic's base URL.
NIH_scrapping(
    save_path=ASTHMA_RAW_DIR,
    disease="asthma/",
    desired_pages=[
        "",
        "symptoms",
        "attacks",
        "causes",
        "diagnosis",
        "treatment-action-plan",
        "living-with",
    ],
)

Already exists, skipping: who_0.json
Already exists, skipping: cdc_0.json
Already exists, skipping: cdc_1.json
Already exists, skipping: cdc_2.json
Scraped and saved: raw_files\asthma\nih_0.json
Scraped and saved: raw_files\asthma\nih_1.json
Scraped and saved: raw_files\asthma\nih_2.json
Scraped and saved: raw_files\asthma\nih_3.json
Scraped and saved: raw_files\asthma\nih_4.json
Scraped and saved: raw_files\asthma\nih_5.json
Scraped and saved: raw_files\asthma\nih_6.json


## 2. COPD

In [56]:
# Directory holding the raw scraped JSON files for COPD; created on demand.
COPD_RAW_DIR = Path("raw_files/chronic-obstructive-pulmonary-disease-copd")
COPD_RAW_DIR.mkdir(parents=True, exist_ok=True)

In [234]:
# WHO fact sheet: keep only the headings containing one of these names.
WHO_scrapping(
    save_path=COPD_RAW_DIR,
    disease="chronic-obstructive-pulmonary-disease-(copd)",
    desired_sections=["overview", "symptoms", "causes", "treatment", "living with copd"],
)

# CDC: a single sub-page ("about") supplies every requested section.
CDC_scrapping(
    save_path=COPD_RAW_DIR,
    disease="copd/",
    desired_pages={
        "about": [
            "What it is",
            "symptoms",
            "Complications",
            "Causes and risk factors",
            "Reducing risk",
            "Who is at risk",
            "Diagnosis",
            "Treatment and management",
        ],
    },
)

# NIH (NHLBI): the empty string requests the topic's base URL.
NIH_scrapping(
    save_path=COPD_RAW_DIR,
    disease="copd/",
    desired_pages=[
        "",
        "symptoms",
        "causes",
        "diagnosis",
        "prevention",
        "treatment",
        "living-with",
    ],
)

Already exists, skipping: who_0.json
Already exists, skipping: cdc_0.json
Scraped and saved: raw_files\chronic-obstructive-pulmonary-disease-copd\nih_0.json
Scraped and saved: raw_files\chronic-obstructive-pulmonary-disease-copd\nih_1.json
Scraped and saved: raw_files\chronic-obstructive-pulmonary-disease-copd\nih_2.json
Scraped and saved: raw_files\chronic-obstructive-pulmonary-disease-copd\nih_3.json
Scraped and saved: raw_files\chronic-obstructive-pulmonary-disease-copd\nih_4.json
Scraped and saved: raw_files\chronic-obstructive-pulmonary-disease-copd\nih_5.json
Scraped and saved: raw_files\chronic-obstructive-pulmonary-disease-copd\nih_6.json


## 3. Pneumonia

In [94]:
# Directory holding the raw scraped JSON files for pneumonia; created on demand.
PNEUMONIA_RAW_DIR = Path("raw_files/pneumonia")
PNEUMONIA_RAW_DIR.mkdir(parents=True, exist_ok=True)

In [236]:
# NOTE: WHO scraping is not run for pneumonia in this cell. The call that used
# to sit here was commented out and still carried the COPD directory and
# fact-sheet name, so it is not kept as a template.

# CDC: one cdc_<n>.json per sub-page, numbered in the order listed here.
CDC_scrapping(
    save_path=PNEUMONIA_RAW_DIR,
    disease="pneumonia/",
    desired_pages={
        "about": ["Overview", "symptoms", "Types", "Who is at risk", "Causes", "Prevention"],
        "risk-factors": [
            "People at increased risk",
            "Conditions that can increase risk",
            "Behaviors that can increase risk",
        ],
        "prevention": ["Prevention steps and strategies"],
    },
)

# NIH (NHLBI): the empty string requests the topic's base URL.
NIH_scrapping(
    save_path=PNEUMONIA_RAW_DIR,
    disease="pneumonia/",
    desired_pages=[
        "",
        "symptoms",
        "causes",
        "diagnosis",
        "prevention",
        "treatment",
        "recovery",
    ],
)

Already exists, skipping: cdc_0.json
Already exists, skipping: cdc_1.json
Already exists, skipping: cdc_2.json
Scraped and saved: raw_files\pneumonia\nih_0.json
Scraped and saved: raw_files\pneumonia\nih_1.json
Scraped and saved: raw_files\pneumonia\nih_2.json
Scraped and saved: raw_files\pneumonia\nih_3.json
Scraped and saved: raw_files\pneumonia\nih_4.json
Scraped and saved: raw_files\pneumonia\nih_5.json
Scraped and saved: raw_files\pneumonia\nih_6.json


## 4. Tuberculosis

In [84]:
# Directory holding the raw scraped JSON files for tuberculosis; created on demand.
TUBERCULOSIS_RAW_DIR = Path("raw_files/tuberculosis")
TUBERCULOSIS_RAW_DIR.mkdir(parents=True, exist_ok=True)

In [162]:
# WHO fact sheet: keep only the headings containing one of these names.
WHO_scrapping(
    save_path=TUBERCULOSIS_RAW_DIR,
    disease="tuberculosis",
    desired_sections=["overview", "symptoms", "treatment", "prevention", "diagnosis", "impact"],
)

# CDC: one cdc_<n>.json per sub-page, numbered in the order listed here.
CDC_scrapping(
    save_path=TUBERCULOSIS_RAW_DIR,
    disease="tb/",
    desired_pages={
        "about": [
            "Overview",
            "Signs and symptoms",
            "Types",
            "Risk factors",
            "How it spreads",
            "Prevention",
            "Testing",
            "Treatment",
            "Vaccines",
        ],
        "signs-symptoms": ["Signs and symptoms"],
        "causes": ["Causes", "How it spreads"],
        "vaccines": ["Overview"],
        "testing": [
            "Types of tests",
            "Why get tested",
            "Who should be tested",
            "What to do if you've tested positive",
        ],
        "exposure": [
            "Contact your health care provider if you have been exposed to TB",
            "Only persons with active TB disease can spread TB to others",
            "Contact investigations can help limit the spread of TB",
        ],
        "risk-factors": ["Places with increased risk", "Conditions that can increase risk"],
        "prevention": ["Prevention steps and strategies"],
    },
)

Already exists, skipping: who_0.json
Scraped and saved: raw_files\tuberculosis\cdc_0.json
Scraped and saved: raw_files\tuberculosis\cdc_1.json
Scraped and saved: raw_files\tuberculosis\cdc_2.json
Scraped and saved: raw_files\tuberculosis\cdc_3.json
Scraped and saved: raw_files\tuberculosis\cdc_4.json
Scraped and saved: raw_files\tuberculosis\cdc_5.json
Scraped and saved: raw_files\tuberculosis\cdc_6.json
Scraped and saved: raw_files\tuberculosis\cdc_7.json


## 5. Covid

In [80]:
# Directory holding the raw scraped JSON files for COVID-19; created on demand.
COVID_RAW_DIR = Path("raw_files/coronavirus-disease-(covid-19)")
COVID_RAW_DIR.mkdir(parents=True, exist_ok=True)

In [164]:
# WHO fact sheet: keep only the headings containing one of these names.
WHO_scrapping(
    save_path=COVID_RAW_DIR,
    disease="coronavirus-disease-(covid-19)",
    desired_sections=["overview", "symptoms", "treatment", "prevention"],
)

# CDC: one cdc_<n>.json per sub-page, numbered in the order listed here.
CDC_scrapping(
    save_path=COVID_RAW_DIR,
    disease="covid/",
    desired_pages={
        "about": ["Learn about COVID-19 and how it spreads"],
        "signs-symptoms": [
            "Signs and symptoms",
            "When to seek emergency help",
            "Difference between flu and COVID-19",
        ],
        "risk-factors": ["Overview", "Conditions that can increase risk"],
        "testing": [
            "Types of tests",
            "Choosing a COVID-19 test",
            "Interpreting your results",
        ],
        "treatment": ["COVID-19 Treatment Options", "Preventing COVID-19"],
        "prevention": ["Core Prevention Strategies", "What to watch out for"],
    },
)

Already exists, skipping: who_0.json
Scraped and saved: raw_files\coronavirus-disease-(covid-19)\cdc_0.json
Scraped and saved: raw_files\coronavirus-disease-(covid-19)\cdc_1.json
Scraped and saved: raw_files\coronavirus-disease-(covid-19)\cdc_2.json
Scraped and saved: raw_files\coronavirus-disease-(covid-19)\cdc_3.json
Scraped and saved: raw_files\coronavirus-disease-(covid-19)\cdc_4.json
Scraped and saved: raw_files\coronavirus-disease-(covid-19)\cdc_5.json
