<div align="center"><img src="../../images/LKYCIC_Header.jpg"></div>

# Extracting and Processing Dengue Cluster Data from NEA Website

## Step 1: Import necessary libraries

1. Importing **datetime** to get the current timestamp
2. Importing **requests** for sending HTTP requests
3. Importing **BeautifulSoup** for parsing HTML content
4. Importing **pandas** for data manipulation and saving results

In [None]:
%pip install requests beautifulsoup4 pandas

In [1]:
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Step 2: Fetch the current timestamp
now = datetime.now()

# transform the timestamp into a string
now_str = now.strftime("%Y_%m%d_%H%M")
print("Current Timestamp:", now_str)

Current Timestamp: 2025_0120_1640


In [3]:
# Step 3: Define the URL of the webpage to scrape
url = "https://www.nea.gov.sg/dengue-zika/dengue/dengue-clusters"

# Step 4: Send an HTTP GET request to fetch the HTML content
response = requests.get(url)

# Check if the request was successful
response.raise_for_status()  # Raises an exception if an HTTP error occurs

# Print a success message
print("HTML content successfully fetched!")

# Step 5: Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
print("HTML content parsed successfully!")

HTML content successfully fetched!
HTML content parsed successfully!


In [4]:
# Step 6: Extract the date from the HTML paragraph element
# Locate the paragraph containing the date
info_paragraph = soup.find("p", text=lambda t: t and "accurate as at" in t)

# Extract the date using string manipulation if the paragraph exists
if info_paragraph:
    date_text = info_paragraph.text.strip()
    extracted_date = date_text.split("accurate as at")[-1].strip()
    print("Extracted Date:", extracted_date)
else:
    extracted_date = None
    print("Date information not found.")

extracted_date = extracted_date.replace(" ", "_") # Replace spaces with underscores

Extracted Date: 17 Jan 2025


  info_paragraph = soup.find("p", text=lambda t: t and "accurate as at" in t)


In [5]:
# create a folder using the extracted date under the data folder
folder_name = f"../data/{now_str}"
print("Folder Name:", folder_name)

import os

# Create a new directory to store the data
os.makedirs(folder_name, exist_ok=True)

Folder Name: ../data/2025_0120_1640


In [6]:
# Step 6: Extract and process subtables (class 'locality-details')
rows = []  # List to store extracted rows

# Locate all subtables
for subtable in soup.find_all("table", class_="locality-details"):
    if subtable:
        # Extract table headers
        headers = [th.text.strip() for th in subtable.find_all("th")]

        # Extract each row's data
        for row in subtable.find_all("tr"):
            cells = [cell.text.strip() for cell in row.find_all("td")]
            if cells:  # Skip rows without data
                rows.append(cells)

        # Remove the processed subtable from the HTML document
        subtable.decompose()

# Save extracted subtables to a DataFrame
if rows:
    subtable_df = pd.DataFrame(rows, columns=headers)
    subtable_df.to_csv(f"{folder_name}/dengue_clusters_with_subtables_{extracted_date}.csv", index=False)
    print("Subtables processed and saved as 'dengue_clusters_with_subtables.csv'.")
else:
    print("No subtables found or extracted.")

Subtables processed and saved as 'dengue_clusters_with_subtables.csv'.


In [7]:
# Step 7: Extract and process the main table (id 'surveillance-table')
main_table = soup.find("table", id="surveillance-table")

if main_table:
    # Extract headers for the main table
    headers = [th.text.strip() for th in main_table.find_all("th")]

    # Extract rows of the main table
    rows = []
    for row in main_table.find_all("tr"):
        cells = [cell.text.strip() for cell in row.find_all("td")]
        if cells:  # Skip rows without data
            rows.append(cells)

    # Save the main table data to a DataFrame
    main_table_df = pd.DataFrame(rows, columns=headers)
    main_table_df.to_csv(f"{folder_name}/dengue_clusters_without_subtables_{extracted_date}.csv", index=False)
    print("Main table processed successfully and saved as 'dengue_clusters_without_subtables.csv'.")
else:
    print("Main table with id 'surveillance-table' not found on the page.")

# Step 8: Completion message
print("Data extraction process completed successfully!")

Main table processed successfully and saved as 'dengue_clusters_without_subtables.csv'.
Data extraction process completed successfully!
