<a href="https://www.kaggle.com/code/emmanuelniyioriolowo/data-ingestion-extended?scriptVersionId=283336803" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Initial Setup

In [2]:
# Install required PDF processing libraries
!pip install reportlab
!pip install fpdf
!pip install pdfplumber

Collecting reportlab
  Downloading reportlab-4.4.5-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.5-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.5
Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=2b395aa181897d55d5507ba84ccb921fa3aac0722b4ec7af87990bdc7fc49cc7
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 

In [3]:
# Standard library imports
import os
import re
import datetime as dt
import shutil
from urllib.parse import urljoin

# Third-party libraries
import requests
import pdfplumber
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from reportlab.pdfgen import canvas
from fpdf import FPDF
from typing import Optional, Dict, Any, Tuple

In [4]:
# Base configuration for dataset extraction
BASE_URL = "https://ncdc.gov.ng"
HTML_FILE = "/kaggle/input/ncdc-table/ncdc_lassa_fever_html_table.html"
SAVE_DIR = "/kaggle/working/ncdc_reports"

# Starting point for serial number and week tracking
START_SN = 307      # Serial number for Week 1, 2020
START_YEAR = 2020
START_WEEK = 1

# Make and ensure output directory exists
os.makedirs(SAVE_DIR, exist_ok=True)

## URL extraction from HTML file

In [5]:
def insert_missing_week_clean(pdf_data, after_sn, missing_week, year=None):
    """
    Inserts a missing weekly report into a sequential (SN-sorted) list.
    
    The function:
    - Locates the entry with serial number `after_sn`
    - Inserts a placeholder record for `missing_week` directly after it
    - Shifts all subsequent serial numbers by +1 to keep numbering consistent
    
    Args:
        pdf_data (list): List of tuples -> (sn, title, url)
        after_sn (int): Serial number after which the missing week is inserted
        missing_week (int): Epi week number to insert
        year (int, optional): Year for the missing entry title
    
    Returns:
        list: Updated list with corrected SN sequence
    """
    if year is None:
        year = dt.datetime.now().year

    new_data = []
    inserted = False

    for i, (sn, title, url) in enumerate(pdf_data):
        # Append current entry before checking for insertion point
        new_data.append((sn, title, url))

        # Insert placeholder immediately after the target SN
        if sn == after_sn and not inserted:
            missing_title = (
                f"An update of Lassa fever outbreak in Nigeria for Week {missing_week}, {year}"
            )

            # Add the missing week entry
            new_data.append((after_sn + 1, missing_title, None))
            inserted = True

            # Shift all subsequent SN values by +1
            for remaining_sn, remaining_title, remaining_url in pdf_data[i + 1:]:
                new_data.append((remaining_sn + 1, remaining_title, remaining_url))
            break

    return new_data

In [6]:
# Load saved HTML table from local file
with open(HTML_FILE, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")  # parse HTML with BeautifulSoup

# Get all table rows, skipping header
rows = soup.find_all("tr")[1:]  # skip header row

# Initialize list to store serial number (SN) and PDF link info
pdf_data = []

# Extract SN, title, and PDF URL from each row
for row in rows:
    cols = row.find_all("td")
    if len(cols) >= 3:
        sn = cols[0].get_text(strip=True)  # extract serial number
        title = cols[1].get_text(strip=True)  # extract report title
        link_tag = cols[2].find("a", href=True)  # find anchor tag in third column
        if link_tag:
            pdf_url = urljoin(BASE_URL, link_tag['href'])  # construct full PDF URL
            pdf_data.append((int(sn), title, pdf_url))

# Sort the PDF data by serial number ascending
pdf_data.sort(key=lambda x: x[0])

# Output total number of PDFs found
print(f"Total PDFs found: {len(pdf_data)}")

# Preview relevant entries
# for sn, title, url in pdf_data[:306]:
for sn, title, url in pdf_data[:9]:
    print(f"SN {sn} | Title: {title} | URL: {url}")

Total PDFs found: 442
SN 1 | Title: An update of Lassa fever outbreak in Nigeria for Week 46 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/65b81d453825965fc88b202a61101c4f.pdf
SN 2 | Title: An update of Lassa fever outbreak in Nigeria for Week 45 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/471dd6384cd4f288167c0881efee40ee.pdf
SN 3 | Title: An update of Lassa fever outbreak in Nigeria for Week 44 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/9910b5860f5b9c377cf14a8992d6905b.pdf
SN 4 | Title: An update of Lassa fever outbreak in Nigeria for Week 43 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/55b33161bbfe39fd72039808875e5c45.pdf
SN 5 | Title: An update of Lassa fever outbreak in Nigeria for Week 42 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/91f559d691c7cd411f7058f1496eef6f.pdf
SN 6 | Title: An update of Lassa fever outbreak in Nigeria for Week 41 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/68f0ee9000f3084b74ed892be0d3f79f.pdf


**On inspection of the URLS there is a missing issing upload at index 181 ----> 2022 W 22**

In [7]:
for sn, title, url in pdf_data[179:182]:
    print(f"SN {sn} | Title: {title} | URL: {url}")

SN 180 | Title: An update of Lassa fever outbreak in Nigeria for Week 23 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/415ea980abafa9fb2547cbcd0362004e.pdf
SN 181 | Title: An update of Lassa fever outbreak in Nigeria for Week 21 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/6ce166b15bffc6986d356448d48d6a34.pdf
SN 182 | Title: An update of Lassa fever outbreak in Nigeria for Week 20 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/e1e22accee2e5c3272b863084821ff16.pdf


In [8]:
# create a placeholder 
pdf_data = insert_missing_week_clean(pdf_data, after_sn=180, missing_week=22)

In [9]:
for sn, title, url in pdf_data[178:183]:
    print(f"SN {sn} | Title: {title} | URL: {url}")

SN 179 | Title: An update of Lassa fever outbreak in Nigeria for Week 24 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/317f89c26656c4b549e07e84a720db5b.pdf
SN 180 | Title: An update of Lassa fever outbreak in Nigeria for Week 23 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/415ea980abafa9fb2547cbcd0362004e.pdf
SN 181 | Title: An update of Lassa fever outbreak in Nigeria for Week 22, 2025 | URL: None
SN 182 | Title: An update of Lassa fever outbreak in Nigeria for Week 21 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/6ce166b15bffc6986d356448d48d6a34.pdf
SN 183 | Title: An update of Lassa fever outbreak in Nigeria for Week 20 | URL: https://ncdc.gov.ng/themes/common/files/sitreps/e1e22accee2e5c3272b863084821ff16.pdf


## PDF file download

In [10]:
# Helper functions for downloading PDFs

def has_53_weeks(year: int) -> bool:
    """
    Check if a given year has 53 ISO weeks.
    
    Args:
        year (int): Year to check.
        
    Returns:
        bool: True if year has 53 weeks, else False.
    """
    # ISO calendar: week number of Dec 31 determines if year has 53 weeks
    return dt.date(year, 12, 31).isocalendar()[1] == 53


def download_pdf(url: str, filename: str) -> bool:
    """
    Download a PDF from a URL and save it locally.
    
    Args:
        url (str): URL of the PDF.
        filename (str): Name to save the PDF as.
        
    Returns:
        bool: True if download succeeds, False otherwise.
    """
    try:
        response = requests.get(url, timeout=30)  # fetch PDF
        response.raise_for_status()  # raise error for bad status
        with open(os.path.join(SAVE_DIR, filename), 'wb') as f:
            f.write(response.content)  # save file
        print(f"✓ Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"✗ Failed to download {filename}: {e}")
        return False


def create_placeholder_pdf(filename: str, week_info: str):
    """
    Create a placeholder PDF for weeks with missing reports.
    
    Args:
        filename (str): Name to save the PDF as.
        week_info (str): Week information to display in the PDF.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Report Not Available", ln=1, align='C')
    pdf.cell(200, 10, txt=f"Week {week_info}", ln=1, align='C')
    pdf.cell(200, 10, txt="Placeholder for missing report", ln=1, align='C')
    pdf.output(os.path.join(SAVE_DIR, filename))  # save PDF

In [11]:
"""
Process PDFs for all serial numbers (SN) from START_SN downward.
Downloads existing PDFs or creates placeholders if missing.
Calculates the correct year and ISO week, accounting for 52/53-week years.

For faster execution and smaller repository size, the default configuration downloads only 10 reports.
If you require the complete dataset, you may uncomment the relevant lines in the ingestion script to enable full PDF download. 
Users are encouraged to adjust this based on their computational needs and internet bandwidth.
"""

# Determine the actual minimum and maximum SN in the data
min_sn = min(sn for sn, _, _ in pdf_data)
max_sn = max(sn for sn, _, _ in pdf_data)

print(f"SN range in data: {min_sn} to {max_sn}")
print(f"\nProcessing SN range: {START_SN} to {min_sn}")

year = START_YEAR
week = START_WEEK

# Loop over SNs from START_SN downward
# for sn in range(START_SN, 0, -1): # uncomment when not testing
for sn in range(START_SN, START_SN - 10, -1): # for testing
    # Calculate week offset from START_SN

    
    # Adjust year and week if week exceeds total weeks in current year
    weeks_in_year = 53 if has_53_weeks(year) else 52

    # Construct filename for the PDF
    filename = f"{year}-W{week:02d}.pdf"
    
    # Find PDF info for this SN
    pdf_info = next((item for item in pdf_data if item[0] == sn), None)
    
    if pdf_info:
        # Download the PDF if it exists
        _, title, url = pdf_info
        print(f"SN {sn}: Downloading {filename} - {title}")
        success = download_pdf(url, filename)
        if not success:
            # Create placeholder if download failed
            print(f"SN {sn}: Creating placeholder for {filename} (download failed)")
            create_placeholder_pdf(filename, f"{year} Week {week}")
    else:
        # Create a placeholder PDF if missing
        print(f"SN {sn}: Creating placeholder for {filename} (missing)")
        create_placeholder_pdf(filename, f"{year} Week {week}")

    if week >= weeks_in_year:
        year += 1
        week -= weeks_in_year
    week += 1

print(f"\nProcessing complete!")
print(f"Files saved to: {SAVE_DIR}")

SN range in data: 1 to 443

Processing SN range: 307 to 1
SN 307: Downloading 2020-W01.pdf - An update of Lassa fever outbreak in Nigeria for Week 1
✓ Downloaded: 2020-W01.pdf
SN 306: Downloading 2020-W02.pdf - An update of Lassa fever outbreak in Nigeria for Week 2
✓ Downloaded: 2020-W02.pdf
SN 305: Downloading 2020-W03.pdf - An update of Lassa fever outbreak in Nigeria for Week 3
✓ Downloaded: 2020-W03.pdf
SN 304: Downloading 2020-W04.pdf - An update of Lassa fever outbreak in Nigeria for Week 4
✓ Downloaded: 2020-W04.pdf
SN 303: Downloading 2020-W05.pdf - An update of Lassa fever outbreak in Nigeria for Week 5
✓ Downloaded: 2020-W05.pdf
SN 302: Downloading 2020-W06.pdf - An update of Lassa fever outbreak in Nigeria for Week 6
✓ Downloaded: 2020-W06.pdf
SN 301: Downloading 2020-W07.pdf - An update of Lassa fever outbreak in Nigeria for Week 7
✓ Downloaded: 2020-W07.pdf
SN 300: Downloading 2020-W08.pdf - An update of Lassa fever outbreak in Nigeria for Week 8
✓ Downloaded: 2020-W08.pd

## PDF Data Extraction

### Helper Function Definition

In [12]:
# Helper functions 
def get_week_year_from_sn(sn: int) -> tuple[int, int]:
    """
    Calculate ISO year and week number from serial number (SN).
    
    Args:
        sn (int): Serial number (week count)
    
    Returns:
        tuple[int, int]: (year, week) corresponding to the SN
    """
    weeks_passed = sn - START_SN  # difference from starting SN
    current_date = dt.datetime(START_YEAR, 1, 1)
    
    # Find the first Thursday of the year (ISO week rule)
    while current_date.weekday() != 3:  # 3 = Thursday
        current_date += dt.timedelta(days=1)
    
    # Add weeks passed to get target date
    target_date = current_date + dt.timedelta(weeks=weeks_passed)
    year = target_date.isocalendar()[0]
    week = target_date.isocalendar()[1]
    
    return year, week


def get_week_start_and_end_date(year: int, week: int) -> tuple[dt.datetime, dt.datetime]:
    """
    Calculate the start (Monday) and end (Sunday) dates of a given ISO week.
    
    Args:
        year (int): ISO year
        week (int): ISO week number
    
    Returns:
        tuple(datetime, datetime): (week_start, week_end)
    """
    # Get first day of the year
    first_jan = dt.datetime(year, 1, 1)
    
    # Find the first Thursday of the year (ISO week 1 contains Jan 4th)
    while first_jan.weekday() != 3:  # 3 = Thursday
        first_jan += dt.timedelta(days=1)
    
    # Calculate Monday of the ISO week
    week_start = first_jan + dt.timedelta(weeks=week-1) - dt.timedelta(days=3)
    week_end = week_start + dt.timedelta(days=6)  # Sunday of the same week
    
    return week_start, week_end


In [13]:
def extract_table_from_pdf(pdf_path: str) -> list:
    """
    Extract tables from the first page of a PDF and return their content.
    
    Args:
        pdf_path (str): Path to the PDF file.
    
    Returns:
        list: Extracted tables as lists of rows (each row is a list of cell values).
    """
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[0]  # use first page
        
        # Settings to guide table extraction
        table_settings = {
            "vertical_strategy": "lines",         # use vertical lines for columns
            "horizontal_strategy": "lines",       # use horizontal lines for rows
            "explicit_vertical_lines": [],        # no manually specified vertical lines
            "explicit_horizontal_lines": [],      # no manually specified horizontal lines
            "snap_tolerance": 3,                  # tolerance for snapping edges
            "join_tolerance": 3,                  # tolerance for joining adjacent lines
            "edge_min_length": 3,                 # minimum line length to consider
            "min_words_vertical": 2,              # min words to form vertical segment
            "min_words_horizontal": 1,            # min words to form horizontal segment
        }
        
        tables = page.extract_tables(table_settings)  # extract tables
        return tables

In [14]:
def extract_case_data_from_table(data: list) -> Optional[Tuple[Any, Any, Any, Any]]:
    """
    Extract suspected, confirmed, probable cases, and deaths from a table.
    Searches for the row containing 'Current week' or 'Currentweek' and
    handles different table layouts.
    
    Args:
        data (list): Nested list representing the PDF table content.
    
    Returns:
        tuple or None: (suspected, confirmed, probable, deaths) if found, else None.
    """

    # --- recursive search for row containing "Current week" ---
    def search(lst: list) -> Optional[list]:
        for item in lst:
            if isinstance(item, list):
                found = search(item)
                if found:
                    return found
            elif isinstance(item, str) and ("Current week" in item or "Currentweek" in item):
                return lst
        return None

    row = search(data)
    
    # fallback for uncommon table patterns
    if not row:
        if len(data) == 5:
            try:
                section = data[3][6]
                return section[0], section[3], section[6], section[9]
            except Exception:
                return None
        return None

    # standard layout (row has consecutive values)
    if len(row) >= 5 and row[1] is not None:
        try:
            return row[1], row[2], row[3], row[4]
        except Exception:
            pass

    # sparse layout (values spread out across row)
    if len(row) >= 13:
        try:
            return row[3], row[6], row[9], row[12]
        except Exception:
            pass

    return None  # no valid data found

In [15]:
def extract_epi_week(pdf_path: str) -> Optional[str]:
    """
    Extract the Epi Week line from a PDF.
    Handles multiple formats by using regex patterns and checks first few pages.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str or None: Extracted Epi Week string if found, else None.
    """
    # Patterns to match different Epi Week formats
    patterns = [
        r"EpiWeek\s*\d+:\s*\d+[–-]\d+[A-Za-z]+\d{4}",                        # Compressed format: "EpiWeek20:17–23May2021"
        r"Epi Week\s*\d+:\s*\d+(?:st|nd|rd|th)?\s*[–-]\s*\d+(?:st|nd|rd|th)?\s*[A-Za-z]+\s*\d{4}",  # e.g., "Epi Week 30: 22nd – 28th July 2024"
        r"Epi Week\s*\d+:\s*\d+\s*[–-]\s*\d+\s*[A-Za-z]+\s*\d{4}",           # e.g., "Epi Week 29: 18 – 24 July 2022"
        r"Epi Week\s*\d+[^\n]{0,100}\d{4}",                                  # More general
        r"Epi Week[^\n]{0,150}",                                             # Even more general
    ]
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages[:3]):  # Check first 3 pages
                text = page.extract_text()
                
                if text:
                    # Clean text: remove extra whitespace
                    cleaned_text = ' '.join(text.split())
                    
                    # Search for patterns
                    for pattern in patterns:
                        match = re.search(pattern, cleaned_text, re.IGNORECASE)
                        if match:
                            return match.group().strip()
                    
                    # Fallback: line-by-line search for "Epi Week"
                    for line in text.split('\n'):
                        if 'Epi Week' in line:
                            return ' '.join(line.split())  # clean whitespace
                            
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None
    
    return None  # return None if not found


### Main

In [16]:
def get_data_from_pdf(pdf_path: str) -> Optional[Dict[str, Any]]:
    """
    Extract epidemiological data from a Lassa Fever PDF report.
    Retrieves Epi Week, suspected, confirmed, probable cases, and deaths.
    Handles multiple table formats.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        dict or None: Dictionary with keys:
            - 'epi_week'
            - 'suspected_cases'
            - 'confirmed_cases'
            - 'probable_cases'
            - 'deaths'
        Returns None if the PDF is missing or cannot be processed.
    """
    try:
        # Extract Epi Week string
        epi_week = extract_epi_week(pdf_path)

        # Extract table content and parse case numbers
        table_content = extract_table_from_pdf(pdf_path)
        tup = extract_case_data_from_table(table_content)
        
        # Return structured data as dictionary
        return {
            'epi_week': epi_week,
            'suspected_cases': tup[0],
            'confirmed_cases': tup[1],
            'probable_cases': tup[2],
            'deaths': tup[3]
        }

    except FileNotFoundError:
        print(f"Error: File not found at {pdf_path}")
        return None
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return None

In [17]:
def process_all_reports(directory_path: str) -> pd.DataFrame:
    """
    Process all PDF files in a directory and extract epidemiological data
    for time series analysis.

    Args:
        directory_path (str): Path to directory containing PDF files.

    Returns:
        pd.DataFrame: DataFrame with columns including week start/end dates,
        Epi Week info, cases (suspected, confirmed, probable, deaths),
        extraction method, and timestamp.
    """
    data_records = []

    # Get all PDF files and sort by filename
    pdf_files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]
    pdf_files.sort()  # ensures chronological order like 2020-W01.pdf, 2020-W02.pdf, etc.

    print(f"Found {len(pdf_files)} PDF files to process")

    for filename in pdf_files:
        # Extract year and week from filename
        match = re.match(r'(\d{4})-W(\d{2})\.pdf', filename)
        if not match:
            print(f"Skipping invalid filename: {filename}")
            continue

        year = int(match.group(1))
        week = int(match.group(2))

        # Calculate start (Monday) and end (Sunday) dates of the ISO week
        week_start_date, week_end_date = get_week_start_and_end_date(year, week)

        # Full path to PDF
        pdf_path = os.path.join(directory_path, filename)

        # Extract data from PDF
        result = get_data_from_pdf(pdf_path)

        if result:
            # Successful extraction
            record = {
                'week_start_date': week_start_date,
                'week_end_date': week_end_date,
                'epi_year': year,
                'epi_week': week,
                'filename': filename,
                'epi_week_info': result['epi_week'],
                'suspected_cases': result['suspected_cases'],
                'confirmed_cases': result['confirmed_cases'],
                'probable_cases': result['probable_cases'],
                'deaths': result['deaths'],
                'extraction_method': "automated",
                'extraction_timestamp': dt.datetime.now().isoformat(),
            }
            data_records.append(record)
            print(f"✓ Processed {filename}: {result['confirmed_cases']} confirmed cases")
        else:
            # For placeholder files or failed extraction
            record = {
                'week_start_date': week_start_date,
                'week_end_date': week_end_date,
                'epi_year': year,
                'epi_week': week,
                'filename': filename,
                'epi_week_info': None,
                'suspected_cases': None,
                'confirmed_cases': None,
                'probable_cases': None,
                'deaths': None,
                'extraction_method': "manual",
                'extraction_timestamp': None,
            }
            data_records.append(record)
            print(f"✗ Failed to extract data from {filename}")

    # Create DataFrame
    df = pd.DataFrame(data_records)

    # Sort by week start date to ensure chronological order
    df = df.sort_values('week_start_date').reset_index(drop=True)

    return df


# ------------------ Main execution ------------------
if __name__ == "__main__":
    directory_path = "/kaggle/working/ncdc_reports"

    # Process all reports in the directory
    final_df = process_all_reports(directory_path)

    # Display summary
    print("\n" + "="*50)
    print("EXTRACTED DATA SUMMARY")
    print("="*50)
    print(f"Total records: {len(final_df)}")
    print(f"Records with case data: {final_df['confirmed_cases'].notna().sum()}")
    print(f"Date range: {final_df['week_start_date'].min()} to {final_df['week_start_date'].max()}")

Found 10 PDF files to process
✓ Processed 2020-W01.pdf:  confirmed cases
✓ Processed 2020-W02.pdf:  confirmed cases
✓ Processed 2020-W03.pdf:  confirmed cases
✓ Processed 2020-W04.pdf: 95 confirmed cases
✓ Processed 2020-W05.pdf: 104 confirmed cases
✓ Processed 2020-W06.pdf: 109 confirmed cases
✓ Processed 2020-W07.pdf: 115 confirmed cases
✓ Processed 2020-W08.pdf: 102 confirmed cases
✓ Processed 2020-W09.pdf: 85 confirmed cases
✓ Processed 2020-W10.pdf: 81 confirmed cases

EXTRACTED DATA SUMMARY
Total records: 10
Records with case data: 10
Date range: 2019-12-30 00:00:00 to 2020-03-02 00:00:00


### Export / Save Processed Data

In [18]:
# Add PDF URLs to final DataFrame 
urls = []

# Start from START_SN - 1 to match the first PDF in pdf_data
index = START_SN - 1

for sn in range(len(final_df)):
    appropriate_url = pdf_data[index][2]
    urls.append(appropriate_url)

    index -= 1 
    
final_df["report_pdf_url"] = urls

In [19]:
# Save to CSV
csv_path = "/kaggle/working/lassa_fever_timeseries.csv"
final_df.to_csv(csv_path, index=False)
print(f"\nData saved to: {csv_path}")


Data saved to: /kaggle/working/lassa_fever_timeseries.csv


## Cleanup / Additional Formatting

In [20]:
# create zip file 

directory_path = "/kaggle/working/ncdc_reports"
zip_filename = "/kaggle/working/ncdc_reports.zip"

# Create zip file
shutil.make_archive(
    base_name="/kaggle/working/ncdc_reports", 
    format='zip',
    root_dir=directory_path
)

print(f"Zip file created: {zip_filename}")

Zip file created: /kaggle/working/ncdc_reports.zip


In [21]:
# delete all files 
import os
import glob

directory_path = "/kaggle/working/ncdc_reports"

files = glob.glob(os.path.join(directory_path, "*"))
for file in files:
    if os.path.isfile(file):
        os.remove(file)
        print(f"Deleted: {file}")

print("All files deleted!")

Deleted: /kaggle/working/ncdc_reports/2020-W06.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W04.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W05.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W03.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W09.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W07.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W02.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W01.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W10.pdf
Deleted: /kaggle/working/ncdc_reports/2020-W08.pdf
All files deleted!


In [22]:
from IPython.display import FileLink
FileLink('ncdc_reports.zip')