# PDF Sourcing

In [None]:
!pip install requests beautifulsoup4

In [None]:
import os
import requests
from bs4 import BeautifulSoup

def download_lassa_pdfs(output_dir="PDFs", timeout=30):
    """
    Download every Lassa fever situation-report PDF linked from the NCDC list page.

    The list page is fetched once, the first <tbody> on it is scanned row by
    row, and the link found in the third cell of each row is saved into
    ``output_dir``. Files that already exist locally are not downloaded again.
    A failed PDF download is reported and skipped; it does not stop the run.

    Args:
        output_dir (str): Folder the PDFs are written to (created if missing).
            Defaults to "PDFs", the folder name the other cells of this
            notebook read from.
        timeout (float): Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the list page itself cannot be fetched
            (connection failure, timeout, or an HTTP error status).
    """
    # Function-scope import so the block brings its own URL helpers.
    from urllib.parse import urljoin, urlparse

    # The specific page that lists all the Lassa fever situation reports
    list_page_url = (
        "https://ncdc.gov.ng/diseases/sitreps/?cat=5&name=An%20update%20of%20Lassa%20fever%20outbreak%20in%20Nigeria"
    )

    # Create a local folder to store the PDFs
    os.makedirs(output_dir, exist_ok=True)

    # 1. Fetch the HTML (with a timeout, so a stalled server cannot hang the cell)
    print(f"Fetching list page: {list_page_url}")
    response = requests.get(list_page_url, timeout=timeout)
    response.raise_for_status()  # raise an error if the HTTP request failed

    # 2. Parse the HTML
    soup = BeautifulSoup(response.text, "html.parser")

    # The table is inside <tbody>. Each row has multiple <td>,
    # and the third <td> has the <a> with the PDF link
    table_body = soup.find("tbody")
    if table_body is None:
        print("Could not find <tbody> on the page.")
        return

    rows = table_body.find_all("tr")
    if not rows:
        print("No <tr> found inside <tbody>.")
        return

    # For stats
    total_found = 0
    total_downloaded = 0

    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 3:
            # We expect 3 <td> in each row: (1) index, (2) description, (3) the PDF link
            continue

        link_tag = cells[2].find("a", href=True)
        if not link_tag:
            continue

        # Resolve the href against the list page: site-relative links such as
        # "/themes/common/files/sitreps/..." get the https://ncdc.gov.ng
        # prefix, and absolute links are left as they are.
        pdf_url = urljoin(list_page_url, link_tag["href"])

        # Prefer the suggested name from the "download" attribute; otherwise
        # fall back to the last path segment of the URL.
        download_name = link_tag.get("download") or urlparse(pdf_url).path.split("/")[-1]

        # The name comes from remote HTML: keep only its final path component
        # so it can never point outside output_dir, then replace spaces.
        download_name = os.path.basename(download_name.replace("\\", "/")).replace(" ", "_")
        if download_name in ("", ".", ".."):
            print(f"Skipping link without a usable file name: {pdf_url}")
            continue

        total_found += 1

        # 3. Download the PDF, skipping files that are already present.
        local_path = os.path.join(output_dir, download_name)
        if os.path.exists(local_path):
            print(f"Already downloaded: {download_name}")
            continue

        print(f"Downloading {pdf_url} -> {local_path}")
        # Write to a temporary name first so an interrupted write is never
        # mistaken for a finished download on the next run.
        partial_path = local_path + ".part"
        try:
            pdf_response = requests.get(pdf_url, timeout=timeout)
            pdf_response.raise_for_status()
            with open(partial_path, "wb") as f:
                f.write(pdf_response.content)
            os.replace(partial_path, local_path)
        except (requests.RequestException, OSError) as e:
            print(f"Failed to download {pdf_url}: {e}")
        else:
            total_downloaded += 1

    print(f"Found {total_found} PDF links total. Downloaded {total_downloaded} new PDFs.")

# Run the download when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    download_lassa_pdfs()


In [None]:
import os

# Show the current contents of the PDFs folder as a quick sanity check.
pdf_files = os.listdir('PDFs')
print(pdf_files)

In [None]:
import os
import re
from pathlib import Path

def rename_lassa_files(folder_path):
    """
    Rename downloaded situation reports to a short, date-based name.

    'An_update_of_Lassa_fever_outbreak_in_Nigeria_041124_45.pdf' becomes
    'Nigeria_04_Nov_24_W45.pdf': day=04, month=11 => 'Nov', year=24, week=45.
    The name is split on underscores; chunk 8 must be the 6-character DDMMYY
    date and chunk 9 the week number followed by '.pdf'. Files that do not
    fit this layout are reported and left untouched, and a file is never
    renamed onto a name that already exists.

    Args:
        folder_path (str): Path to the folder that contains the PDF files.
    """
    # For mapping month number to short name
    month_map = {
        "01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
        "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
        "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec",
    }

    folder = Path(folder_path)
    # Snapshot the listing before renaming anything: mutating a directory
    # while it is being scanned can repeat or skip entries.
    for file_path in sorted(folder.iterdir()):
        if not file_path.is_file() or file_path.suffix.lower() != ".pdf":
            continue

        old_name = file_path.name
        # e.g. ["An","update","of","Lassa","fever","outbreak","in","Nigeria","041124","45.pdf"]
        parts = old_name.split("_")

        # Both parts[8] (date) and parts[9] (week) are read below, so at
        # least 10 chunks are required.
        if len(parts) < 10:
            print(f"Skipping file (unrecognized pattern): {old_name}")
            continue

        date_str = parts[8]       # "041124"
        week_str_pdf = parts[9]   # "45.pdf"

        if not week_str_pdf.endswith(".pdf"):
            print(f"Skipping file (no .pdf in last part): {old_name}")
            continue
        # Drop only the trailing extension.
        week_str = week_str_pdf[:-len(".pdf")]

        # date_str should be 6 characters: DDMMYY
        if len(date_str) != 6:
            print(f"Skipping file (date string not 6 chars): {old_name}")
            continue
        dd, mm, yy = date_str[0:2], date_str[2:4], date_str[4:6]

        # Unknown month numbers fall back to "???"
        month_name = month_map.get(mm, "???")

        # e.g. "Nigeria_04_Nov_24_W45.pdf"
        new_name = f"Nigeria_{dd}_{month_name}_{yy}_W{week_str}.pdf"
        new_path = folder / new_name

        # Path.rename silently replaces an existing file on POSIX; refuse to
        # clobber a report that already carries the target name.
        if new_path.exists():
            print(f"Skipping file (target already exists): {old_name} -> {new_name}")
            continue

        print(f"Renaming:\n  {old_name}\n-> {new_name}\n")
        file_path.rename(new_path)

# Example usage: rename everything in the "PDFs" folder when this cell/script
# is executed directly.
if __name__ == "__main__":
    rename_lassa_files("PDFs")


# PDFs for 2024

In [5]:
import os
import re
# Gather the renamed reports whose name carries the "_24_W" marker and order
# them by the number that follows "_W".
all_pdfs = [name for name in os.listdir("PDFs") if name.endswith(".pdf")]
pdfs_2024 = [name for name in all_pdfs if "_24_W" in name]


def _week_of(name):
    """Return the integer that follows '_W' at the end of a report file name."""
    match = re.search(r'_W(\d+)\.pdf$', name)
    return int(match.group(1))


sorted_pdfs = list(pdfs_2024)
sorted_pdfs.sort(key=_week_of)
print("Sorted PDFs for 2024:", sorted_pdfs)

Sorted PDFs for 2024: ['Nigeria_04_Jan_24_W1.pdf', 'Nigeria_11_Jan_24_W2.pdf', 'Nigeria_18_Jan_24_W3.pdf', 'Nigeria_25_Jan_24_W4.pdf', 'Nigeria_01_Feb_24_W5.pdf', 'Nigeria_08_Feb_24_W6.pdf', 'Nigeria_15_Feb_24_W7.pdf', 'Nigeria_22_Feb_24_W8.pdf', 'Nigeria_29_Feb_24_W9.pdf', 'Nigeria_07_Mar_24_W10.pdf', 'Nigeria_14_Mar_24_W11.pdf', 'Nigeria_21_Mar_24_W12.pdf', 'Nigeria_28_Mar_24_W13.pdf', 'Nigeria_04_Apr_24_W14.pdf', 'Nigeria_11_Apr_24_W15.pdf', 'Nigeria_18_Apr_24_W16.pdf', 'Nigeria_25_Apr_24_W17.pdf', 'Nigeria_02_May_24_W18.pdf', 'Nigeria_09_May_24_W19.pdf', 'Nigeria_16_May_24_W20.pdf', 'Nigeria_23_May_24_W21.pdf', 'Nigeria_30_May_24_W22.pdf', 'Nigeria_06_Jun_24_W23.pdf', 'Nigeria_13_Jun_24_W24.pdf', 'Nigeria_20_Jun_24_W25.pdf', 'Nigeria_27_Jun_24_W26.pdf', 'Nigeria_04_Jul_24_W27.pdf', 'Nigeria_11_Jul_24_W28.pdf', 'Nigeria_18_Jul_24_W29.pdf', 'Nigeria_25_Jul_24_W30.pdf', 'Nigeria_01_Aug_24_W31.pdf', 'Nigeria_08_Aug_24_W32.pdf', 'Nigeria_15_Aug_24_W33.pdf', 'Nigeria_22_Aug_24_W34.pdf', 

In [None]:
# Process page index 3 of every 2024 report at 300 dpi and write the annotated
# image to PDFs_Lines/Lines_<name>_page3.png.
# Requires `sorted_pdfs` and enhance_table_lines_from_pdf_hq (defined in a
# later cell of this notebook) to exist already.
for pdf in sorted_pdfs:
    input_pdf = os.path.join("PDFs", pdf)
    output_img = os.path.join("PDFs_Lines", f"Lines_{pdf.replace('.pdf','')}_page3.png")
    enhance_table_lines_from_pdf_hq(input_pdf, output_img, tr1=700, linelength1=1700,linegap1=100, toler1 = 10 , page_number=3, dpi=300) #length 40 was decent

# Main script

In [None]:
import cv2
import numpy as np
from PIL import Image, ImageColor
import fitz  # PyMuPDF

def hex_to_hsv(hex_color):
    """
    Convert a color string to a single OpenCV 8-bit HSV pixel.

    Args:
        hex_color (str): Color specification understood by
            PIL.ImageColor.getcolor, e.g. "#D8EDCF".

    Returns:
        numpy.ndarray: uint8 array [H, S, V] as produced by
        cv2.COLOR_BGR2HSV for 8-bit input (OpenCV stores hue as degrees / 2,
        i.e. 0-179).
    """
    r, g, b = ImageColor.getcolor(hex_color, "RGB")
    # Build the 1x1 BGR pixel straight from the integer channels. Scaling to
    # 0-1 floats and back before the uint8 cast can truncate a channel by one
    # (e.g. 215.999... -> 215).
    bgr_pixel = np.uint8([[[b, g, r]]])
    return cv2.cvtColor(bgr_pixel, cv2.COLOR_BGR2HSV)[0][0]

def enhance_table_lines_from_pdf_hq(pdf_path, output_path, tr1, linelength1, linegap1, toler1, page_number=0, dpi=300):
    """
    Enhances vertical column separators and draws horizontal lines at
    top boundary, bottom boundary, and header bottom.

    The page is rendered to an image, the rows containing the table's green
    shading are located by hue, near-vertical Hough segments inside that band
    are redrawn in grey, and the annotated page is saved. Nothing is saved
    when no green row is found.

    Args:
        pdf_path (str): Path to the PDF.
        output_path (str): Path to save the image (use .png).
        tr1 (int): HoughLinesP accumulator threshold.
        linelength1 (int): HoughLinesP minLineLength.
        linegap1 (int): HoughLinesP maxLineGap.
        toler1 (int): Tolerance around the target hue for detecting green rows.
        page_number (int): Page to process (0-indexed).
        dpi (int): Resolution for rendering the PDF page.
    """
    # 1. Render the PDF page at high DPI, then release the document right
    #    away; only the rendered pixels are needed afterwards.
    doc = fitz.open(pdf_path)
    try:
        pix = doc[page_number].get_pixmap(dpi=dpi)
        img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        doc.close()
    # Convert PIL Image to OpenCV BGR
    img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    # 2. Convert to HSV to detect green rows
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Work with a plain int: arithmetic on the uint8 hue can wrap around
    # instead of going below 0 / above 255, defeating the max()/min() clamps.
    target_hue = int(hex_to_hsv("#D8EDCF")[0])
    lower_green = np.array([max(0, target_hue - toler1), 50, 50])
    upper_green = np.array([min(179, target_hue + toler1), 255, 255])
    green_mask = cv2.inRange(hsv, lower_green, upper_green)
    # Per-row sum of the mask; a row counts as green when the sum exceeds 50.
    h_proj_green = np.sum(green_mask, axis=1)
    green_row_indices = np.where(h_proj_green > 50)[0]

    if len(green_row_indices) == 0:
        print("No green rows detected.")
        return

    top_boundary = int(green_row_indices[0])
    bottom_boundary = int(green_row_indices[-1])

    # 3. Header Row Detection (just above top_boundary)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    header_bottom = 0
    # Skip when the table starts on the very first row: there is no region
    # above it to threshold.
    if top_boundary > 0:
        # Otsu's threshold instead of a fixed cut-off
        _, binary_header = cv2.threshold(
            gray[:top_boundary, :], 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
        )
        h_proj_header = np.sum(binary_header, axis=1)
        # Lowest row above the table whose projection exceeds 40.
        inked_rows = np.flatnonzero(h_proj_header > 40)
        if inked_rows.size:
            header_bottom = int(inked_rows[-1])

    # 4. Adaptive Thresholding in the table region
    table_region = gray[top_boundary:bottom_boundary, :]
    thresh_table = cv2.adaptiveThreshold(
        table_region, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 11, 3
    )

    # 5. Hough Lines to find vertical lines
    # Increase or decrease tr1 if you get too few/many lines
    lines = cv2.HoughLinesP(thresh_table, 1, np.pi / 180, threshold=tr1,
                            minLineLength=linelength1, maxLineGap=linegap1)
    vertical_x_pairs = []
    if lines is not None:
        for line in lines:
            x1, _y1, x2, _y2 = line[0]
            # Keep near-vertical segments only; just their x positions are
            # used when drawing.
            if abs(x2 - x1) < 2:
                vertical_x_pairs.append((int(x1), int(x2)))

    # 6. Draw lines on the OpenCV image
    # A) Vertical separators.
    # NOTE(review): the vertical extent is hard-coded (y=350 down to
    # bottom_boundary-185) rather than derived from header_bottom; presumably
    # tuned for these pages at 300 dpi — confirm before changing dpi.
    for x1, x2 in vertical_x_pairs:
        cv2.line(img, (x1, 350), (x2, bottom_boundary-185), (100, 100, 100), 1)

    print(header_bottom)
    # B) Horizontal lines across the full width of the image at
    #    top_boundary, bottom_boundary (both green) and header_bottom (red).
    height, width = img.shape[:2]
    cv2.line(img, (0, top_boundary), (width, top_boundary), (0, 255, 0), 2)
    cv2.line(img, (0, bottom_boundary), (width, bottom_boundary), (0, 255, 0), 2)
    cv2.line(img, (0, header_bottom), (width, header_bottom), (0, 0, 255), 2)

    # 7. Convert back to PIL and save
    output_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    output_pil.save(output_path)

    print(f"Saved enhanced table to: {output_path}")


In [46]:
# Spot-check a handful of weeks: annotate page index 3 of PDFs/W<week>.pdf and
# save it as Lines_W<week>_boundaries.png.
# NOTE(review): this call passes only pdf_path, output_path, page_number and
# dpi, so it works only with a definition of enhance_table_lines_from_pdf_hq
# that does not require tr1/linelength1/linegap1/toler1 — confirm which
# definition was live when this cell was run.
for week in [1,2,3,4,12,22,26,32,42,52]:
    enhance_table_lines_from_pdf_hq(f"PDFs/W{week}.pdf", f"Lines_W{week}_boundaries.png", page_number=3, dpi=300)

515
Saved enhanced table to: Lines_W1_boundaries.png
405
Saved enhanced table to: Lines_W2_boundaries.png
407
Saved enhanced table to: Lines_W3_boundaries.png
410
Saved enhanced table to: Lines_W4_boundaries.png
402
Saved enhanced table to: Lines_W12_boundaries.png
407
Saved enhanced table to: Lines_W22_boundaries.png
408
Saved enhanced table to: Lines_W26_boundaries.png
427
Saved enhanced table to: Lines_W32_boundaries.png
430
Saved enhanced table to: Lines_W42_boundaries.png
399
Saved enhanced table to: Lines_W52_boundaries.png


# Hiding text

This code works well now!
It crops each image just below the bottom of the table, avoiding the legend, and places the vertical lines at the correct positions. For 2 of the tables the green rows were not detected properly, but that can be addressed later.

In [399]:
import cv2
import numpy as np
from PIL import Image, ImageColor
import fitz  # PyMuPDF

def hex_to_hsv(hex_color):
    """
    Convert a color string to a single OpenCV 8-bit HSV pixel.

    Args:
        hex_color (str): Color specification understood by
            PIL.ImageColor.getcolor, e.g. "#D8EDCF".

    Returns:
        numpy.ndarray: uint8 array [H, S, V] as produced by
        cv2.COLOR_BGR2HSV for 8-bit input (OpenCV stores hue as degrees / 2,
        i.e. 0-179).
    """
    r, g, b = ImageColor.getcolor(hex_color, "RGB")
    # Build the 1x1 BGR pixel straight from the integer channels. Scaling to
    # 0-1 floats and back before the uint8 cast can truncate a channel by one
    # (e.g. 215.999... -> 215).
    bgr_pixel = np.uint8([[[b, g, r]]])
    return cv2.cvtColor(bgr_pixel, cv2.COLOR_BGR2HSV)[0][0]

def enhance_table_lines_from_pdf_hq(
    pdf_path,
    output_path,
    tr1,
    linelength1,
    linegap1,
    toler1,
    page_number=0,
    dpi=600
):
    """
    Redraw the table's vertical column separators on a rendered PDF page and
    save the page cropped just below the table.

    The table band is located from the rows matching the target hue, padded by
    20 px on each side. If no such row is found, a fixed band (rows 800-4500)
    is used instead and the PDF path is printed. Near-vertical Hough segments
    found inside the band are drawn in grey over the band's full height, and
    the image is cropped 40 px below the band before saving.

    Args:
        pdf_path (str): Path to the PDF.
        output_path (str): Path to save the image (e.g. .png).
        tr1 (int): HoughLinesP threshold.
        linelength1 (int): HoughLinesP minLineLength.
        linegap1 (int): HoughLinesP maxLineGap.
        toler1 (int): Tolerance around the HSV hue for detecting green rows.
        page_number (int): Which PDF page to process (0-indexed).
        dpi (int): Rendering DPI for the PDF page.
    """
    # 1. Render the PDF page at high DPI, then release the document right
    #    away; only the rendered pixels are needed afterwards.
    doc = fitz.open(pdf_path)
    try:
        pix = doc[page_number].get_pixmap(dpi=dpi)
        img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        doc.close()
    # Convert PIL Image to OpenCV BGR
    img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    height1, width1 = img.shape[:2]
    total_pixels = height1 * width1
    print("Total pixels =", total_pixels)

    # 2. Convert to HSV & detect green rows
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # NOTE(review): this hard-coded triple replaced hex_to_hsv("#D8EDCF"). It
    # looks like that color expressed in degrees/percent (H=102, S=12.66%,
    # V=92.94%), whereas OpenCV's 8-bit hue is degrees / 2 — kept as tuned;
    # confirm whether a hue of 51 was intended.
    target_hsv = np.array([102, 12.66, 92.94], dtype=np.uint8)
    # Work with a plain int: arithmetic on the uint8 hue can wrap around
    # instead of going below 0 / above 255, defeating the max()/min() clamps.
    target_hue = int(target_hsv[0])
    lower_green = np.array([max(0, target_hue - toler1), 50, 50])
    upper_green = np.array([min(179, target_hue + toler1), 255, 255])
    green_mask = cv2.inRange(hsv, lower_green, upper_green)
    h_proj_green = np.sum(green_mask, axis=1)
    green_row_indices = np.where(h_proj_green > 1000)[0]

    if len(green_row_indices) == 0:
        # Best effort: report the file and fall back to a fixed band.
        # NOTE(review): 800/4500 are presumably tuned for the 600 dpi render
        # — confirm before using another dpi.
        print("No green rows detected.")
        print(pdf_path)
        top_boundary = 800
        bottom_boundary = 4500
    else:
        top_boundary = int(green_row_indices[0]) - 20
        bottom_boundary = int(green_row_indices[-1]) + 20

    # A negative start would make the slice below count from the end of the
    # image instead of from the top.
    top_boundary = max(0, top_boundary)

    # 3. Adaptive thresholding in the table region
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    table_region = gray[top_boundary:bottom_boundary, :]
    thresh_table = cv2.adaptiveThreshold(
        table_region,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        3
    )

    # 4. Hough lines on the raw thresholded band. (A vertical morphological
    #    opening with a 3x40 rectangle was tried here earlier and is disabled.)
    lines = cv2.HoughLinesP(
        thresh_table,
        1,
        np.pi / 180,
        threshold=tr1,
        minLineLength=linelength1,
        maxLineGap=linegap1
    )
    vertical_x_pairs = []
    if lines is not None:
        for line in lines:
            x1, _y1, x2, _y2 = line[0]
            # Keep near-vertical segments only; just their x positions are
            # used when drawing.
            if abs(x2 - x1) < 5:
                vertical_x_pairs.append((int(x1), int(x2)))

    # 5. Draw each separator over the whole band, extended 20 px both ways
    for x1, x2 in vertical_x_pairs:
        cv2.line(img, (x1, top_boundary-20), (x2, bottom_boundary+20), (100, 100, 100), 2)

    # 6. Crop the image so that it ends at bottom_boundary + 40
    crop_bottom = min(bottom_boundary + 40, img.shape[0])
    img_cropped = img[:crop_bottom, :]

    # 7. Convert back to PIL and save
    output_pil = Image.fromarray(cv2.cvtColor(img_cropped, cv2.COLOR_BGR2RGB))
    output_pil.save(output_path)

    #print(f"Saved enhanced table to: {output_path}")


In [400]:
# Try the 600 dpi settings on a single report (page index 3) before running
# the whole batch; the result goes to PDFs_Lines/Lines_<name>_page3.png.
pdf = "Nigeria_14_Oct_24_W42.pdf"
input_pdf = os.path.join("PDFs", pdf)
output_img = os.path.join("PDFs_Lines", f"Lines_{pdf.replace('.pdf','')}_page3.png")
enhance_table_lines_from_pdf_hq(input_pdf, output_img, tr1=1000, linelength1=70,linegap1=70, toler1 = 10, page_number=3, dpi=600) #length 40

Total pixels = 34799360


In [403]:
# Process page index 3 of every 2024 report at 600 dpi and write the annotated
# image to PDFs_Lines/Lines_<name>_page3.png.
for pdf in sorted_pdfs:
    input_pdf = os.path.join("PDFs", pdf)
    output_img = os.path.join("PDFs_Lines", f"Lines_{pdf.replace('.pdf','')}_page3.png")
    enhance_table_lines_from_pdf_hq(input_pdf, output_img, tr1=1400, linelength1=79,linegap1=50, toler1 = 10, page_number=3, dpi=600) #length 40 was decent

    # Tuning notes: at 300 dpi, tr1=850, linelength1=1700, linegap1=100 and
    # toler1=10 worked well.
    # A better combination still needs to be found for 600 dpi.

Total pixels = 34806376
Total pixels = 34757631
Total pixels = 34757631
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34799360
Total pixels = 34799360
Total pixels = 34806376
Total pixels = 34757631
Total pixels = 34799360
Total pixels = 34799360
Total pixels = 34799360
Total pixels = 34799360
Total pixels = 34799360
Total pixels = 34799360
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34757631
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34757631
Total pixels = 34757631
Total pixels = 34757631
Total pixels = 34757631
Total pixels = 34757631
Total pixels = 34757631
Total pixels = 34806376
No green rows detected.
PDFs/Nigeria_01_Aug_24_W31.pdf
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pixels = 34806376
Total pix

# 15% left

In [19]:
import cv2
import numpy as np
from PIL import Image, ImageColor
import fitz  # PyMuPDF

def hex_to_hsv(hex_color):
    """
    Convert a color string to a single OpenCV 8-bit HSV pixel.

    Args:
        hex_color (str): Color specification understood by
            PIL.ImageColor.getcolor, e.g. "#D8EDCF".

    Returns:
        numpy.ndarray: uint8 array [H, S, V] as produced by
        cv2.COLOR_BGR2HSV for 8-bit input (OpenCV stores hue as degrees / 2,
        i.e. 0-179).
    """
    r, g, b = ImageColor.getcolor(hex_color, "RGB")
    # Build the 1x1 BGR pixel straight from the integer channels. Scaling to
    # 0-1 floats and back before the uint8 cast can truncate a channel by one
    # (e.g. 215.999... -> 215).
    bgr_pixel = np.uint8([[[b, g, r]]])
    return cv2.cvtColor(bgr_pixel, cv2.COLOR_BGR2HSV)[0][0]

def enhance_table_lines_from_pdf_hq(pdf_path, output_path, page_number=0, dpi=300):
    """
    Enhances vertical column separators and draws horizontal lines at
    top boundary, bottom boundary, and header bottom.

    Green rows are searched for only in the left 15% of the page width. The
    first and last matching rows bound the table; near-vertical Hough
    segments inside that band are redrawn in grey from the detected header
    bottom down to the bottom boundary. Nothing is saved when no green row is
    found.

    Args:
        pdf_path (str): Path to the PDF.
        output_path (str): Path to save the image (use .png).
        page_number (int): Page to process (0-indexed).
        dpi (int): Resolution for rendering the PDF page.
    """
    # 1. Render the PDF page at high DPI, then release the document right
    #    away; only the rendered pixels are needed afterwards.
    doc = fitz.open(pdf_path)
    try:
        pix = doc[page_number].get_pixmap(dpi=dpi)
        img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        doc.close()
    # Convert PIL Image to OpenCV BGR
    img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    # 2. Convert to HSV & detect green rows
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    height, width = img.shape[:2]

    # We'll only check the left 15% of the page's width
    left_15_width = int(width * 0.15)

    # Work with a plain int: arithmetic on the uint8 hue can wrap around
    # instead of going below 0 / above 255, defeating the max()/min() clamps.
    target_hue = int(hex_to_hsv("#D8EDCF")[0])
    tolerance = 15
    lower_green = np.array([max(0, target_hue - tolerance), 50, 50])
    upper_green = np.array([min(179, target_hue + tolerance), 255, 255])
    green_mask = cv2.inRange(hsv, lower_green, upper_green)

    # Sum only the left 15% columns; any matching pixel marks the row as green
    h_proj_green = np.sum(green_mask[:, :left_15_width], axis=1)
    green_row_indices = np.where(h_proj_green > 0)[0]

    if len(green_row_indices) == 0:
        print("No green rows detected.")
        return

    top_boundary = int(green_row_indices[0])
    bottom_boundary = int(green_row_indices[-1])

    # 3. Header Row Detection (just above top_boundary)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    header_bottom = 0
    # Skip when the table starts on the very first row: there is no region
    # above it to threshold.
    if top_boundary > 0:
        # Otsu's threshold for the header
        _, binary_header = cv2.threshold(
            gray[:top_boundary, :], 0, 255,
            cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
        )
        h_proj_header = np.sum(binary_header, axis=1)
        # Lowest row above the table whose projection exceeds 40.
        inked_rows = np.flatnonzero(h_proj_header > 40)
        if inked_rows.size:
            header_bottom = int(inked_rows[-1])

    # 4. Adaptive Thresholding in the table region
    table_region = gray[top_boundary:bottom_boundary, :]
    thresh_table = cv2.adaptiveThreshold(
        table_region,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        3
    )

    # 5. Hough Lines to find vertical lines
    lines = cv2.HoughLinesP(thresh_table, 1, np.pi / 180,
                            threshold=775, minLineLength=10, maxLineGap=8)
    vertical_x_pairs = []
    if lines is not None:
        for line in lines:
            x1, _y1, x2, _y2 = line[0]
            # Keep near-vertical segments only; just their x positions are
            # used when drawing.
            if abs(x2 - x1) < 5:
                vertical_x_pairs.append((int(x1), int(x2)))

    # 6. Draw lines on the OpenCV image
    # A) Draw the vertical lines from header_bottom to bottom_boundary
    for x1, x2 in vertical_x_pairs:
        cv2.line(img, (x1, header_bottom), (x2, bottom_boundary), (100, 100, 100), 1)

    # B) Draw horizontal lines at top, bottom, and header_bottom
    cv2.line(img, (0, top_boundary), (width, top_boundary), (0, 255, 0), 2)     # green
    cv2.line(img, (0, bottom_boundary), (width, bottom_boundary), (0, 255, 0), 2) # green
    cv2.line(img, (0, header_bottom), (width, header_bottom), (0, 0, 255), 2)  # red

    # 7. Convert back to PIL and save
    output_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    output_pil.save(output_path)

    print(f"Saved enhanced table to: {output_path}")


In [20]:
# Spot-check a handful of weeks: annotate page index 3 of PDFs/W<week>.pdf at
# 300 dpi and save it as Lines_W<week>_boundaries.png.
for week in [1,2,3,4,12,22,26,32,42,52]:
    enhance_table_lines_from_pdf_hq(f"PDFs/W{week}.pdf", f"Lines_W{week}_boundaries.png", page_number=3, dpi=300)

Saved enhanced table to: Lines_W1_boundaries.png
Saved enhanced table to: Lines_W2_boundaries.png
Saved enhanced table to: Lines_W3_boundaries.png
Saved enhanced table to: Lines_W4_boundaries.png
Saved enhanced table to: Lines_W12_boundaries.png
Saved enhanced table to: Lines_W22_boundaries.png
Saved enhanced table to: Lines_W26_boundaries.png
Saved enhanced table to: Lines_W32_boundaries.png
Saved enhanced table to: Lines_W42_boundaries.png
Saved enhanced table to: Lines_W52_boundaries.png


In [None]:
import os
import re
from pathlib import Path

def rename_lassa_files(folder_path):
    """
    Rename downloaded situation reports to a short, date-based name.

    'An_update_of_Lassa_fever_outbreak_in_Nigeria_041124_45.pdf' becomes
    'Nigeria_04_Nov_24_W45.pdf': day=04, month=11 => 'Nov', year=24, week=45.
    The name is split on underscores; chunk 8 must be the 6-character DDMMYY
    date and chunk 9 the week number followed by '.pdf'. Files that do not
    fit this layout are reported and left untouched, and a file is never
    renamed onto a name that already exists.

    Args:
        folder_path (str): Path to the folder that contains the PDF files.
    """
    # For mapping month number to short name
    month_map = {
        "01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
        "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
        "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec",
    }

    folder = Path(folder_path)
    # Snapshot the listing before renaming anything: mutating a directory
    # while it is being scanned can repeat or skip entries.
    for file_path in sorted(folder.iterdir()):
        if not file_path.is_file() or file_path.suffix.lower() != ".pdf":
            continue

        old_name = file_path.name
        # e.g. ["An","update","of","Lassa","fever","outbreak","in","Nigeria","041124","45.pdf"]
        parts = old_name.split("_")

        # Both parts[8] (date) and parts[9] (week) are read below, so at
        # least 10 chunks are required.
        if len(parts) < 10:
            print(f"Skipping file (unrecognized pattern): {old_name}")
            continue

        date_str = parts[8]       # "041124"
        week_str_pdf = parts[9]   # "45.pdf"

        if not week_str_pdf.endswith(".pdf"):
            print(f"Skipping file (no .pdf in last part): {old_name}")
            continue
        # Drop only the trailing extension.
        week_str = week_str_pdf[:-len(".pdf")]

        # date_str should be 6 characters: DDMMYY
        if len(date_str) != 6:
            print(f"Skipping file (date string not 6 chars): {old_name}")
            continue
        dd, mm, yy = date_str[0:2], date_str[2:4], date_str[4:6]

        # Unknown month numbers fall back to "???"
        month_name = month_map.get(mm, "???")

        # e.g. "Nigeria_04_Nov_24_W45.pdf"
        new_name = f"Nigeria_{dd}_{month_name}_{yy}_W{week_str}.pdf"
        new_path = folder / new_name

        # Path.rename silently replaces an existing file on POSIX; refuse to
        # clobber a report that already carries the target name.
        if new_path.exists():
            print(f"Skipping file (target already exists): {old_name} -> {new_name}")
            continue

        print(f"Renaming:\n  {old_name}\n-> {new_name}\n")
        file_path.rename(new_path)

# Example usage: rename everything in the "PDFs" folder when this cell/script
# is executed directly.
if __name__ == "__main__":
    rename_lassa_files("PDFs")
