# PDF Sourcing

In [1]:
pip install google-cloud-vision pandas

Collecting google-cloud-vision
  Downloading google_cloud_vision-3.10.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-cloud-vision)
  Downloading google_api_core-2.24.1-py3-none-any.whl.metadata (3.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-cloud-vision)
  Downloading proto_plus-1.26.0-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-cloud-vision)
  Downloading googleapis_common_protos-1.69.0-py2.py3-non

In [None]:
!pip install requests beautifulsoup4 pytesseract

In [None]:
!pip install google-generativeai python-dotenv Pillow pydantic


In [None]:
import os
import requests
from bs4 import BeautifulSoup

def download_lassa_pdfs():
    # The base URL that hosts the PDFs
    base_url = "https://ncdc.gov.ng"
    # The specific page that lists all the Lassa fever situation reports
    list_page_url = (
        "https://ncdc.gov.ng/diseases/sitreps/?cat=5&name=An%20update%20of%20Lassa%20fever%20outbreak%20in%20Nigeria"
    )
    
    # Create a local folder to store the PDFs
    os.makedirs("pdfs", exist_ok=True)
    
    # 1. Fetch the HTML
    print(f"Fetching list page: {list_page_url}")
    response = requests.get(list_page_url)
    response.raise_for_status()  # raise an error if the HTTP request failed
    
    # 2. Parse the HTML
    soup = BeautifulSoup(response.text, "html.parser")
    
    # The table is inside <tbody>. Each row has multiple <td>, 
    # and the third <td> has the <a> with the PDF link
    table_body = soup.find("tbody")
    if not table_body:
        print("Could not find <tbody> on the page.")
        return
    
    rows = table_body.find_all("tr")
    if not rows:
        print("No <tr> found inside <tbody>.")
        return

    # For stats
    total_found = 0
    total_downloaded = 0
    
    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 3:
            # We expect 3 <td> in each row: (1) index, (2) description, (3) the PDF link
            continue
        
        # The PDF link is in the third cell; let's get the <a>:
        link_tag = cells[2].find("a", href=True)
        if not link_tag:
            continue
        
        # The PDF URL is relative, e.g. "/themes/common/files/sitreps/..."
        # We need to prepend https://ncdc.gov.ng
        pdf_url = link_tag["href"]
        if pdf_url.startswith("/"):
            pdf_url = base_url + pdf_url
        
        # The "download" attribute often has the suggested filename
        # or we can parse from the final part of the URL
        download_name = link_tag.get("download")  # e.g. "An update of Lassa fever ... .pdf"
        
        if not download_name:
            # Fallback: parse the filename from the URL
            download_name = pdf_url.split("/")[-1]

        # Clean up the download name if needed
        download_name = download_name.replace(" ", "_")

        total_found += 1
        # 3. Download the PDF
        # We'll skip if it already exists. Or you can overwrite by removing the check.
        local_path = os.path.join("pdfs", download_name)
        if os.path.exists(local_path):
            print(f"Already downloaded: {download_name}")
            continue
        
        print(f"Downloading {pdf_url} -> {local_path}")
        try:
            pdf_response = requests.get(pdf_url)
            pdf_response.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(pdf_response.content)
            total_downloaded += 1
        except Exception as e:
            print(f"Failed to download {pdf_url}: {e}")

    print(f"Found {total_found} PDF links total. Downloaded {total_downloaded} new PDFs.")

if __name__ == "__main__":
    download_lassa_pdfs()


## Downloading all PDFs from all diseases

In [67]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the directory listing (adjust this to your actual URL)
base_url = "https://ncdc.gov.ng/themes/common/files/sitreps/"

# Folder to save the downloaded PDFs
download_folder = "ALL_SitReps"
os.makedirs(download_folder, exist_ok=True)

# Fetch the HTML content from the base URL
try:
    response = requests.get(base_url)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"Error fetching {base_url}: {e}")
    exit(1)

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find all links ending with '.pdf'
pdf_links = [
    link.get("href")
    for link in soup.find_all("a")
    if link.get("href") and link.get("href").lower().endswith(".pdf")
]

print(f"Found {len(pdf_links)} PDF files.")

# Download each PDF file
for pdf_link in pdf_links:
    # Construct the absolute URL in case the href is a relative link.
    pdf_url = urljoin(base_url, pdf_link)
    file_name = os.path.basename(pdf_link)
    file_path = os.path.join(download_folder, file_name)
    print(f"Downloading: {pdf_url}")
    try:
        pdf_response = requests.get(pdf_url)
        pdf_response.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(pdf_response.content)
        print(f"Saved: {file_path}")
    except requests.RequestException as e:
        print(f"Error downloading {pdf_url}: {e}")


Found 1246 PDF files.
Downloading: https://ncdc.gov.ng/themes/common/files/sitreps/0a0ff32d3952b825be885e945355f25f.pdf
Saved: ALL_SitReps/0a0ff32d3952b825be885e945355f25f.pdf
Downloading: https://ncdc.gov.ng/themes/common/files/sitreps/0a4c89ff97671ba0f363cd1edbec077b.pdf
Saved: ALL_SitReps/0a4c89ff97671ba0f363cd1edbec077b.pdf
Downloading: https://ncdc.gov.ng/themes/common/files/sitreps/0a5b79cb766a64f93a8f2e2738b29656.pdf
Saved: ALL_SitReps/0a5b79cb766a64f93a8f2e2738b29656.pdf
Downloading: https://ncdc.gov.ng/themes/common/files/sitreps/0a22db7d2c626de19d5becffc24d0293.pdf
Saved: ALL_SitReps/0a22db7d2c626de19d5becffc24d0293.pdf
Downloading: https://ncdc.gov.ng/themes/common/files/sitreps/0a61bf47e3498906525df7c52d34c371.pdf
Saved: ALL_SitReps/0a61bf47e3498906525df7c52d34c371.pdf
Downloading: https://ncdc.gov.ng/themes/common/files/sitreps/0aa015568b0325ccac29573ecf89b8fd.pdf
Saved: ALL_SitReps/0aa015568b0325ccac29573ecf89b8fd.pdf
Downloading: https://ncdc.gov.ng/themes/common/files/s

In [None]:
import os

pdf_files = os.listdir('PDFs')
print(pdf_files)

In [117]:
import os
import re
from pathlib import Path

def rename_lassa_files(folder_path):
    """
    Renames 'An_update_of_Lassa_fever_outbreak_in_Nigeria_041124_45.pdf'
    to 'Nigeria_04_Nov_24_W45.pdf', extracting day=04, month=11 => 'Nov', year=24,
    and the week number 45.
    
    Args:
        folder_path (str): Path to the folder that contains the PDF files.
    """
    # For mapping month number to short name
    month_map = {
        "01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
        "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
        "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec",
    }

    folder = Path(folder_path)
    for file_path in folder.iterdir():
        if not file_path.is_file():
            continue
        if not file_path.suffix.lower() == ".pdf":
            continue
        
        old_name = file_path.name
        # Replace spaces with underscores
        old_name = old_name.replace(" ", "_")
        # Example old_name: "An_update_of_Lassa_fever_outbreak_in_Nigeria_041124_45.pdf"
        
        # 1) Split on underscores
        parts = old_name.split("_")
        # e.g. ["An","update","of","Lassa","fever","outbreak","in","Nigeria","041124","45.pdf"]
        
        if len(parts) < 9:
            # If the file name doesn't match the expected pattern, skip it
            print(f"Skipping file (unrecognized pattern): {old_name}")
            continue
        
        # 2) The date chunk is parts[8] like "041124"
        date_str = parts[8]  # "041124"
        
        # 3) The week chunk is in parts[9], but includes ".pdf" at the end, e.g. "45.pdf"
        week_str_pdf = parts[9]  # "45.pdf"
        # Remove ".pdf" from the end
        if week_str_pdf.endswith(".pdf"):
            week_str = week_str_pdf.replace(".pdf", "")
        else:
            print(f"Skipping file (no .pdf in last part): {old_name}")
            continue
        
        # 4) date_str should be 6 characters: DDMMYY
        if len(date_str) != 6:
            print(f"Skipping file (date string not 6 chars): {old_name}")
            continue
        dd = date_str[0:2]   # "04"
        mm = date_str[2:4]   # "11"
        yy = date_str[4:6]   # "24"
        
        # 5) Convert mm => month name
        month_name = month_map.get(mm, "???" )  # fallback "???"
        
        # 6) Build new name
        # e.g. "Nigeria_04_Nov_24_W45.pdf"
        new_name = f"Nigeria_{dd}_{month_name}_{yy}_W{week_str}.pdf"
        
        new_path = folder / new_name
        # 7) Rename the file
        print(f"Renaming:\n  {old_name}\n-> {new_name}\n")
        file_path.rename(new_path)

# Example usage:
if __name__ == "__main__":
    rename_lassa_files("downloaded")


Renaming:
  An_update_of_Lassa_fever_outbreak_in_Nigeria_210923_39.pdf
-> Nigeria_21_Sep_23_W39.pdf

Renaming:
  An_update_of_Lassa_fever_outbreak_in_Nigeria_020123_1.pdf
-> Nigeria_02_Jan_23_W1.pdf

Renaming:
  An_update_of_Lassa_fever_outbreak_in_Nigeria_050123_2.pdf
-> Nigeria_05_Jan_23_W2.pdf

Renaming:
  An_update_of_Lassa_fever_outbreak_in_Nigeria_140923_38.pdf
-> Nigeria_14_Sep_23_W38.pdf



## PDFs for 2024

In [None]:
import os
import re
all_pdfs = [f for f in os.listdir("PDFs") if f.endswith(".pdf")]
pdfs_2024 = [f for f in all_pdfs if "_24_W" in f]
#print("PDFs for 2024:", pdfs_2024)

sorted_pdfs = sorted(pdfs_2024, key=lambda x: int(re.search(r'_W(\d+)\.pdf$', x).group(1)))

print("Sorted PDFs for 2024:", sorted_pdfs)
print("Total PDFs for 2024:", len(sorted_pdfs))

## PDFs for 2023

In [5]:
import os
import re
all_pdfs = [f for f in os.listdir("PDFs") if f.endswith(".pdf")]
pdfs_2023 = [f for f in all_pdfs if "_23_W" in f]

sorted_pdfs = sorted(pdfs_2023, key=lambda x: int(re.search(r'_W(\d+)\.pdf$', x).group(1)))

print("Sorted PDFs for 2023:", sorted_pdfs)
print("Number of PDFs for 2023:", len(sorted_pdfs))
# Calculate missing week numbers
all_weeks = set(range(1, 53))  # Weeks 1-52
existing_week_numbers = set(int(re.search(r'_W(\d+)\.pdf$', pdf).group(1)) for pdf in pdfs_2023)
missing_weeks = sorted(all_weeks - existing_week_numbers)

print("\nMissing weeks for 2023:", missing_weeks)
print("Number of missing weeks:", len(missing_weeks))


Sorted PDFs for 2023: ['Nigeria_02_Jan_23_W1.pdf', 'Nigeria_05_Jan_23_W2.pdf', 'Nigeria_12_Jan_23_W3.pdf', 'Nigeria_19_Jan_23_W4.pdf', 'Nigeria_26_Jan_23_W5.pdf', 'Nigeria_02_Feb_23_W6.pdf', 'Nigeria_09_Feb_23_W7.pdf', 'Nigeria_16_Feb_23_W8.pdf', 'Nigeria_23_Feb_23_W9.pdf', 'Nigeria_02_Mar_23_W10.pdf', 'Nigeria_09_Mar_23_W11.pdf', 'Nigeria_16_Mar_23_W12.pdf', 'Nigeria_23_Mar_23_W13.pdf', 'Nigeria_30_Mar_23_W14.pdf', 'Nigeria_06_Apr_23_W15.pdf', 'Nigeria_13_Apr_23_W16.pdf', 'Nigeria_17_Apr_23_W17.pdf', 'Nigeria_24_Apr_23_W18.pdf', 'Nigeria_04_May_23_W19.pdf', 'Nigeria_11_May_23_W20.pdf', 'Nigeria_18_May_23_W21.pdf', 'Nigeria_25_May_23_W22.pdf', 'Nigeria_01_Jun_23_W23.pdf', 'Nigeria_08_Jun_23_W24.pdf', 'Nigeria_15_Jun_23_W25.pdf', 'Nigeria_22_Jun_23_W26.pdf', 'Nigeria_29_Jun_23_W27.pdf', 'Nigeria_06_Jul_23_W28.pdf', 'Nigeria_13_Jul_23_W29.pdf', 'Nigeria_20_Jul_23_W30.pdf', 'Nigeria_27_Jul_23_W31.pdf', 'Nigeria_03_Aug_23_W32.pdf', 'Nigeria_10_Aug_23_W33.pdf', 'Nigeria_17_Aug_23_W34.pdf', 

## PDFs for 2022

In [21]:
import os
import re
from pathlib import Path

# Option 1: Use absolute path
BASE_DIR = Path('/Users/arturtrebski/Documents/Lassa_Reports_Scraping')

# Option 2: Use relative path from notebook location
# BASE_DIR = Path.cwd().parent  # if notebook is in a notebooks subdirectory

#FOLDER = BASE_DIR / 'data' / 'raw' / 'year' / '2021'
FOLDER = BASE_DIR / 'data' / 'raw' / 'yearly_pdfs' / 'PDFs_2021'

# Ensure directory exists
FOLDER.mkdir(parents=True, exist_ok=True)

all_pdfs = [f for f in os.listdir(FOLDER) if f.endswith(".pdf")]
pdfs = [f for f in all_pdfs if "_21_W" in f]

sorted_pdfs = sorted(pdfs, key=lambda x: int(re.search(r'_W(\d+)\.pdf$', x).group(1)))

print("Sorted PDFs for year:", sorted_pdfs)
print("Number of PDFs for year:", len(sorted_pdfs))

# Calculate missing week numbers
all_weeks = set(range(1, 53))  # Weeks 1-52
existing_week_numbers = set(int(re.search(r'_W(\d+)\.pdf$', pdf).group(1)) for pdf in pdfs)
missing_weeks = sorted(all_weeks - existing_week_numbers)

print("\nMissing weeks for 2024:", missing_weeks)
print("Number of missing weeks:", len(missing_weeks))

Sorted PDFs for year: ['Nigeria_01_Jan_21_W1.pdf', 'Nigeria_08_Jan_21_W2.pdf', 'Nigeria_15_Jan_21_W3.pdf', 'Nigeria_22_Jan_21_W4.pdf', 'Nigeria_29_Jan_21_W5.pdf', 'Nigeria_05_Feb_21_W6.pdf', 'Nigeria_12_Feb_21_W7.pdf', 'Nigeria_19_Feb_21_W8.pdf', 'Nigeria_26_Feb_21_W9.pdf', 'Nigeria_05_Mar_21_W10.pdf', 'Nigeria_12_Mar_21_W11.pdf', 'Nigeria_19_Mar_21_W12.pdf', 'Nigeria_26_Mar_21_W13.pdf', 'Nigeria_02_Apr_21_W14.pdf', 'Nigeria_09_Apr_21_W15.pdf', 'Nigeria_16_Apr_21_W16.pdf', 'Nigeria_23_Apr_21_W17.pdf', 'Nigeria_30_Apr_21_W18.pdf', 'Nigeria_07_May_21_W19.pdf', 'Nigeria_14_May_21_W20.pdf', 'Nigeria_28_May_21_W22.pdf', 'Nigeria_18_Jun_21_W25.pdf', 'Nigeria_25_Jun_21_W26.pdf', 'Nigeria_02_Jul_21_W27.pdf', 'Nigeria_09_Jul_21_W28.pdf', 'Nigeria_16_Jul_21_W29.pdf', 'Nigeria_30_Jul_21_W31.pdf', 'Nigeria_13_Aug_21_W33.pdf', 'Nigeria_20_Aug_21_W34.pdf', 'Nigeria_27_Aug_21_W35.pdf', 'Nigeria_03_Sep_21_W36.pdf', 'Nigeria_10_Sep_21_W37.pdf', 'Nigeria_17_Sep_21_W38.pdf', 'Nigeria_24_Sep_21_W39.pdf', 

In [22]:
import csv

data = [
    ['Year', 'Week', 'Status', 'Notes'],
    ['2024', 'W16', 'Missing', 'Duplicate of W17'],
    ['2023', 'W1', 'Found', 'Website link was wrong'],
    ['2023', 'W38', 'Found', 'Website link was wrong'],
    ['2022', 'W21', 'Missing', 'Not found'],
    ['2022', 'W22', 'Found', 'Initially missing but found'],
    ['2022', 'W2', 'Corrupted', 'Need removal'],
    ['2022', 'W14', 'Corrupted', 'Need removal'],
    ['2022', 'W19', 'Corrupted', 'Need removal'],
    ['2022', 'W20', 'Corrupted', 'Need removal'],
    ['2022', 'W23', 'Corrupted', 'Need removal'],
    ['2021', 'W24', 'Missing', 'Not found'],
    ['2021', 'W21', 'Corrupted', 'Need removal'],
    ['2021', 'W23', 'Corrupted', 'Need removal'],
    ['2021', 'W30', 'Corrupted', 'Need removal'],
    ['2021', 'W32', 'Corrupted', 'Need removal']
]

file_name = 'file_status.csv'

# Option 1: Use absolute path
BASE_DIR = Path('/Users/arturtrebski/Documents/Lassa_Reports_Scraping')

#FOLDER = BASE_DIR / 'data' / 'raw' / 'year' / '2021'
FOLDER = BASE_DIR / 'data' / 'documentation'

output_file = FOLDER / file_name

with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(data)

print(f"Created {output_file} with {len(data)-1} entries")

Created /Users/arturtrebski/Documents/Lassa_Reports_Scraping/data/documentation/file_status.csv with 15 entries


In [None]:
import os
import re
all_pdfs = [f for f in os.listdir("PDFs") if f.endswith(".pdf")]
pdfs_2022 = [f for f in all_pdfs if "_22_W" in f]

sorted_pdfs = sorted(pdfs_2022, key=lambda x: int(re.search(r'_W(\d+)\.pdf$', x).group(1)))

print("Sorted PDFs for 2022:", sorted_pdfs)
print("Number of PDFs for 2022:", len(sorted_pdfs))
# Calculate missing week numbers
all_weeks = set(range(1, 53))  # Weeks 1-52
existing_week_numbers = set(int(re.search(r'_W(\d+)\.pdf$', pdf).group(1)) for pdf in pdfs_2022)
missing_weeks = sorted(all_weeks - existing_week_numbers)

print("\nMissing weeks for 2022:", missing_weeks)
print("Number of missing weeks:", len(missing_weeks))

# Create the target directory if it doesn't exist
os.makedirs("2022/PDFs_2022", exist_ok=True)

# Copy each 2022 PDF to the target directory
for pdf in sorted_pdfs:
    src_path = os.path.join("PDFs", pdf)
    dst_path = os.path.join("2022/PDFs_2022", pdf)
    if os.path.exists(src_path):
        shutil.copy2(src_path, dst_path)
        print(f"Copied {pdf} to 2022/PDFs_2022/")
    else:
        print(f"Source file not found: {src_path}")

## PDFs for 2021


In [68]:
import os
import re
all_pdfs = [f for f in os.listdir("PDFs") if f.endswith(".pdf")]
pdfs_2021 = [f for f in all_pdfs if "_21_W" in f]

sorted_pdfs = sorted(pdfs_2021, key=lambda x: int(re.search(r'_W(\d+)\.pdf$', x).group(1)))

print("Sorted PDFs for 2021:", sorted_pdfs)
print("Number of PDFs for 2021:", len(sorted_pdfs))
# Calculate missing week numbers
all_weeks = set(range(1, 53))  # Weeks 1-52
existing_week_numbers = set(int(re.search(r'_W(\d+)\.pdf$', pdf).group(1)) for pdf in pdfs_2021)
missing_weeks = sorted(all_weeks - existing_week_numbers)

print("\nMissing weeks for 2021:", missing_weeks)
print("Number of missing weeks:", len(missing_weeks))

# Create the target directory if it doesn't exist
os.makedirs("2021/PDFs_2021", exist_ok=True)

# Copy each 2021 PDF to the target directory
for pdf in sorted_pdfs:
    src_path = os.path.join("PDFs", pdf)
    dst_path = os.path.join("2021/PDFs_2021", pdf)
    if os.path.exists(src_path):
        shutil.copy2(src_path, dst_path)
        print(f"Copied {pdf} to 2021/PDFs_2021/")
    else:
        print(f"Source file not found: {src_path}")

Sorted PDFs for 2021: ['Nigeria_01_Jan_21_W1.pdf', 'Nigeria_08_Jan_21_W2.pdf', 'Nigeria_15_Jan_21_W3.pdf', 'Nigeria_22_Jan_21_W4.pdf', 'Nigeria_29_Jan_21_W5.pdf', 'Nigeria_05_Feb_21_W6.pdf', 'Nigeria_12_Feb_21_W7.pdf', 'Nigeria_19_Feb_21_W8.pdf', 'Nigeria_26_Feb_21_W9.pdf', 'Nigeria_05_Mar_21_W10.pdf', 'Nigeria_12_Mar_21_W11.pdf', 'Nigeria_19_Mar_21_W12.pdf', 'Nigeria_26_Mar_21_W13.pdf', 'Nigeria_02_Apr_21_W14.pdf', 'Nigeria_09_Apr_21_W15.pdf', 'Nigeria_16_Apr_21_W16.pdf', 'Nigeria_23_Apr_21_W17.pdf', 'Nigeria_30_Apr_21_W18.pdf', 'Nigeria_07_May_21_W19.pdf', 'Nigeria_14_May_21_W20.pdf', 'Nigeria_21_May_21_W21.pdf', 'Nigeria_28_May_21_W22.pdf', 'Nigeria_04_Jun_21_W23.pdf', 'Nigeria_18_Jun_21_W25.pdf', 'Nigeria_25_Jun_21_W26.pdf', 'Nigeria_02_Jul_21_W27.pdf', 'Nigeria_09_Jul_21_W28.pdf', 'Nigeria_16_Jul_21_W29.pdf', 'Nigeria_23_Jul_21_W30.pdf', 'Nigeria_30_Jul_21_W31.pdf', 'Nigeria_06_Aug_21_W32.pdf', 'Nigeria_13_Aug_21_W33.pdf', 'Nigeria_20_Aug_21_W34.pdf', 'Nigeria_27_Aug_21_W35.pdf', 

# Main script

This code works well now! 
Crops images to the bottom of the table avoding the legend and also places vertical lines at correct positions. 2 of the tables had issues with the green rows being detected but can address that later.

### Testing code 

In [None]:
pdf = sorted_pdfs[1]
input_pdf = os.path.join("PDFs_2023", pdf)
output_path = os.path.join("PDFs_Lines_2023", f"Lines_{pdf.replace('.pdf','')}_page3.png")
pdf_path = input_pdf
h1 = 40
s1 = 0
v1 = 210
h2 = 50
s2 = 30
v2 = 255
tr1 = 1400
linelength1 = 79
linegap1 = 50
toler1 = 10
page_number = 3
dpi = 600

doc = fitz.open(pdf_path)
page = doc[page_number]

# 1. Render the PDF page at high DPI
pix = page.get_pixmap(dpi=dpi)
img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
# Convert PIL Image to OpenCV BGR
img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

height1, width1 = img.shape[:2]
total_pixels = height1 * width1
print("Total pixels =", total_pixels)
print("Width =", width1)

# 2. Convert to HSV & detect green rows
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
target_hsv = np.array([102, 12.66, 92.94], dtype=np.uint8)
tolerance = toler1
lower_green = np.array([h1,  s1,  v1], dtype=np.uint8)  # loosen Hue to ~35–100
upper_green = np.array([h2, s2, v2], dtype=np.uint8)  # allow low Saturation up to 80

#lower_green = np.array([max(0, target_hsv[0] - tolerance), 50, 50]) #original approach
#upper_green = np.array([min(179, target_hsv[0] + tolerance), 255, 255]) #original approach
green_mask = cv2.inRange(hsv, lower_green, upper_green)
green_mask_pil = Image.fromarray(green_mask)
green_mask_pil.save("debug_green_mask.png")

overlay = img.copy()
# Paint those masked pixels bright green in overlay
overlay[green_mask > 0] = [0, 255, 0]
# Blend overlay with original (alpha blending)
alpha = 0.35
overlayed_img = cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0)
overlayed_pil = Image.fromarray(cv2.cvtColor(overlayed_img, cv2.COLOR_BGR2RGB))
overlayed_pil.save("debug_green_overlay.png")
print("Saved overlay image as debug_green_overlay.png")


h_proj_green = np.sum(green_mask, axis=1)
print(h_proj_green)
# Find indices where h_proj_green > 0
non_zero_indices = np.where(h_proj_green > 0)[0]
# Print just the non-zero values
print("Non-zero h_proj_green values:")
#for i in non_zero_indices:
    #print(f"Index={i}, Value={h_proj_green[i]}")        
green_row_indices = np.where(h_proj_green > 500000)[0]  # Original value is 1000 
print(green_row_indices)
if len(green_row_indices) == 0:
    print("No green rows detected.")
    print(pdf_path)
    #return

if len(green_row_indices) == 0:
    top_boundary = 800
    bottom_boundary = 4500
else:
    top_boundary = green_row_indices[1]
    bottom_boundary = green_row_indices[-1] 

# 3. Header Row Detection (just above top_boundary)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
header_region = gray[:top_boundary, :]

_, binary_header = cv2.threshold(
    header_region, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
)
h_proj_header = np.sum(binary_header, axis=1)

header_bottom = 0
for i in range(len(h_proj_header) - 1, 0, -1):
    if h_proj_header[i] > 40:
        header_bottom = i
        break

# 4. Adaptive Thresholding in the table region
table_region = gray[top_boundary:bottom_boundary, :]
thresh_table = cv2.adaptiveThreshold(
    table_region,
    255,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY_INV,
    11,
    3
)

#kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 40))
#cleaned = cv2.morphologyEx(thresh_table, cv2.MORPH_OPEN, kernel)

# 5. Hough Lines to find vertical lines (using the filtered 'cleaned' image)
lines = cv2.HoughLinesP(
    thresh_table,
    1,
    np.pi / 180,
    threshold=tr1,
    minLineLength=linelength1,
    maxLineGap=linegap1
)
vertical_lines = []
if lines is not None:
    for line in lines:
        x1, y1, x2, y2 = line[0]
        if abs(x2 - x1) < 5:
            vertical_lines.append((x1, top_boundary, x2, top_boundary))

# 6. Draw vertical lines on the image
for x1, y1, x2, y2 in vertical_lines:
    cv2.line(img, (x1, top_boundary), (x2, bottom_boundary+20), (100, 100, 100), 2)


# Top boundary (green)
height, width = img.shape[:2]
cv2.line(img, (0, top_boundary), (width, top_boundary), (0, 255, 0), 2)

# Bottom boundary (green)
cv2.line(img, (0, bottom_boundary), (width, bottom_boundary), (0, 255, 0), 2)
# Crop the image so that it ends at bottom_boundary + 40
crop_bottom = bottom_boundary + 40
crop_bottom = min(crop_bottom, img.shape[0])
new_width = int(img.shape[1] * 0.58)  # keep left 58% of the image
img_cropped = img[:crop_bottom, :new_width]

# 7. Convert back to PIL and save
output_pil = Image.fromarray(cv2.cvtColor(img_cropped, cv2.COLOR_BGR2RGB))
output_pil.save(output_path)

### Adding lines to PDFs 2023

In [None]:
all_pdfs = [f for f in os.listdir("PDFs") if f.endswith(".pdf")]
pdfs_2023 = [f for f in all_pdfs if "_23_W" in f]

sorted_pdfs = sorted(pdfs_2023, key=lambda x: int(re.search(r'_W(\d+)\.pdf$', x).group(1)))

# Limit to the top 3 sorted PDFs if needed
# sorted_pdfs = sorted_pdfs[0:3]

for pdf in sorted_pdfs:
    input_pdf = os.path.join("PDFs_2023", pdf)
    output_img = os.path.join("PDFs_Lines_2023", f"Lines_{pdf.replace('.pdf','')}_page3.png")
    enhance_table_lines_from_pdf_hq(input_pdf,
                                    output_img,
                                    h1=40, s1=0, v1=210,
                                    h2=50, s2=30, v2=255,
                                    tr1=1400,
                                    linelength1=79,
                                    linegap1=50,
                                    toler1 = 10,
                                    page_number=3,
                                    dpi=600) 

### Adding lines to PDFs 2024

In [62]:
import shutil

all_pdfs = [f for f in os.listdir("PDFs") if f.endswith(".pdf")]
pdfs_2024 = [f for f in all_pdfs if "_24_W" in f]

sorted_pdfs = sorted(pdfs_2024, key=lambda x: int(re.search(r'_W(\d+)\.pdf$', x).group(1)))

print("Sorted PDFs for 2024:", sorted_pdfs)

for pdf in sorted_pdfs:
    input_pdf = os.path.join("PDFs_2024", pdf)
    output_img = os.path.join("PDFs_Lines_2024", f"Lines_{pdf.replace('.pdf','')}_page3.png")
    enhance_table_lines_from_pdf_hq(input_pdf,
                                    output_img,
                                    h1=40, s1=0, v1=210,
                                    h2=50, s2=30, v2=255,
                                    tr1=1400,
                                    linelength1=79,
                                    linegap1=50,
                                    toler1 = 10,
                                    page_number=3,
                                    dpi=600) 

Sorted PDFs for 2024: ['Nigeria_04_Jan_24_W1.pdf', 'Nigeria_11_Jan_24_W2.pdf', 'Nigeria_18_Jan_24_W3.pdf', 'Nigeria_25_Jan_24_W4.pdf', 'Nigeria_01_Feb_24_W5.pdf', 'Nigeria_08_Feb_24_W6.pdf', 'Nigeria_15_Feb_24_W7.pdf', 'Nigeria_22_Feb_24_W8.pdf', 'Nigeria_29_Feb_24_W9.pdf', 'Nigeria_07_Mar_24_W10.pdf', 'Nigeria_14_Mar_24_W11.pdf', 'Nigeria_21_Mar_24_W12.pdf', 'Nigeria_28_Mar_24_W13.pdf', 'Nigeria_04_Apr_24_W14.pdf', 'Nigeria_11_Apr_24_W15.pdf', 'Nigeria_18_Apr_24_W16.pdf', 'Nigeria_25_Apr_24_W17.pdf', 'Nigeria_02_May_24_W18.pdf', 'Nigeria_09_May_24_W19.pdf', 'Nigeria_16_May_24_W20.pdf', 'Nigeria_23_May_24_W21.pdf', 'Nigeria_30_May_24_W22.pdf', 'Nigeria_06_Jun_24_W23.pdf', 'Nigeria_13_Jun_24_W24.pdf', 'Nigeria_20_Jun_24_W25.pdf', 'Nigeria_27_Jun_24_W26.pdf', 'Nigeria_04_Jul_24_W27.pdf', 'Nigeria_11_Jul_24_W28.pdf', 'Nigeria_18_Jul_24_W29.pdf', 'Nigeria_25_Jul_24_W30.pdf', 'Nigeria_01_Aug_24_W31.pdf', 'Nigeria_08_Aug_24_W32.pdf', 'Nigeria_15_Aug_24_W33.pdf', 'Nigeria_22_Aug_24_W34.pdf', 

# Google AI LLMs

In [None]:
pip install google-generativeai

In [None]:
pip install google.genai

In [65]:
from google import genai
from google.genai import types
import PIL.Image

load_dotenv()

image = PIL.Image.open('PDFs_Lines_Test/Lines_Nigeria_21_Mar_24_W12_page3.png')

client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=["What is this image?", image])

print(response.text)

The image is a table titled "Lassa Fever Situation Report" showing the weekly and cumulative number of suspected and confirmed cases for 2024. The data is broken down by state and includes information on suspected cases, confirmed cases, trend, probable cases among healthcare workers (HCW), and deaths. The table covers Epi Week 12 of 2024 and the cumulative data for weeks 1-12.



## Option 1 - outputting CSV-formatted text

In [None]:
import os
import glob
from dotenv import load_dotenv
from google import genai
from PIL import Image

# Load the API key from the .env file
load_dotenv()

# Configure the Gemini client with your API key
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Define the prompt that instructs the model how to extract the table data.
prompt_template = """
The provided image contains a table with a section labeled "Current Week". Your task is to extract the data from this section only and ignore any cumulative columns.
The "Current Week" section has the following columns in this exact left-to-right order:
1. States
2. Suspected
3. Confirmed
4. Trend
5. Probable
6. HCW*
7. Deaths (Confirmed Cases)

Extract the numbers (or values) located under each column header. Return the results as a CSV-formatted table where:
- The first row contains the headers exactly as listed above.
- Each subsequent row corresponds to a State.
- The last row is for the "Total" for all States.
Ensure that all columns are present in the output, even if some cells are blank.
Output the table in the same column order as given above.
"""

# Folder containing the PNG images
image_folder = "PDFs_Lines_Test"

# Get list of PNG images from the folder
image_paths = glob.glob(os.path.join(image_folder, "*.png"))

if not image_paths:
    print(f"No PNG images found in the folder '{image_folder}'. Please check the folder name and path.")
else:
    for image_path in image_paths:
        print("Processing image:", image_path)
        
        # Open the image using Pillow
        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"Error opening image {image_path}: {e}")
            continue

        # Call the Gemini model. We send the prompt along with the image as context.
        # (Depending on the API, the image can be passed as part of the contents list.)
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                # Pass a list containing the prompt and the image.
                # (The library is designed to accept multiple contents; if not, you may need to adapt this.)
                contents=[prompt_template, image]
            )
        except Exception as e:
            print(f"Error during API call for image {image_path}: {e}")
            continue

        # Print the extracted table data
        print("Extracted table data:")
        print(response.text)
        print("-" * 80)


## Option 2 - using structured JSON outputs

In [None]:
import os
import glob
from dotenv import load_dotenv
from google import genai
from PIL import Image
from pydantic import BaseModel, Field

# Load the API key from the .env file
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY is not set in your .env file. Please add it and restart the notebook.")

# Initialize the Gemini client with your API key
client = genai.Client(api_key=api_key)

# Define the Pydantic model for one row of the table.
# We use Field aliases so that the JSON keys match the column names exactly.
class TableRow(BaseModel):
    States: str = Field(..., alias="States")
    Suspected: str = Field(..., alias="Suspected")
    Confirmed: str = Field(..., alias="Confirmed")
    Trend: str = Field(..., alias="Trend")
    Probable: str = Field(..., alias="Probable")
    HCW: str = Field(..., alias="HCW*")
    Deaths: str = Field(..., alias="Deaths (Confirmed Cases)")

# Define the prompt with instructions to extract JSON formatted output.
prompt_template = """
The provided image contains a table with a section labeled "Current Week". Your task is to extract the data from this section only and ignore any cumulative columns.
The "Current Week" section has the following columns in this exact left-to-right order:
1. States
2. Suspected
3. Confirmed
4. Trend
5. Probable
6. HCW*
7. Deaths (Confirmed Cases)

Extract the values located under each column header and return the results in JSON format.
Return a JSON list of objects, where each object corresponds to one row of the table.
Each object must have the following keys (exactly in this order):
"States", "Suspected", "Confirmed", "Trend", "Probable", "HCW*", "Deaths (Confirmed Cases)"
Include one object per state, and the last object should correspond to the "Total" row.
Ensure that all keys are present in every object, even if some values are blank.
Output the JSON in valid format.
"""

# Folder containing the PNG images
image_folder = "PDFs_Lines_Test"  # Adjust this to your local folder name

# Get list of PNG images from the folder
image_paths = glob.glob(os.path.join(image_folder, "*.png"))
if not image_paths:
    print(f"No PNG images found in the folder '{image_folder}'. Please check the folder name and path.")
else:
    for image_path in image_paths:
        print("Processing image:", image_path)
        
        # Open the image using Pillow
        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"Error opening image {image_path}: {e}")
            continue

        # Call the Gemini model using structured JSON output.
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt_template, image],
                config={
                    "response_mime_type": "application/json",
                    "response_schema": list[TableRow],
                }
            )
        except Exception as e:
            print(f"Error during API call for image {image_path}: {e}")
            continue

        # Print the raw JSON response text
        print("Raw JSON response:")
        print(response.text)
        
        # Parse the response into TableRow objects (if supported)
        try:
            # The client library may provide a parsed attribute containing Pydantic objects.
            table_rows = response.parsed  # Expected to be a list of TableRow objects.
            print("Extracted Table Data (parsed):")
            for row in table_rows:
                # Use .dict(by_alias=True) to output keys as specified in the schema.
                print(row.dict(by_alias=True))
        except Exception as e:
            print(f"Error parsing response for image {image_path}: {e}")
        print("-" * 80)


## Option 3 - same as option 2 but saving combined json as CSV

In [None]:
import os
import glob
import csv
from dotenv import load_dotenv
from google import genai
from PIL import Image
from pydantic import BaseModel, Field

# Load the API key from the .env file
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY is not set in your .env file. Please add it and restart the notebook.")

# Initialize the Gemini client with your API key
client = genai.Client(api_key=api_key)

# Define the Pydantic model for one row of the table.
# Field aliases are used so that the JSON keys match the column names exactly.
class TableRow(BaseModel):
    States: str = Field(..., alias="States")
    Suspected: str = Field(..., alias="Suspected")
    Confirmed: str = Field(..., alias="Confirmed")
    Trend: str = Field(..., alias="Trend")
    Probable: str = Field(..., alias="Probable")
    HCW: str = Field(..., alias="HCW*")
    Deaths: str = Field(..., alias="Deaths (Confirmed Cases)")

# Define the prompt with instructions to extract JSON formatted output.
prompt_template = """
The provided image contains a table with a section labeled "Current Week". Your task is to extract the data from this section only and ignore any cumulative columns.
The "Current Week" section has the following columns in this exact left-to-right order:
1. States
2. Suspected
3. Confirmed
4. Trend
5. Probable
6. HCW*
7. Deaths (Confirmed Cases)

Extract the values located under each column header and return the results in JSON format.
Return a JSON list of objects, where each object corresponds to one row of the table.
Each object must have the following keys (exactly in this order):
"States", "Suspected", "Confirmed", "Trend", "Probable", "HCW*", "Deaths (Confirmed Cases)".
"Trend" column may contain either blank cells, or one of two types of triangles: ▲ (Up, red triangle) or ▼ (Down, green triangle). You should input either "Up" or "Down" as a value.
Include one object per state, and the last object should correspond to the "Total" row.
Ensure that all keys are present in every object, even if some values are blank.
Output the JSON in valid format.
"""

# Folder containing the PNG images
image_folder = "PDFs_Lines_Test"  # Adjust this to your local folder name

# Get list of PNG images from the folder
image_paths = glob.glob(os.path.join(image_folder, "*.png"))
if not image_paths:
    print(f"No PNG images found in the folder '{image_folder}'. Please check the folder name and path.")
else:
    # Define the CSV header in the exact order we expect:
    fieldnames = [
        "States",
        "Suspected",
        "Confirmed",
        "Trend",
        "Probable",
        "HCW*",
        "Deaths (Confirmed Cases)"
    ]
    
    for image_path in image_paths:
        print("Processing image:", image_path)
        
        # Open the image using Pillow
        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"Error opening image {image_path}: {e}")
            continue

        # Call the Gemini model using structured JSON output.
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt_template, image],
                config={
                    "response_mime_type": "application/json",
                    "response_schema": list[TableRow],
                }
            )
        except Exception as e:
            print(f"Error during API call for image {image_path}: {e}")
            continue

        # Print the raw JSON response text for debugging
        print("Raw JSON response:")
        print(response.text)
        
        # Parse the response into TableRow objects (if supported)
        try:
            table_rows = response.parsed  # Expected to be a list of TableRow objects.
            print("Extracted Table Data (parsed):")
            for row in table_rows:
                print(row.dict(by_alias=True))
        except Exception as e:
            print(f"Error parsing response for image {image_path}: {e}")
            continue
        
        # Create a CSV filename based on the image filename and save into the CSV_LF folder
        base_filename = os.path.splitext(os.path.basename(image_path))[0]
        csv_filename = os.path.join("CSV_LF", f"{base_filename}.csv")
        
        # Write the extracted data to CSV
        try:
            with open(csv_filename, mode="w", newline="", encoding="utf-8") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                for row in table_rows:
                    # Convert each TableRow to a dict using aliases to preserve key names.
                    writer.writerow(row.dict(by_alias=True))
            print(f"CSV file saved as: {csv_filename}")
        except Exception as e:
            print(f"Error writing CSV for image {image_path}: {e}")
        
        print("-" * 80)


## Option 4 - 2 API calls per image to ensure accuracy

In [None]:
import os
import glob
import csv
from dotenv import load_dotenv
from google import genai
from PIL import Image
from pydantic import BaseModel, Field

# Load the API key from the .env file
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY is not set in your .env file. Please add it and restart the notebook.")

# Initialize the Gemini client with your API key
client = genai.Client(api_key=api_key)

# Define the Pydantic model for one row of the table.
class TableRow(BaseModel):
    States: str = Field(..., alias="States")
    Suspected: str = Field(..., alias="Suspected")
    Confirmed: str = Field(..., alias="Confirmed")
    Trend: str = Field(..., alias="Trend")
    Probable: str = Field(..., alias="Probable")
    HCW: str = Field(..., alias="HCW*")
    Deaths: str = Field(..., alias="Deaths (Confirmed Cases)")

# Define the prompt with instructions to extract JSON formatted output.
prompt_template = """
The provided image contains a table with a section labeled "Current Week". Your task is to extract the data from this section only.
The "Current Week" section has the following columns in this exact left-to-right order:
1. States
2. Suspected
3. Confirmed
4. Trend
5. Probable
6. HCW*
7. Deaths (Confirmed Cases)

Extract the values located under each column header and return the results in JSON format.
Return a JSON list of objects, where each object corresponds to one row of the table.

Each object must have the following keys (exactly in this order):
"States", "Suspected", "Confirmed", "Trend", "Probable", "HCW*", "Deaths (Confirmed Cases)".

"States" corresponds to the states of Nigeria: Ondo, Edo, Bauchi, Taraba, Benue, Ebonyi, Kogi, Kaduna, Plateau, Enugu, Cross River, Rivers, Delta, Nasarawa, Anambra, Gombe, Niger, Imo, Jigawa, Bayelsa, Adamawa, Fct, Katsina, Kano, Oyo, Lagos, Ogun, Yobe, Sokoto, Kebbi, Zamfara, Akwa Ibom, Ekiti, Kwara, Borno, Osun, Abia. The last row should correspond to the "Total" for all states.
If a row is blank and has no value in "States" column, you should omit it.

"Trend" column may contain either blank cells, or one of two types of triangles: ▲ (Up, red triangle) or ▼ (Down, green triangle). You should input either "Up" or "Down" as a value.

Include one object per state, and the last object should correspond to the "Total" row.
Ensure that all keys are present in every object, even if some values are blank.
Output the JSON in valid format.
"""

# Folder containing the PNG images
image_folder = "PDFs_Lines_Test"  # Adjust this to your local folder name

# Get list of PNG images from the folder
image_paths = glob.glob(os.path.join(image_folder, "*.png"))
if not image_paths:
    print(f"No PNG images found in the folder '{image_folder}'. Please check the folder name and path.")
else:
    # Define the CSV header in the exact order we expect:
    fieldnames = [
        "States",
        "Suspected",
        "Confirmed",
        "Trend",
        "Probable",
        "HCW*",
        "Deaths (Confirmed Cases)"
    ]
    
    for image_path in image_paths:
        print("Processing image:", image_path)
        
        # Open the image using Pillow
        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"Error opening image {image_path}: {e}")
            continue

        # Process the image twice
        responses = []
        for i in range(2):
            try:
                response = client.models.generate_content(
                    model="gemini-2.0-flash",
                    contents=[prompt_template, image],
                    config={
                        "response_mime_type": "application/json",
                        "response_schema": list[TableRow],
                    }
                )
                responses.append(response)
            except Exception as e:
                print(f"Error during API call for image {image_path} on iteration {i+1}: {e}")
                responses = []
                break  # Skip to next image if there's an error

        if len(responses) != 2:
            print("Skipping image due to API call errors.")
            continue

        # Parse the responses into lists of TableRow objects
        try:
            table_rows_1 = responses[0].parsed  # List[TableRow]
            table_rows_2 = responses[1].parsed  # List[TableRow]
        except Exception as e:
            print(f"Error parsing responses for image {image_path}: {e}")
            continue

        # Convert both lists into lists of dictionaries
        dict_rows_1 = [row.dict(by_alias=True) for row in table_rows_1]
        dict_rows_2 = [row.dict(by_alias=True) for row in table_rows_2]

        # Compare the two outputs
        if dict_rows_1 == dict_rows_2:
            print("Both outputs are identical. Saving CSV.")
            # Create a CSV filename based on the image filename
            base_filename = os.path.splitext(os.path.basename(image_path))[0]
            csv_filename = os.path.join("CSV_LF", f"{base_filename}.csv")
            # Write the extracted data to CSV
            try:
                with open(csv_filename, mode="w", newline="", encoding="utf-8") as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writeheader()
                    for row in dict_rows_1:
                        writer.writerow(row)
                print(f"CSV file saved as: {csv_filename}")
            except Exception as e:
                print(f"Error writing CSV for image {image_path}: {e}")
        else:
            print("Outputs differ between iterations for image:", image_path)
            # Identify differing rows
            min_len = min(len(dict_rows_1), len(dict_rows_2))
            differences_found = False
            
            for i in range(min_len):
                if dict_rows_1[i] != dict_rows_2[i]:
                    differences_found = True
                    print(f"Difference in row {i+1}:")
                    print("Iteration 1:", dict_rows_1[i])
                    print("Iteration 2:", dict_rows_2[i])
            
            # Check for any extra rows
            if len(dict_rows_1) > min_len:
                differences_found = True
                print("Additional rows in iteration 1:")
                for i in range(min_len, len(dict_rows_1)):
                    print(f"Row {i+1}:", dict_rows_1[i])
            if len(dict_rows_2) > min_len:
                differences_found = True
                print("Additional rows in iteration 2:")
                for i in range(min_len, len(dict_rows_2)):
                    print(f"Row {i+1}:", dict_rows_2[i])
            
            if not differences_found:
                print("No individual row differences were found, despite overall inequality.")
            
            print("Skipping CSV saving for this image due to inconsistency.")
        
        print("-" * 80)

Processing image: PDFs_Lines_Test/Lines_Nigeria_16_Dec_24_W51_page3.png
Both outputs are identical. Saving CSV.
CSV file saved as: CSV_LF/Lines_Nigeria_16_Dec_24_W51_page3.csv
--------------------------------------------------------------------------------
Processing image: PDFs_Lines_Test/Lines_Nigeria_29_Aug_24_W35_page3.png
Both outputs are identical. Saving CSV.
CSV file saved as: CSV_LF/Lines_Nigeria_29_Aug_24_W35_page3.csv
--------------------------------------------------------------------------------
Processing image: PDFs_Lines_Test/Lines_Nigeria_27_Jun_24_W26_page3.png
Both outputs are identical. Saving CSV.
CSV file saved as: CSV_LF/Lines_Nigeria_27_Jun_24_W26_page3.csv
--------------------------------------------------------------------------------
Processing image: PDFs_Lines_Test/Lines_Nigeria_21_Mar_24_W12_page3.png
Both outputs are identical. Saving CSV.
CSV file saved as: CSV_LF/Lines_Nigeria_21_Mar_24_W12_page3.csv
-----------------------------------------------------

1880

# Google Vision

0.5787878787878787

In [57]:
import io
import os
from google.cloud import vision
from google.cloud.vision_v1 import types
import pandas as pd
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

True

In [None]:
import io
import os
from google.cloud import vision
from google.cloud.vision_v1 import types
import pandas as pd
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

# --- Configuration ---
IMAGE_FOLDER = 'PDFs_Lines_Test'  # Folder containing your PNG images.  Relative path is OK.


def detect_text_and_tables(image_path):
    """Detects text and attempts basic table structure extraction from an image.

    Args:
        image_path: Path to the image file.

    Returns:
        A tuple: (extracted_text, table_data)
        - extracted_text:  Raw text extracted from the image (string).
        - table_data: A list of lists representing the extracted table data.
                      Each inner list represents a row.  Returns None if no table
                      structure is reasonably detected.
    """
    client = vision.ImageAnnotatorClient()

    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = types.Image(content=content)

    # Perform text detection (OCR)
    response = client.document_text_detection(image=image)
    extracted_text = response.full_text_annotation.text
    
    # --- Attempt Table Extraction ---
    table_data = extract_table_data(response)  # Use the helper function

    return extracted_text, table_data

def extract_table_data(response):
    """
    Extracts table data from the Google Cloud Vision API response. This uses
    block structure, paragraph and word structure, row and column detection.

    Args:
        response:  The full response from client.document_text_detection().

    Returns:
        A list of lists representing the extracted table data, or None if
        no table-like structure is found.
    """
    blocks = response.full_text_annotation.pages[0].blocks
    
    # 1. Organize bounding box information for efficient lookup.
    bounding_boxes = {}  # Key: (page, block, paragraph, word), Value: BoundingBox
    for page_num, page in enumerate(response.full_text_annotation.pages):
        for block_num, block in enumerate(page.blocks):
            for paragraph_num, paragraph in enumerate(block.paragraphs):
                for word_num, word in enumerate(paragraph.words):
                    bounding_boxes[(page_num, block_num, paragraph_num, word_num)] = word.bounding_box
    
    #Helper function to get bounding vertices for paragraphs
    def get_paragraph_bounds(paragraph):
      """Extracts bounding box vertices from a paragraph."""
      min_x = float('inf')
      min_y = float('inf')
      max_x = float('-inf')
      max_y = float('-inf')

      for word in paragraph.words:
        for symbol in word.symbols:
          for vertex in symbol.bounding_box.vertices:
            min_x = min(min_x, vertex.x)
            min_y = min(min_y, vertex.y)
            max_x = max(max_x, vertex.x)
            max_y = max(max_y, vertex.y)
      return min_x, min_y, max_x, max_y

    # 2. Create data structure for paragraphs (rows)
    paragraphs_data = []
    
    for page_num, page in enumerate(response.full_text_annotation.pages):
      for block_num, block in enumerate(page.blocks):
          for paragraph_num, paragraph in enumerate(block.paragraphs):
            min_x, min_y, max_x, max_y = get_paragraph_bounds(paragraph) # Get Paragraph bounds
            paragraph_text = ""
            for word in paragraph.words:
                for symbol in word.symbols:
                    paragraph_text += symbol.text
            paragraphs_data.append({
                'text': paragraph_text,
                'min_x': min_x,
                'min_y': min_y,
                'max_x': max_x,
                'max_y': max_y,
            })
        
    # 3. Sort paragraphs_data by 'min_y' (top to bottom), then by 'min_x' (left to right)
    paragraphs_data.sort(key=lambda p: (p['min_y'], p['min_x']))
    
    
    #4. Row detection
    
    rows = []
    current_row = []
    if paragraphs_data:
        # Initialize with the y-coordinate of the first paragraph
        current_row_y_center = paragraphs_data[0]['min_y']
        row_height_estimate = paragraphs_data[0]['max_y'] - paragraphs_data[0]['min_y'] #initial height estimation
        
        tolerance_y = row_height_estimate * 0.5 # Allow the row height to vary 
        
        for paragraph in paragraphs_data:
            paragraph_y_center = (paragraph['min_y'] + paragraph['max_y']) / 2
            
            if abs(paragraph_y_center - current_row_y_center) <= tolerance_y :
                current_row.append(paragraph)
                
            else:
                # Sort current row by x coordinate before saving it
                current_row.sort(key=lambda p: p['min_x'])
                rows.append(current_row)
                current_row = [paragraph]  # start new row
                current_row_y_center = paragraph_y_center  #update Y of current row
                row_height_estimate = paragraph['max_y'] - paragraph['min_y']
                tolerance_y = row_height_estimate * 0.5 #update the tolerance
    
    # Append the last row if it's not empty
    if current_row:
        current_row.sort(key=lambda p: p['min_x'])
        rows.append(current_row)
        
    
    # 5. Convert to list of lists (text only) for the table
    table_data = []
    for row in rows:
        row_data = [p['text'] for p in row]
        table_data.append(row_data)
    
    if not table_data:
        return None  # No table-like structure found
    return table_data


def process_images(image_folder):
    """Processes all PNG images in a folder, extracting text and tables."""

    all_results = {}  # Store results for each image

    for filename in os.listdir(image_folder):
        if filename.lower().endswith('.png'):
            image_path = os.path.join(image_folder, filename)
            print(f"Processing: {filename}")

            extracted_text, table_data = detect_text_and_tables(image_path)

            all_results[filename] = {
                'text': extracted_text,
                'table': table_data
            }

            # Print results (optional, for immediate feedback)
            print(f"Extracted Text:\n{extracted_text}\n")

            if table_data:
                print("Extracted Table:")
                df = pd.DataFrame(table_data)
                print(df.to_string()) #Use Panda for better printing
            else:
                print("No table found.")
            print("-" * 40)

    return all_results



# --- Main Execution ---
if __name__ == '__main__':
    results = process_images(IMAGE_FOLDER)

    # --- Post-Processing (Example: Saving results to files) ---
    # You can now further process the 'results' dictionary.  For example:

    """ for filename, data in results.items():
        # Save extracted text to a .txt file
        text_filename = os.path.join(IMAGE_FOLDER, f"{filename.split('.')[0]}_text.txt")
        with open(text_filename, 'w', encoding='utf-8') as f:
            f.write(data['text'])

        # Save table data to a .csv file (if a table was found)
        if data['table']:
            table_filename = os.path.join(IMAGE_FOLDER, f"{filename.split('.')[0]}_table.csv")
            df = pd.DataFrame(data['table'])
            df.to_csv(table_filename, index=False, header=False)  # Save without row indices or headers """

In [45]:
from dotenv import load_dotenv
import io
import os
import cv2
import numpy as np
from google.cloud import vision
from google.cloud.vision_v1 import types
import pandas as pd

# --- Load Environment Variables ---
load_dotenv()

# --- Configuration ---
IMAGE_FOLDER = 'PDFs_Lines_Test'  # Your image folder

def detect_table_structure(image_path):
    """Detects table structure (lines) using OpenCV, handling no-line cases.

    Args:
        image_path: Path to the image.

    Returns:
        A tuple: (horizontal_lines, vertical_lines, image_gray).
        - horizontal_lines:  Detected horizontal lines or an empty list if none.
        - vertical_lines: Detected vertical lines or an empty list if none.
        - image_gray: The grayscale image (or None if loading failed).
    """
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not open or read image at {image_path}")

    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # --- Thresholding (Binarization) ---
    thresh = cv2.adaptiveThreshold(~image_gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2)

    # --- Horizontal Line Detection ---
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))
    horizontal_lines = cv2.erode(thresh, horizontal_kernel, iterations=3)
    horizontal_lines = cv2.dilate(horizontal_lines, horizontal_kernel, iterations=3)

    # --- Vertical Line Detection ---
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 25))
    vertical_lines = cv2.erode(thresh, vertical_kernel, iterations=3)
    vertical_lines = cv2.dilate(vertical_lines, vertical_kernel, iterations=3)
    
    return horizontal_lines, vertical_lines, image_gray


def get_table_cells(horizontal_lines, vertical_lines, image_gray):
    """
       Gets the bounding box coordinates of table cells based on detected lines.
       Creates and empty table of dimensions as detected by number of intersections

    Args:
        horizontal_lines:  Result from detect_table_structure.
        vertical_lines: Result from detect_table_structure.
        image_gray: Grayscale image.

    Returns:
        A list of lists representing the table cells' bounding box coordinates,
        where each inner list represents a row, and each element in the inner
        list is a tuple: (x1, y1, x2, y2).
    """

    # --- Combine Lines and Find Contours ---
    table_lines = cv2.addWeighted(horizontal_lines, 0.5, vertical_lines, 0.5, 0.0)
    table_lines = cv2.erode(~table_lines, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)), iterations=1)
    (T, thresh) = cv2.threshold(table_lines, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = contours[0] if len(contours) == 2 else contours[1]  # Handle different OpenCV versions

    # --- Get Bounding Boxes of Cells ---
    bounding_boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        bounding_boxes.append((x, y, x + w, y + h))
    
    # --- Sort Bounding Boxes (Top-to-Bottom, Then Left-to-Right) ---
    bounding_boxes.sort(key=lambda bbox: (bbox[1], bbox[0])) # Sort by y first, x second
    
    # --- Determine Table Dimensions ---
    # Estimate number of columns and rows based on lines, not contours
    #This is more accurate, and then we map extracted contours to the empty table
    
    lines_v = cv2.HoughLinesP(vertical_lines, 1, np.pi / 180, 200, minLineLength=20, maxLineGap=15)
    lines_h = cv2.HoughLinesP(horizontal_lines, 1, np.pi / 180, 200, minLineLength=20, maxLineGap=15)

    #Count the number of lines to estimate columns and rows.
    if lines_v is not None:
        num_columns = len(lines_v)
    else:
        num_columns = 0  # No Vertical Lines

    if lines_h is not None:
        num_rows = len(lines_h)
    else:
        num_rows = 0
    
    #Create Empty Table Structure of cells
    if (num_rows > 0) and (num_columns > 0):
        table_cells = [[None for _ in range(num_columns)] for _ in range(num_rows)]
        # --- Map Bounding Boxes to Cells ---
        # Fill in the table structure. We iterate the extracted rectangles and insert
        #them into an appropriate cell.
        
        for x1, y1, x2, y2 in bounding_boxes:
            cell_assigned = False
            for i in range(num_rows):
                for j in range(num_columns):
                    # Check if we already have a box in table cell
                    if table_cells[i][j] is None:
                        # Check if current coordinates intersect with the area that a particular table cell would cover
                        # Get Line coordinates (extrapolate a bit to not miss edge cases)
                        y_start = lines_h[i][0][1] - 5 if lines_h is not None else y1 # use y from the box as backup
                        y_end = lines_h[i+1][0][1] + 5 if i+1<len(lines_h) and lines_h is not None else y2  # use y from the box as backup
                        
                        x_start = lines_v[j][0][0] - 5 if lines_v is not None else x1 # use x from the box as backup
                        x_end = lines_v[j+1][0][0] + 5 if j+1 < len(lines_v) and lines_v is not None else x2 # use x from the box as backup
                        
                        
                        # Check for overlap
                        if (x1 >= x_start and x2 <= x_end and
                           y1 >= y_start and y2 <= y_end):
                            
                            #Store Bounding box into that cell.
                            table_cells[i][j] = (x1, y1, x2, y2)
                            cell_assigned = True
                            break   #exit column
                if cell_assigned:
                    break   # exit row
    else:
        table_cells = [] #return empty table if error in HoughLines

    return table_cells

def extract_text_from_cells(table_cells, image_gray, client):
    """Extracts text from each cell using Google Cloud Vision API.

    Args:
        table_cells: The list of lists representing cell bounding boxes.
        image_gray:  The grayscale image.
        client: The Google Cloud Vision API client.

    Returns:
        A list of lists representing the extracted text from each cell.
    """
    table_text = []

    for row_cells in table_cells:
        row_text = []
        for x1, y1, x2, y2 in row_cells:
            if (x1, y1, x2, y2) is not None:
                # Crop the cell region from the grayscale image
                cell_image = image_gray[y1:y2, x1:x2]

                # Convert the OpenCV image to bytes (required by Vision API)
                _, encoded_image = cv2.imencode('.png', cell_image)
                content = encoded_image.tobytes()
                image = types.Image(content=content)

                # Perform OCR on the cell image
                response = client.document_text_detection(image=image)
                cell_text = response.full_text_annotation.text if response.full_text_annotation else ""

                row_text.append(cell_text)
            else:
                row_text.append("") #handle empty cells
        table_text.append(row_text)

    return table_text


def detect_text_and_tables(image_path):
    """Main function to detect text and tables, handling errors."""
    try:
        horizontal_lines, vertical_lines, image_gray = detect_table_structure(image_path)
        table_cells = get_table_cells(horizontal_lines, vertical_lines, image_gray)

        client = vision.ImageAnnotatorClient()
        table_data = extract_text_from_cells(table_cells, image_gray, client)

        # Extract all text from the image (for the 'extracted_text' part)
        with io.open(image_path, 'rb') as image_file:
            content = image_file.read()
        image = types.Image(content=content)
        response = client.document_text_detection(image=image)
        extracted_text = response.full_text_annotation.text

        return extracted_text, table_data

    except ValueError as ve:
        print(f"ValueError: {ve}")
        return None, None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None, None


def process_images(image_folder):
    """Processes images and extracts text/tables."""
    all_results = {}

    for filename in os.listdir(image_folder):
        if filename.lower().endswith('.png'):
            image_path = os.path.join(image_folder, filename)
            print(f"Processing: {filename}")

            try:  # Add a try-except block here
                extracted_text, table_data = detect_text_and_tables(image_path)
            except TypeError as e:
                print(f"TypeError occurred: {e}")
                import pdb; pdb.set_trace()  # Add this line!
                extracted_text, table_data = None, None #placeholder to avoid further errors.

            if extracted_text is not None:
                all_results[filename] = {
                    'text': extracted_text,
                    'table': table_data
                }

                print(f"Extracted Text:\n{extracted_text}\n")
                if table_data:
                    print("Extracted Table:")
                    df = pd.DataFrame(table_data)
                    print(df.to_string())  # Use Pandas for better printing
                else:
                    print("No table found.")
                print("-" * 40)

    return all_results


if __name__ == '__main__':
    if not os.path.exists(IMAGE_FOLDER):
        os.makedirs(IMAGE_FOLDER)

    results = process_images(IMAGE_FOLDER)

    """ for filename, data in results.items():
        text_filename = os.path.join(IMAGE_FOLDER, f"{filename.split('.')[0]}_text.txt")
        with open(text_filename, 'w', encoding='utf-8') as f:
            f.write(data['text'])
        if data['table']:
            table_filename = os.path.join(IMAGE_FOLDER, f"{filename.split('.')[0]}_table.csv")
            df = pd.DataFrame(data['table'])
            df.to_csv(table_filename, index=False, header=False) """

Processing: Lines_Nigeria_21_Mar_24_W12_page3.png
An unexpected error occurred: cannot unpack non-iterable NoneType object


In [47]:
extracted_text, table_data = detect_text_and_tables(image_path)

An unexpected error occurred: cannot unpack non-iterable NoneType object


In [41]:
results

{'Lines_Nigeria_21_Mar_24_W12_page3.png': {'text': 'Epi Week: 12 2024\nCumulative (Week 1 - 12)\nCases\nDeaths\nLassa Fever Situation Report\nTable 3. Weekly and Cumulative number of suspected and confirmed cases for 2024\nCurrent week: (Week 12)\nCases\nDeaths\nStates\nSuspected Confirmed Trend Probable HCW* (Confirmed Cases) Suspected Confirmed Probable HCW* (Confirmed Cases)\n1 Ondo\n60\n8\n1026\n184\n3\n13\n2 Edo\n77\n4\n1171\n178\n1\n23\n3 Bauchi\n34\n4❘ Taraba\n27\n5 Benue\n29\n352\n1\n566\n123\n3\n27\n1\n190\n95\n3\n20\n2\n999\n62\n9\n8\n00\n11\n6 Ebonyi\n17\n1\n1\n213\n43\n6\n24\n7 Kogi\n2\n1\n1\n94\n28\n1\n1\n2\n8 Kaduna\n7\n97\n15\n2\n3\n8\n9 Plateau\n9\n1\n63\n9\n10 Enugu\n1\n63\n8\n1\n11 Cross River\n12\n1\n45\n7\n1\n12 Rivers\n56\n5\n3\n13 Delta\n3\n54\n4\n2\n14 Anambra\n1\n17\n4\n1\n3\n15 Nasarawa\n37\n4\n1\n1\n16 Niger\n8\n3\nN\n17 Gombe\n25\n3\n5\n1\n18 Imo\n2\n30\n3\n1\n2\n19 Jigawa\n2\n22\n2\n1\n20 Bayelsa\n1\n14\n21 Adamawa\n1\n12\n22 Fct\n1\n42\nN N N\n2\n1\n23 Kano

# 15% left

In [19]:
import cv2
import numpy as np
from PIL import Image, ImageColor
import fitz  # PyMuPDF

def hex_to_hsv(hex_color):
    rgb = ImageColor.getcolor(hex_color, "RGB")
    r, g, b = [x / 255.0 for x in rgb]
    hsv = cv2.cvtColor(
        np.uint8([[[b * 255, g * 255, r * 255]]]),
        cv2.COLOR_BGR2HSV
    )[0][0]
    return hsv

def enhance_table_lines_from_pdf_hq(pdf_path, output_path, page_number=0, dpi=300):
    """
    Enhances vertical column separators and draws horizontal lines at
    top boundary, bottom boundary, and header bottom.

    Args:
        pdf_path (str): Path to the PDF.
        output_path (str): Path to save the image (use .png).
        page_number (int): Page to process (0-indexed).
        dpi (int): Resolution for rendering the PDF page.
    """
    doc = fitz.open(pdf_path)
    page = doc[page_number]

    # 1. Render the PDF page at high DPI
    pix = page.get_pixmap(dpi=dpi) 
    img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    # Convert PIL Image to OpenCV BGR
    img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    # 2. Convert to HSV & detect green rows
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    height, width = img.shape[:2]
    
    # We'll only check the left 15% of the page's width
    left_15_width = int(width * 0.15)

    target_hsv = hex_to_hsv("#D8EDCF")
    tolerance = 15
    lower_green = np.array([max(0, target_hsv[0] - tolerance), 50, 50])
    upper_green = np.array([min(179, target_hsv[0] + tolerance), 255, 255])
    green_mask = cv2.inRange(hsv, lower_green, upper_green)

    # Sum only the left 15% columns
    green_mask_left = green_mask[:, :left_15_width]
    h_proj_green = np.sum(green_mask_left, axis=1)
    green_row_indices = np.where(h_proj_green > 0)[0]

    if len(green_row_indices) == 0:
        print("No green rows detected.")
        return

    top_boundary = green_row_indices[0]
    bottom_boundary = green_row_indices[-1]

    # 3. Header Row Detection (just above top_boundary)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    header_region = gray[:top_boundary, :]

    # Use Otsu’s threshold for the header
    _, binary_header = cv2.threshold(
        header_region, 0, 255,
        cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    )
    h_proj_header = np.sum(binary_header, axis=1)

    header_bottom = 0
    for i in range(len(h_proj_header) - 1, 0, -1):
        if h_proj_header[i] > 40:
            header_bottom = i
            break

    # 4. Adaptive Thresholding in the table region
    table_region = gray[top_boundary:bottom_boundary, :]
    thresh_table = cv2.adaptiveThreshold(
        table_region,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        3
    )

    # 5. Hough Lines to find vertical lines
    lines = cv2.HoughLinesP(thresh_table, 1, np.pi / 180,
                            threshold=775, minLineLength=10, maxLineGap=8)
    vertical_lines = []
    if lines is not None:
        for line in lines:
            x1, y1, x2, y2 = line[0]
            if abs(x2 - x1) < 5:  # near-vertical
                # Adjust back to full image coordinates
                vertical_lines.append((x1, y1 + top_boundary, x2, y2 + top_boundary))

    # 6. Draw lines on the OpenCV image
    # A) Draw the vertical lines from header_bottom to bottom_boundary
    for x1, y1, x2, y2 in vertical_lines:
        cv2.line(img, (x1, header_bottom), (x2, bottom_boundary), (100, 100, 100), 1)

    # B) Draw horizontal lines at top, bottom, and header_bottom
    cv2.line(img, (0, top_boundary), (width, top_boundary), (0, 255, 0), 2)     # green
    cv2.line(img, (0, bottom_boundary), (width, bottom_boundary), (0, 255, 0), 2) # green
    cv2.line(img, (0, header_bottom), (width, header_bottom), (0, 0, 255), 2)  # red

    # 7. Convert back to PIL and save
    output_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    output_pil.save(output_path)

    print(f"Saved enhanced table to: {output_path}")


In [20]:
for week in [1,2,3,4,12,22,26,32,42,52]:
    enhance_table_lines_from_pdf_hq(f"PDFs/W{week}.pdf", f"Lines_W{week}_boundaries.png", page_number=3, dpi=300)

Saved enhanced table to: Lines_W1_boundaries.png
Saved enhanced table to: Lines_W2_boundaries.png
Saved enhanced table to: Lines_W3_boundaries.png
Saved enhanced table to: Lines_W4_boundaries.png
Saved enhanced table to: Lines_W12_boundaries.png
Saved enhanced table to: Lines_W22_boundaries.png
Saved enhanced table to: Lines_W26_boundaries.png
Saved enhanced table to: Lines_W32_boundaries.png
Saved enhanced table to: Lines_W42_boundaries.png
Saved enhanced table to: Lines_W52_boundaries.png


In [None]:
import os
import re
from pathlib import Path

def rename_lassa_files(folder_path):
    """
    Renames 'An_update_of_Lassa_fever_outbreak_in_Nigeria_041124_45.pdf'
    to 'Nigeria_04_Nov_24_W45.pdf', extracting day=04, month=11 => 'Nov', year=24,
    and the week number 45.
    
    Args:
        folder_path (str): Path to the folder that contains the PDF files.
    """
    # For mapping month number to short name
    month_map = {
        "01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
        "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
        "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec",
    }

    folder = Path(folder_path)
    for file_path in folder.iterdir():
        if not file_path.is_file():
            continue
        if not file_path.suffix.lower() == ".pdf":
            continue
        
        old_name = file_path.name
        # Example old_name: "An_update_of_Lassa_fever_outbreak_in_Nigeria_041124_45.pdf"
        
        # 1) Split on underscores
        parts = old_name.split("_")
        # e.g. ["An","update","of","Lassa","fever","outbreak","in","Nigeria","041124","45.pdf"]
        
        if len(parts) < 9:
            # If the file name doesn't match the expected pattern, skip it
            print(f"Skipping file (unrecognized pattern): {old_name}")
            continue
        
        # 2) The date chunk is parts[8] like "041124"
        date_str = parts[8]  # "041124"
        
        # 3) The week chunk is in parts[9], but includes ".pdf" at the end, e.g. "45.pdf"
        week_str_pdf = parts[9]  # "45.pdf"
        # Remove ".pdf" from the end
        if week_str_pdf.endswith(".pdf"):
            week_str = week_str_pdf.replace(".pdf", "")
        else:
            print(f"Skipping file (no .pdf in last part): {old_name}")
            continue
        
        # 4) date_str should be 6 characters: DDMMYY
        if len(date_str) != 6:
            print(f"Skipping file (date string not 6 chars): {old_name}")
            continue
        dd = date_str[0:2]   # "04"
        mm = date_str[2:4]   # "11"
        yy = date_str[4:6]   # "24"
        
        # 5) Convert mm => month name
        month_name = month_map.get(mm, "???" )  # fallback "???"
        
        # 6) Build new name
        # e.g. "Nigeria_04_Nov_24_W45.pdf"
        new_name = f"Nigeria_{dd}_{month_name}_{yy}_W{week_str}.pdf"
        
        new_path = folder / new_name
        # 7) Rename the file
        print(f"Renaming:\n  {old_name}\n-> {new_name}\n")
        file_path.rename(new_path)

# Example usage:
if __name__ == "__main__":
    rename_lassa_files("PDFs")
