# Final Project

## Creating an auto-updating website with the weekly dengue data from the Ministry of Health, Sri Lanka: https://www.epid.gov.lk/weekly-epidemiological-report/weekly-epidemiological-report

In [1]:
#scraping the data from the website of Ministry of Health, Sri Lanka
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urlparse

def get_latest_weekly_pdf():
    """Return the URL of the first listed Weekly Epidemiological Report PDF.

    Downloads the report index page, takes the first ``<h4>`` whose text
    starts with "Weekly Epidemiological Report" (the page is assumed to list
    the most recent week first), moves from its enclosing
    ``div.content-title`` to the sibling ``div.content``, and reads the
    ``href`` of the first ``a.btn`` link inside it.

    Returns:
        str: Absolute URL of the PDF. A relative ``href`` is resolved
        against the index page URL.

    Raises:
        requests.RequestException: If the index page cannot be fetched
            (network error, timeout, or HTTP error status).
        RuntimeError: If the expected page structure (section header,
            content div, or PDF link) is not found.
    """
    # Local import: the file-level import only brings in `urlparse`.
    from urllib.parse import urljoin

    url = "https://www.epid.gov.lk/weekly-epidemiological-report/weekly-epidemiological-report"
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'html.parser')

    # 1. Find all section headers starting with "Weekly Epidemiological Report"
    section_headers = soup.find_all('h4', string=lambda text: text and text.startswith("Weekly Epidemiological Report"))
    if not section_headers:
        raise RuntimeError("No weekly report sections found")

    # 2. Take the first section (assumed to be the most recent week)
    section_header = section_headers[0]

    # 3. The corresponding content div is the next sibling of the title div.
    #    find_parent() returns None when the header is not wrapped as expected,
    #    so check it before chaining the sibling lookup.
    title_div = section_header.find_parent('div', class_='content-title')
    content_div = None
    if title_div is not None:
        content_div = title_div.find_next_sibling('div', class_='content')
    if content_div is None:
        raise RuntimeError("Could not find content div for the latest section")

    # 4. Find the first PDF link inside this content div
    link_tag = content_div.select_one('a.btn')
    href = link_tag.get('href') if link_tag is not None else None
    if not href:
        raise RuntimeError("No PDF link found in the section")

    # Resolve relative links; an absolute href passes through unchanged.
    latest_pdf_url = urljoin(url, href)
    return latest_pdf_url

# Example usage: look up the latest report and download it next to the notebook
latest_pdf = get_latest_weekly_pdf()
print("Latest weekly report PDF URL:", latest_pdf)

# Extract the exact filename from the URL path
# (handles names like en_68f5d334eb339_Vol_52_no_36-english.pdf)
parsed = urlparse(latest_pdf)
filename = os.path.basename(parsed.path)
if not filename:
    # A URL ending in "/" has no basename; open("") would fail obscurely.
    raise ValueError(f"Could not derive a filename from URL: {latest_pdf}")

# Download the PDF with the exact filename. raise_for_status() makes an HTTP
# error fail loudly instead of saving the error page's body as the "PDF".
pdf_response = requests.get(latest_pdf, timeout=60)
pdf_response.raise_for_status()
pdf_data = pdf_response.content
with open(filename, "wb") as f:
    f.write(pdf_data)

print("Saved as:", filename)


Latest weekly report PDF URL: https://www.epid.gov.lk/storage/post/pdfs/en_68f5d334eb339_Vol_52_no_36-english.pdf
Saved as: en_68f5d334eb339_Vol_52_no_36-english.pdf


In [2]:
#reading the pdf and extracting the data from the table into a dataframe
#%pip install --upgrade natural-pdf

from natural_pdf import PDF

# Parse the file that the download cell just saved (`filename`) rather than a
# hard-coded name, so a newly published report is the one that gets read.
pdf = PDF(filename)
page = pdf.pages[2]  # third page of the report; "Table 1" is expected here

# Locate the table heading first so a layout change produces a clear error
# instead of an AttributeError on None.
table_heading = page.find("text:contains(Table 1)")
if table_heading is None:
    raise ValueError(f"'Table 1' heading not found on page 3 of {filename}")

# Everything below the heading, stopping before the "Page 3" footer text.
region = table_heading.below(until='text:contains(Page 3)', include_endpoint=False)

# Drop the first and last extracted rows
# (presumably a sub-header row and a totals row; TODO confirm against the PDF).
df = region.rotate(90).extract_table().to_df().iloc[1:-1].reset_index(drop=True)
df


Unnamed: 0,RDHS,Dengue Fever,None,Dysentery,None.1,Encephalitis,None.2,En. Fever,None.3,F. Poisoning,...,Chickenpox,None.4,Meningitis,None.5,Leishmania-,None.6,Tuberculosis,None.7,WRCD,None.8
0,Colombo,104,8504,0,24,0,11,0,11,0,...,2,411,1,53,0,3,38,1381,100,100
1,Gampaha,63,5573,4,38,0,25,0,3,0,...,14,617,1,126,1,35,32,813,100,100
2,Kalutara,40,1834,1,31,0,6,1,18,0,...,14,645,0,37,0,2,11,387,100,98
3,Kandy,94,3320,4,42,0,3,0,7,0,...,18,400,2,20,9,59,8,454,100,100
4,Matale,11,924,0,20,0,2,1,1,0,...,5,99,0,8,7,209,3,108,100,100
5,Nuwara Eliya,5,253,3,68,0,6,1,5,2,...,11,214,4,28,0,0,3,197,100,100
6,Galle,37,1556,6,39,0,4,2,7,3,...,29,558,6,123,0,3,8,352,100,100
7,Hambantota,18,701,5,34,0,5,0,0,1,...,9,246,2,20,14,216,0,107,100,100
8,Matara,26,1236,2,14,0,2,0,1,0,...,8,298,1,33,1,77,2,119,100,100
9,Jaffna,12,916,3,74,0,2,0,15,1,...,1,259,1,22,0,0,4,154,100,93


In [3]:
# Snapshot the DataFrame's column labels into a plain, mutable Python list
list_of_columns = [*df.columns]


In [4]:
#Renaming the columns
# In the extracted table each named column is followed by an unnamed (None)
# column; the unnamed one is labelled "<previous name> Cumulative".
for i, name in enumerate(list_of_columns):
    if name is None:
        if i == 0:
            # Index i-1 would wrap around to the LAST column and silently
            # borrow its name, so fail explicitly instead.
            raise ValueError("First column has no name; cannot derive a 'Cumulative' label")
        list_of_columns[i] = list_of_columns[i-1] + " Cumulative"

df.columns = list_of_columns
df

Unnamed: 0,RDHS,Dengue Fever,Dengue Fever Cumulative,Dysentery,Dysentery Cumulative,Encephalitis,Encephalitis Cumulative,En. Fever,En. Fever Cumulative,F. Poisoning,...,Chickenpox,Chickenpox Cumulative,Meningitis,Meningitis Cumulative,Leishmania-,Leishmania- Cumulative,Tuberculosis,Tuberculosis Cumulative,WRCD,WRCD Cumulative
0,Colombo,104,8504,0,24,0,11,0,11,0,...,2,411,1,53,0,3,38,1381,100,100
1,Gampaha,63,5573,4,38,0,25,0,3,0,...,14,617,1,126,1,35,32,813,100,100
2,Kalutara,40,1834,1,31,0,6,1,18,0,...,14,645,0,37,0,2,11,387,100,98
3,Kandy,94,3320,4,42,0,3,0,7,0,...,18,400,2,20,9,59,8,454,100,100
4,Matale,11,924,0,20,0,2,1,1,0,...,5,99,0,8,7,209,3,108,100,100
5,Nuwara Eliya,5,253,3,68,0,6,1,5,2,...,11,214,4,28,0,0,3,197,100,100
6,Galle,37,1556,6,39,0,4,2,7,3,...,29,558,6,123,0,3,8,352,100,100
7,Hambantota,18,701,5,34,0,5,0,0,1,...,9,246,2,20,14,216,0,107,100,100
8,Matara,26,1236,2,14,0,2,0,1,0,...,8,298,1,33,1,77,2,119,100,100
9,Jaffna,12,916,3,74,0,2,0,15,1,...,1,259,1,22,0,0,4,154,100,93


In [5]:
# Write df to Epid.csv, omitting the DataFrame's row-index column
df.to_csv(path_or_buf='Epid.csv', index=False)