In [8]:
import os
import time
import requests
import pandas as pd
import sys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def log(msg):
    """Print *msg* on its own line and flush stdout right away.

    Flushing on every call makes progress lines show up immediately instead
    of waiting in the output buffer.
    """
    print(msg, flush=True)

# Fixed locations used by the script: the site's landing page, the
# ChromeDriver executable, and the folder that receives the downloaded PDFs.
BASE_URL = "https://lakes.hmda.gov.in/"
CHROMEDRIVER_PATH = r"C:\Users\HEMANTH ADITYA\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
SAVE_DIR = r"C:\Users\HEMANTH ADITYA\OneDrive\Desktop\H"

# Create the download folder up front; an existing folder is left as it is.
os.makedirs(SAVE_DIR, exist_ok=True)

# Start Chrome through the configured driver with a maximized window, and
# build an explicit wait with a 20-second timeout bound to that browser.
options = Options()
options.add_argument("--start-maximized")
chrome_service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=chrome_service, options=options)
wait = WebDriverWait(driver, 20)

# Load the HMDA lakes page and pause five seconds before touching it.
driver.get(BASE_URL)
time.sleep(5)

# Keep jumping to the bottom of the page until its height stops changing,
# i.e. until a scroll no longer causes more content to be appended.
last_height = driver.execute_script("return document.body.scrollHeight")
page_still_growing = True
while page_still_growing:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    page_still_growing = new_height != last_height
    last_height = new_height

# Block until at least one matching table row is present, then collect every
# matching row and report how many there are.
row_locator = (By.XPATH, "//table//tr[position()>1]")
wait.until(EC.presence_of_element_located(row_locator))
initial_rows = driver.find_elements(*row_locator)
log(f"📄 Found {len(initial_rows)} lakes.")

# Per-lake outcomes; both lists are written to CSV when the run ends.
success, failed = [], []


def _reload_listing():
    """Reopen the lake listing and scroll until the page height stops growing."""
    driver.get(BASE_URL)
    time.sleep(4)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


# True whenever the browser may no longer be showing the fully scrolled
# listing (after a click navigated away, or after any error). The listing is
# then reloaded before the next row is read.
needs_reload = False

try:
    # The row XPath already excludes the first <tr> (presumably the header),
    # so iteration starts at index 0; starting at 1 would skip the first lake.
    for i in range(len(initial_rows)):
        if needs_reload:
            try:
                _reload_listing()
            except Exception as e:
                # The browser cannot get back to the listing; nothing more
                # can be scraped, so stop instead of failing every row.
                log(f"❌ Could not reload the listing: {e}")
                log("❌ Critical error. Exiting.")
                break
            needs_reload = False

        # Placeholders so the error handler never reports an unbound name or
        # the previous lake's details when a row fails before it is parsed.
        lake_id, lake_name = "", f"row_{i+1}"

        try:
            rows = driver.find_elements(By.XPATH, "//table//tr[position()>1]")
            if i >= len(rows):
                raise Exception(f"Row {i+1} missing: only {len(rows)} rows loaded")
            cols = rows[i].find_elements(By.TAG_NAME, "td")
            if len(cols) < 9:
                log(f"⚠️ Row {i+1} skipped: only {len(cols)} columns")
                continue

            # extracting lake name and ID from the row
            lake_name = cols[4].text.strip().replace(" ", "_").replace("/", "-")
            lake_id = cols[5].text.strip().replace("/", "-")

            filename = f"{lake_id}_{lake_name}.pdf"
            filepath = os.path.join(SAVE_DIR, filename)

            # The file name depends only on the row text, so an existing file
            # is detected before clicking; the browser stays on the listing.
            if os.path.exists(filepath):
                log(f"⏭️ Already downloaded: {filename}")
                continue

            # find_elements returns an empty list when no link exists, whereas
            # find_element would raise and bypass the "no link" handling.
            ftl_links = cols[8].find_elements(By.TAG_NAME, "a")
            if not ftl_links:
                log(f"❌ No FTL link for {lake_name}")
                failed.append({"LakeID": lake_id, "LakeName": lake_name, "Error": "No FTL link"})
                continue

            log(f"\n➡️ [{i+1}] Clicking FTL for: {lake_id} - {lake_name}")
            ftl_links[0].click()
            needs_reload = True  # the click navigates away from the listing
            time.sleep(5)

            # capturing the redirected PDF URL
            current_url = driver.current_url
            log(f"🌐 Current URL after click: {current_url}")

            # checking if the URL actually points to a PDF
            if not current_url.endswith(".pdf"):
                raise Exception("❌ Not redirected to a valid PDF URL")

            # downloading the PDF using requests
            headers = {
                "Referer": BASE_URL,
                "User-Agent": "Mozilla/5.0"
            }
            r = requests.get(current_url, headers=headers, timeout=15)
            if r.status_code != 200:
                raise Exception(f"HTTP {r.status_code}")

            with open(filepath, "wb") as f:
                f.write(r.content)
            log(f"✅ Downloaded: {filename}")
            success.append({"LakeID": lake_id, "LakeName": lake_name, "PDF_URL": current_url})

        except Exception as e:
            # Record the failure and carry on with the next row; the listing
            # is reloaded first because the page state is unknown.
            log(f"❌ Failed for {lake_name}: {e}")
            failed.append({"LakeID": lake_id, "LakeName": lake_name, "Error": str(e)})
            needs_reload = True
finally:
    # Runs on normal completion, on Ctrl+C, and on unexpected errors, so the
    # browser is always closed and the progress made so far is always saved.
    try:
        driver.quit()
    finally:
        pd.DataFrame(success).to_csv("hmda_success_final.csv", index=False)
        pd.DataFrame(failed).to_csv("hmda_failed_final.csv", index=False)
        log(f"\n🎯 Done. ✅ {len(success)} downloaded, ❌ {len(failed)} failed.")


📄 Found 2944 lakes.

➡️ [2300] Clicking FTL for: 1700-EEN-11 - Katwal_Kunta
🌐 Current URL after click: https://lakes.hmda.gov.in/hmdalake/1700-EEN-11%20Katwal%20Kunta_FTL.pdf
✅ Downloaded: 1700-EEN-11_Katwal_Kunta.pdf

➡️ [2301] Clicking FTL for: 2015 - Antappa_Cheruvu
🌐 Current URL after click: https://lakes.hmda.gov.in/hmdalake/2015Antappa%20Cheruvu_FTL.pdf
✅ Downloaded: 2015_Antappa_Cheruvu.pdf

➡️ [2302] Clicking FTL for: 2815 - Kothinaru_Cheruvu
🌐 Current URL after click: https://lakes.hmda.gov.in/hmdalake/2815_Kothinaru%20Cheruvu_FTL.pdf
✅ Downloaded: 2815_Kothinaru_Cheruvu.pdf

➡️ [2303] Clicking FTL for: 2864 - Saravani_Kunta
🌐 Current URL after click: https://lakes.hmda.gov.in/hmdalake/2864Saravani%20Kunta_FTL.pdf
✅ Downloaded: 2864_Saravani_Kunta.pdf

➡️ [2304] Clicking FTL for: 2836 - Kotha_Cheruvu
🌐 Current URL after click: https://lakes.hmda.gov.in/hmdalake/2836Kotha%20Cheruvu_FTL.pdf
✅ Downloaded: 2836_Kotha_Cheruvu.pdf

➡️ [2305] Clicking FTL for: 2837 - Chenna_Cheruvu
🌐