# In [None]:
# Robust, Resumable, Parallel PDF Scraper with Full Metadata Extraction and Logging

import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import urllib3
from tqdm import tqdm
import random
import string
import shutil

# Disable SSL warnings
# The HTTP requests in this script are issued with verify=False (certificate
# checks off); without this call urllib3 would emit an InsecureRequestWarning
# for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Constants

# Target site and the request headers sent with every call.
BASE_URL = "https://eparlib.nic.in"
START_URL = BASE_URL + "/handle/123456789/9"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Listing pages covered by this run (the scraping loop treats both ends as
# inclusive).
START_PAGE = 701
END_PAGE = 702

# Run identity: five random upper-case letters/digits, drawn anew on every
# execution, so each execution gets its own tag and output folder.
# NOTE(review): because the folder name changes per run, the "already
# scraped" checks further down only see earlier data if a previous folder
# name is reused - confirm whether resuming across runs is intended.
RUN_ID = "".join(random.choices(string.ascii_uppercase + string.digits, k=5))
RUN_TAG = "p{}-{}_{}".format(START_PAGE, END_PAGE, RUN_ID)

# Output locations, all derived from the run tag.
OUTPUT_DIR = "scrape_output_" + RUN_TAG
PDF_DIR = os.path.join(OUTPUT_DIR, "pdfs_" + RUN_TAG)
CSV_FILE = os.path.join(OUTPUT_DIR, "metadata_" + RUN_TAG + ".csv")
XLSX_FILE = os.path.join(OUTPUT_DIR, "metadata_" + RUN_TAG + ".xlsx")
LOG_FILE = os.path.join(OUTPUT_DIR, "errors_" + RUN_TAG + ".log")

# Setup
# Make sure both the run folder and its PDF sub-folder exist.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)

# First use of this folder: seed the metadata CSV with a header-only file so
# that later appends (written without a header) line up with these columns.
if not os.path.exists(CSV_FILE):
    empty_df = pd.DataFrame(
        columns=[
            "Page",
            "Row",
            "Date",
            "Title",
            "Type",
            "PDF File Name",
            "Language",
            "Members",
            "Session",
            "Ministry",
            "Starred Type",
            "Question No",
            "Member Name",
            "Session No",
            "Part No",
            "Full Type",
            "Author",
            "Error",
        ]
    )
    empty_df.to_csv(CSV_FILE, index=False)

# Load existing metadata and downloaded files
# Titles already present in the CSV and file names already present in the PDF
# folder are what the row processor consults to skip repeated work.
existing_df = pd.read_csv(CSV_FILE)
existing_titles = {
    recorded_title.strip()
    for recorded_title in existing_df["Title"].dropna().astype(str)
}
downloaded_files = set(os.listdir(PDF_DIR))

# Log function
def log_error(msg):
    """Append one message as a line to the run's error log (``LOG_FILE``).

    Args:
        msg: Text to record; a newline is appended after it.
    """
    # UTF-8 is requested explicitly: messages can carry non-ASCII text (page
    # titles, exception text), and relying on the platform default encoding
    # can raise UnicodeEncodeError while trying to report another error.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(f"{msg}\n")

# Check if file is a valid PDF
def is_valid_pdf(file_path):
    """Tell whether the file at ``file_path`` starts with the PDF signature.

    Only the first five bytes are examined. Any failure to open or read the
    file is reported as ``False`` rather than raised.

    Args:
        file_path: Location of the file to inspect.

    Returns:
        True when the leading bytes are ``%PDF-``, otherwise False.
    """
    pdf_signature = b"%PDF-"
    try:
        with open(file_path, "rb") as handle:
            leading_bytes = handle.read(len(pdf_signature))
    except Exception:
        return False
    return leading_bytes == pdf_signature

# Extract field from metadata table
def extract_field(soup, label):
    """Return the text of the cell that follows a metadata label cell.

    Finds the first ``<td class="metadataFieldLabel">`` whose string contains
    ``label`` and returns the whitespace-stripped text of the next ``<td>``.

    Args:
        soup: Parsed detail page to search.
        label: Text looked for inside the label cell (substring match).

    Returns:
        The value text, or ``""`` when no label cell matches, no value cell
        follows it, or the lookup fails.
    """
    # NOTE(review): matching is by substring, so a short label such as
    # "Member" also matches a cell reading "Members" if that cell comes
    # first - confirm against the real page labels before tightening.
    try:
        label_cell = soup.find(
            "td",
            class_="metadataFieldLabel",
            string=lambda text: text and label in text,
        )
        if not label_cell:
            return ""
        value_cell = label_cell.find_next("td")
        if value_cell is None:
            return ""
        return value_cell.get_text(strip=True)
    except Exception:
        # Best-effort extraction: a malformed page yields a blank field.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a running scrape can still be stopped.
        return ""

# Process a single row

# CSV column -> label text searched for on the item's detail page.
# Insertion order matters: records are appended to the CSV without a header,
# so these must stay in the same order as the CSV's header columns.
_DETAIL_FIELD_LABELS = {
    "Language": "Language",
    "Members": "Members",
    "Session": "Session",
    "Ministry": "Ministry",
    "Starred Type": "Starred",
    "Question No": "Question No",
    "Member Name": "Member",
    "Session No": "Session No",
    "Part No": "Part No",
    "Full Type": "Type",
    "Author": "Author",
}


def _download_pdf(file_url, path):
    """Fetch ``file_url`` and keep it at ``path`` only if it is a PDF.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The response is not declared as a PDF, or the saved
            file does not start with the PDF signature.
    """
    r = requests.get(file_url, headers=HEADERS, verify=False, timeout=15)
    r.raise_for_status()
    # Compare the media type only: servers may append parameters such as
    # "; charset=UTF-8", and the value is case-insensitive.
    media_type = r.headers.get("Content-Type", "").split(";")[0].strip().lower()
    if media_type != "application/pdf":
        raise ValueError("Not a PDF")
    with open(path, "wb") as f:
        f.write(r.content)
    if not is_valid_pdf(path):
        # Remove the bad file so a later listing of the PDF folder does not
        # count it as an already-downloaded document.
        try:
            os.remove(path)
        except OSError:
            pass  # best effort; the validation error below is what matters
        raise ValueError("Corrupted or partial PDF")


def _append_metadata_row(metadata):
    """Append one record to the metadata CSV (no header line)."""
    pd.DataFrame([metadata]).to_csv(CSV_FILE, mode="a", header=False, index=False)


def process_row(page_num, row_num, row):
    """Scrape one listing row: fetch its detail page, save its PDF, log it.

    A row is expected to hold exactly four cells (date, title, type, link).
    Rows with another cell count, or whose title is already in
    ``existing_titles``, are skipped without writing anything. Otherwise the
    detail page is fetched, the first download link whose URL does not
    contain "hindi" is saved to ``PDF_DIR`` (unless a file of that name is
    already known), and one record is appended to the metadata CSV.

    Any failure is written to the error log and recorded as a CSV row whose
    "Error" column holds the exception text, carrying whatever basic fields
    had been parsed before the failure.

    Args:
        page_num: Listing page number, stored in the "Page" column.
        row_num: Index of the row on that page, stored in the "Row" column.
        row: Parsed ``<tr>`` element of the listing table.

    Returns:
        The metadata dict on success; ``None`` when the row is skipped or
        an error was recorded.
    """
    # Initialised up front so the error record can report what was parsed
    # before a failure without inspecting locals().
    date = title = type_ = file_name = ""
    try:
        tds = row.find_all("td")
        if len(tds) != 4:
            return None

        date = tds[0].text.strip()
        title = tds[1].text.strip()
        type_ = tds[2].text.strip()
        view_link = urljoin(BASE_URL, tds[3].find("a")["href"])

        if title in existing_titles:
            return None

        res = requests.get(view_link, headers=HEADERS, verify=False, timeout=10)
        # An HTTP error page must not be parsed as if it were the detail
        # page, which would yield a blank record with no error noted.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        for a in soup.select("a.btn.btn-primary"):
            href = a.get("href", "")
            if "hindi" in href.lower():
                continue
            file_url = urljoin(BASE_URL, href)
            file_name = os.path.basename(file_url)
            if file_name not in downloaded_files:
                _download_pdf(file_url, os.path.join(PDF_DIR, file_name))
                downloaded_files.add(file_name)
            # Only the first non-Hindi link is considered.
            break

        metadata = {
            "Page": page_num,
            "Row": row_num,
            "Date": date,
            "Title": title,
            "Type": type_,
            "PDF File Name": file_name,
            **{
                column: extract_field(soup, label)
                for column, label in _DETAIL_FIELD_LABELS.items()
            },
            "Error": "",
        }
        _append_metadata_row(metadata)
        return metadata

    except Exception as e:
        log_error(f"Page {page_num}, Row {row_num}: {e}")
        _append_metadata_row(
            {
                "Page": page_num,
                "Row": row_num,
                "Date": date,
                "Title": title,
                "Type": type_,
                "PDF File Name": file_name,
                **dict.fromkeys(_DETAIL_FIELD_LABELS, ""),
                "Error": str(e),
            }
        )
        return None

# Main scraping loop
def scrape_pages(start_page, end_page):
    """Scrape every listing page from ``start_page`` to ``end_page`` inclusive.

    Each page is requested at ``offset = page_num * 20``; its table rows
    (minus the header row) are handed to ``process_row`` on a pool of five
    worker threads. The loop stops early at the first page that has no data
    rows. A page that fails to load is logged and the loop moves on to the
    next page.

    Args:
        start_page: First page number to fetch.
        end_page: Last page number to fetch (inclusive).
    """
    for page_num in range(start_page, end_page + 1):
        url = f"{START_URL}?offset={page_num * 20}"
        print(f"\n🔄 Processing Page {page_num}: {url}")
        try:
            res = requests.get(url, headers=HEADERS, verify=False, timeout=10)
            # Without this check an HTTP error page parses to zero rows and
            # would be mistaken for the end of the listing, ending the run.
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "html.parser")
            rows = soup.select("table.panel.table.table-bordered.table-hover tr")[1:]

            if not rows:
                print("✅ No more rows on this page.")
                break

            with ThreadPoolExecutor(max_workers=5) as executor:
                # Map each future back to its row index for error reporting.
                futures = {
                    executor.submit(process_row, page_num, i, row): i
                    for i, row in enumerate(rows)
                }
                for future in tqdm(as_completed(futures), total=len(futures), desc=f"Page {page_num}"):
                    # Retrieve the result so an exception that escaped the
                    # worker is reported instead of vanishing with the future.
                    try:
                        future.result()
                    except Exception as e:
                        log_error(f"Page {page_num}, Row {futures[future]}: worker failed: {e}")

        except Exception as e:
            log_error(f"Page {page_num} failed: {e}")

# Run scraper
scrape_pages(START_PAGE, END_PAGE)

# Save Excel version
pd.read_csv(CSV_FILE).to_excel(XLSX_FILE, index=False)

# Zip final folder
shutil.make_archive(OUTPUT_DIR, 'zip', OUTPUT_DIR)

print(f"\n✅ Scraping completed. All files saved in folder '{OUTPUT_DIR}' and zipped as '{OUTPUT_DIR}.zip'")

# Offer the archive as a browser download when running inside Google Colab.
# The import is guarded so that running anywhere else merely skips the
# download instead of ending an otherwise finished run with an ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"ℹ️ google.colab not available; skipping browser download of '{OUTPUT_DIR}.zip'.")
else:
    files.download(f"{OUTPUT_DIR}.zip")