In [3]:
import json
import uuid
from pathlib import Path
from time import sleep
from typing import List, Tuple, Dict
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [4]:
# Site root (no trailing slash).
BASE_URL = "https://indiankanoon.org"

# Request headers carrying a desktop-browser User-Agent string.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/115.0 Safari/537.36",
        ]
    ),
}

In [15]:
# Build three parallel lists from the "browse laws" page:
#   statelinks             - absolute URL of each state's index page
#   statenames             - the doctype slug taken from that URL
#   statelinks_with_search - search-URL prefix per state; a year is appended later
url = "https://indiankanoon.org/browselaws/"
base_url = "https://indiankanoon.org/"

# Use the HEADERS constant from the config cell; no lower-case `headers`
# variable exists in this notebook, so referencing it fails on a fresh kernel.
response = requests.get(url, headers=HEADERS, timeout=10)
response.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(response.text, "html.parser")

# NOTE(review): the [4:-12] slice presumably drops non-state anchors at both
# ends of the table - confirm against the live page layout.
state_anchors = soup.table.find_all("a")[4:-12]

# urljoin yields a clean URL whether or not the href starts with "/",
# whereas plain concatenation with base_url can produce "//".
statelinks = [urljoin(base_url, anchor["href"]) for anchor in state_anchors]

# The slug is the last path segment before the trailing slash.
statenames = [link.split("/")[-2] for link in statelinks]
statelinks_with_search = [
    f"https://indiankanoon.org/search/?formInput=doctypes:{name}%20year:"
    for name in statenames
]
statelinks_with_search

['https://indiankanoon.org/search/?formInput=doctypes:andhra-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:arunachal-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:assam-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:bihar-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:chandigarh-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:chattisgarh-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:delhi-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:goa-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:gujarat-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:haryana-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:himachal-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:jk-act%20year:',
 'https://indiankanoon.org/search/?formInput=doctypes:jharkhand-act%20year:',
 'https://indian

In [9]:
def fetch_soup(url: str) -> BeautifulSoup:
    """Download ``url`` and return its body parsed with ``html.parser``.

    The request is sent with ``HEADERS`` and a 10-second timeout;
    ``raise_for_status()`` is called on the response before parsing.
    """
    page = requests.get(url, timeout=10, headers=HEADERS)
    page.raise_for_status()
    html = page.text
    return BeautifulSoup(html, "html.parser")


def get_years_of_union_act(url: str) -> List[Tuple[str, int]]:
    """Return ``(year, document_count)`` pairs scraped from an act index page.

    Anchors are read from the table inside the ``info_indian_kanoon`` div and
    paired, by position, with the page's ``browselist`` divs.

    Args:
        url: Address of the index page to fetch.

    Returns:
        One tuple per anchor: the year string cut from the anchor's href and
        the integer parsed from the text that follows the anchor inside the
        matching ``browselist`` div.
    """
    page = fetch_soup(url)
    index_table = page.find("div", class_="info_indian_kanoon").table
    hrefs = [anchor["href"] for anchor in index_table.find_all("a")]
    count_divs = page.find_all("div", class_="browselist")

    years = []
    for position, href in enumerate(hrefs):
        # The count is the text node right after the anchor, e.g. "(12)".
        raw_count = count_divs[position].a.next_sibling
        # href[-5:-1] takes the four characters before the final one;
        # presumably hrefs end in "YYYY/" - verify against the page.
        years.append((href[-5:-1], int(raw_count.strip().strip("()"))))
    return years


def years_to_links(years: List[Tuple[str, int]], base_url: str) -> List[str]:
    """Append each year string to ``base_url`` and return the resulting URLs.

    The count stored as the second item of every tuple is ignored.
    """
    links = []
    for year, _count in years:
        links.append(base_url + year)
    return links


def extract_links_from_page(url: str) -> List[Dict[str, str]]:
    """Collect document metadata from a results page and every page after it.

    Starting at ``url``, each page is fetched with ``fetch_soup`` and the
    "Next" link is followed until a page has none.

    Args:
        url: Address of the first results page. A falsy value yields ``[]``
            without any request.

    Returns:
        One dict per ``div.result`` with the keys ``doc_id`` (a fresh random
        UUID4 string), ``link``, ``title``, ``docsource``, ``cited_to`` and
        ``cited_by``. The two citation values are integers and default to 0
        when the corresponding ``cite_tag`` anchor is absent.

    Raises:
        Whatever ``fetch_soup`` raises for a failed request.
    """
    links = []
    while url:
        soup = fetch_soup(url)

        # find() returns None when the element is missing; treat a page without
        # a results container as having no results instead of crashing with
        # AttributeError in the middle of a long scrape.
        results_box = soup.find("div", class_="results_middle")
        results = (
            results_box.find_all("div", class_="result")
            if results_box is not None
            else []
        )

        for result in results:
            cite_tags = result.find_all("a", class_="cite_tag")
            # The number is the last whitespace-separated token of the tag text.
            cited_to = int(cite_tags[0].text.split()[-1]) if cite_tags else 0
            cited_by = int(cite_tags[1].text.split()[-1]) if len(cite_tags) > 1 else 0

            links.append({
                "doc_id": str(uuid.uuid4()),
                "link": BASE_URL + result.a["href"],
                "title": result.a.text,
                "docsource": result.span.text,
                "cited_to": cited_to,
                "cited_by": cited_by,
            })

        # Follow the "Next" link if there is one; a page without the
        # pagination bar is simply the last page.
        pager = soup.find("div", class_="bottom")
        next_page = pager.find("a", string="Next") if pager is not None else None
        url = BASE_URL + next_page["href"] if next_page is not None else None

    return links


def organize_links(links: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Map each year to the documents scraped from that year's search URL.

    The year key is the last four characters of the link. A link whose key
    has already been seen is skipped, so each year is scraped at most once.
    """
    by_year = {}
    for page_url in links:
        key = page_url[-4:]
        if key in by_year:
            continue
        by_year[key] = extract_links_from_page(page_url)
    return by_year


def attach_document_count_to_metadata(
    metadata: Dict[str, List[Dict[str, str]]],
    years: List[Tuple[str, int]]
) -> Dict[str, List[Dict[str, str]]]:
    """Store the expected per-year counts under ``metadata["document_count"]``.

    ``metadata`` is modified in place and also returned.
    """
    expected_counts = {}
    for year, count in years:
        expected_counts[year] = count
    metadata["document_count"] = expected_counts
    return metadata


def verify_document_count_in_metadata(metadata: Dict[str, List[Dict[str, str]]]) -> None:
    """Print, for every year, whether the scraped count equals the expected one.

    Expected counts are read from ``metadata["document_count"]``; a year that
    is missing there is compared against 0.
    """
    for year, docs in metadata.items():
        if year != "document_count":
            expected = metadata["document_count"].get(year, 0)
            found = len(docs)
            message = (
                f"✅ Document count matches for {year}"
                if found == expected
                else f"❌ Mismatch for {year}: expected {expected}, found {found}"
            )
            print(message)


def attach_data_to_metadata(
    metadata: Dict[str, List[Dict[str, str]]],
    years: List[str],
    max_attempts: int = 3,
    delay: float = 2,
    retry_delay: float = 3,
) -> Dict[str, List[Dict[str, str]]]:
    """Fetch the text of every document and store it under ``doc["data"]``.

    Each document page is fetched via ``fetch_soup`` and the text of its
    ``akn-akomaNtoso`` div is extracted. ``metadata`` is modified in place.

    Args:
        metadata: Mapping of year to the list of document dicts; each dict
            must provide ``"link"`` and ``"title"``.
        years: Years to process; a year missing from ``metadata`` is skipped.
        max_attempts: How many times to try each document.
        delay: Seconds to wait after every successful request.
        retry_delay: Seconds to wait after a failed request.

    Returns:
        The same ``metadata`` object. ``doc["data"]`` is the extracted text,
        or ``""`` when no attempt produced any.
    """
    for year in years:
        for doc in metadata.get(year, []):
            link = doc["link"]
            print(f"Fetching data for {doc['title']} , year: {year}, link: {link}")

            data, success = "", False
            for attempt in range(max_attempts):
                try:
                    soup = fetch_soup(link)
                    sleep(delay)  # polite delay

                    container = soup.find("div", class_="akn-akomaNtoso")
                    if container is not None:
                        data = container.get_text(" ", strip=True)
                        success = True
                        break
                    # The page loaded but has no content container: report it
                    # so this case can be told apart from a network failure.
                    print(
                        f"Attempt {attempt+1}: no 'akn-akomaNtoso' container "
                        f"for {doc['title']} ({year})"
                    )
                except Exception as e:
                    # Deliberately broad: one bad document must not abort the run.
                    print(f"Attempt {attempt+1} failed for {doc['title']} ({year}) -> {e}")
                    sleep(retry_delay)

            if not success:
                print(f"❌ Could not fetch data for {doc['title']} , year: {year}")

            doc["data"] = data

    return metadata


In [None]:
# Scrape every state in turn and write one JSON file per state.
output_dir = Path("../scraped_data")
# open(..., "w") does not create missing directories; create it up front so a
# long scrape is not lost to FileNotFoundError at save time.
output_dir.mkdir(parents=True, exist_ok=True)

# Loop-local names are used so the `url` / `base_url` globals set by the
# state-link cell are not overwritten.
for state_name, index_url, search_prefix in zip(
    statenames, statelinks, statelinks_with_search
):
    print("Processing state:", state_name)
    years = get_years_of_union_act(index_url)
    links = years_to_links(years, search_prefix)
    metadata = organize_links(links)
    attach_document_count_to_metadata(metadata, years)
    verify_document_count_in_metadata(metadata)
    attach_data_to_metadata(metadata, [year for year, _ in years])
    with open(output_dir / f"{state_name}.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=4)
    print(f"Completed processing for state: {state_name}..................................\n")