In [1]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from typing import List, Tuple, Dict
import uuid
import json

BASE_URL = "https://indiankanoon.org"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0 Safari/537.36"
    )
}


def fetch_soup(url: str) -> BeautifulSoup:
    """Fetch and parse HTML from a URL."""
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def get_years_of_union_act(url: str) -> List[Tuple[str, int]]:
    """Extract years and document counts from the union act index page."""
    soup = fetch_soup(url)
    mainbody = soup.find("div", class_="info_indian_kanoon")
    a_tags = mainbody.table.find_all("a")
    links = [a['href'] for a in a_tags]

    count_of_docs = soup.find_all("div", class_="browselist")
    years = [
        (links[i][-5:-1], int(count_of_docs[i].a.next_sibling.strip().strip("()")))
        for i in range(len(links))
    ]
    return years


def years_to_links(years: List[Tuple[str, int]], base_url: str) -> List[str]:
    """Convert years to full links."""
    return [base_url + year for year, _ in years]


def extract_links_from_page(url: str) -> List[Dict[str, str]]:
    """Extract metadata of documents from a paginated results page."""
    links = []
    while url:
        soup = fetch_soup(url)
        results = soup.find("div", class_="results_middle").find_all("div", class_="result")

        for result in results:
            cite_tags = result.find_all("a", class_="cite_tag")
            cited_to = int(cite_tags[0].text.split()[-1]) if cite_tags else 0
            cited_by = int(cite_tags[1].text.split()[-1]) if len(cite_tags) > 1 else 0

            links.append({
                "doc_id": str(uuid.uuid4()),
                "link": BASE_URL + result.a["href"],
                "title": result.a.text,
                "docsource": result.span.text,
                "cited_to": cited_to,
                "cited_by": cited_by,
            })

        # Check if there's a "Next" page
        next_page = soup.find("div", class_="bottom").find("a", string="Next")
        url = BASE_URL + next_page['href'] if next_page else None

    return links


def organize_links(links: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Organize document links by year."""
    organized_links = {}
    for link in links:
        year = link[-4:]
        if year not in organized_links:
            organized_links[year] = extract_links_from_page(link)
    return organized_links


def attach_document_count_to_metadata(
    metadata: Dict[str, List[Dict[str, str]]],
    years: List[Tuple[str, int]]
) -> Dict[str, List[Dict[str, str]]]:
    """Attach document counts to metadata for verification."""
    metadata["document_count"] = {year: count for year, count in years}
    return metadata


def verify_document_count_in_metadata(metadata: Dict[str, List[Dict[str, str]]]) -> None:
    """Verify if document counts match expected counts."""
    for year in metadata:
        if year == "document_count":
            continue
        expected = metadata["document_count"].get(year, 0)
        found = len(metadata[year])
        if found == expected:
            print(f"✅ Document count matches for {year}")
        else:
            print(f"❌ Mismatch for {year}: expected {expected}, found {found}")


def attach_data_to_metadata(
    metadata: Dict[str, List[Dict[str, str]]],
    years: List[str]
) -> Dict[str, List[Dict[str, str]]]:
    """Fetch full document text for each case and attach it to metadata."""
    for year in years:
        for doc in metadata.get(year, []):
            link = doc["link"]
            print(f"Fetching data for {doc['title']} , year: {year}, link: {link}")

            data, success = "", False
            for attempt in range(3):  # try max 3 times
                try:
                    response = requests.get(link, headers=HEADERS, timeout=10)
                    response.raise_for_status()
                    sleep(2)  # polite delay

                    soup = BeautifulSoup(response.text, "html.parser")
                    container = soup.find("div", class_="akn-akomaNtoso")
                    if container:
                        data = container.get_text(" ", strip=True)
                        success = True
                        break
                except Exception as e:
                    print(f"Attempt {attempt+1} failed for {doc['title']} ({year}) -> {e}")
                    sleep(3)

            if not success:
                print(f"❌ Could not fetch data for {doc['title']} , year: {year}")

            doc["data"] = data

    return metadata


In [2]:
base_url="https://indiankanoon.org/search/?formInput=doctypes:un-convention%20year:"
url = "https://indiankanoon.org/browse/un-convention/"
years = get_years_of_union_act(url)
links = years_to_links(years, base_url)
metadata = organize_links(links)
attach_document_count_to_metadata(metadata, years)
verify_document_count_in_metadata(metadata)

✅ Document count matches for 1945
✅ Document count matches for 1965
✅ Document count matches for 1967
✅ Document count matches for 1976
✅ Document count matches for 1981
✅ Document count matches for 1990
✅ Document count matches for 2000
✅ Document count matches for 2008
✅ Document count matches for 2010


In [3]:
attach_data_to_metadata(metadata, [year for year, _ in years])

Fetching data for Statute Of The International Court Of Justice , year: 1945, link: https://indiankanoon.org/doc/126255256/
Fetching data for International Convention on the Elimination of All Forms of Racial Discrimination , year: 1965, link: https://indiankanoon.org/doc/135354168/
Fetching data for International Covenant on Civil and Political Rights , year: 1967, link: https://indiankanoon.org/doc/8475620/
Fetching data for International Covenant on Economic, Social and Cultural Rights , year: 1976, link: https://indiankanoon.org/doc/228316/
Fetching data for Convention on the Elimination of All Forms of Discrimination against Women , year: 1981, link: https://indiankanoon.org/doc/188255737/
Fetching data for Convention on the Rights of the Child , year: 1990, link: https://indiankanoon.org/doc/170937024/
Fetching data for Optional Protocol to the Convention on the Rights of the Child on the involvement of children in armed conflict , year: 2000, link: https://indiankanoon.org/doc/9

{'1945': [{'doc_id': '1e541098-a9c3-4974-8f93-b9992d9b9147',
   'link': 'https://indiankanoon.org/doc/126255256/',
   'title': 'Statute Of The International Court Of Justice',
   'docsource': 'United Nations Conventions',
   'cited_to': 0,
   'cited_by': 0,
   'data': "Article 1. The International Court of Justice established by the Charter of the United Nations as the principal judicial organ of the United Nations shall be constituted and shall function in accordance with the provisions of the present\xa0Statute. Chapter 1 ORGANIZATION OF THE COURT Article 2. The Court shall be composed of a body of independent judges, elected regardless of their nationality from among persons of high moral character, who possess the qualifications required in their respective countries for appointment to the highest judicial offices, or are jurisconsults of recognized competence in international law. Article 3: (1) The Court shall consist of fifteen Members, no two of whom may be nationals of the sam

In [4]:
with open("./united_nations_convection.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)