In [1]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from typing import List, Tuple, Dict
import uuid
import json

# Site root; prepended to the relative hrefs scraped from result and pagination links.
BASE_URL = "https://indiankanoon.org"
# Request headers sent with every requests.get call in this notebook.
# NOTE(review): presumably a browser-like User-Agent is needed for the site to
# serve the regular HTML pages — confirm.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0 Safari/537.36"
    )
}


def fetch_soup(url: str) -> BeautifulSoup:
    """Download ``url`` and return its HTML parsed with ``html.parser``.

    The request is sent with the notebook-wide ``HEADERS`` and a 10 second
    timeout. A non-success HTTP status raises via ``raise_for_status()``
    before any parsing happens.
    """
    page = requests.get(url, timeout=10, headers=HEADERS)
    page.raise_for_status()
    html = page.text
    return BeautifulSoup(html, "html.parser")


def get_years_of_union_act(url: str) -> List[Tuple[str, int]]:
    """Read ``(year, document_count)`` pairs from the browse index page at ``url``.

    Years come from the hrefs of the anchors inside the table of the
    ``info_indian_kanoon`` div; counts come from the ``browselist`` divs.
    The two sequences are paired by position.

    Returns:
        One ``(year, count)`` tuple per anchor, in page order. ``year`` is a
        string slice of the href, ``count`` is an int.
    """
    page = fetch_soup(url)
    index_container = page.find("div", class_="info_indian_kanoon")
    hrefs = [anchor["href"] for anchor in index_container.table.find_all("a")]
    browse_entries = page.find_all("div", class_="browselist")

    year_counts = []
    for position, href in enumerate(hrefs):
        # Four characters preceding the href's last character
        # (presumably hrefs look like ".../YYYY/" — confirm against the page).
        year = href[-5:-1]
        # The count is the text node right after the entry's first anchor,
        # stripped of whitespace and parentheses.
        count_text = browse_entries[position].a.next_sibling
        year_counts.append((year, int(count_text.strip().strip("()"))))
    return year_counts


def years_to_links(years: List[Tuple[str, int]], base_url: str) -> List[str]:
    """Build one link per entry by appending the year string to ``base_url``.

    The count in each ``(year, count)`` pair is ignored; input order is kept.
    """
    full_links = []
    for year, _count in years:
        full_links.append(base_url + year)
    return full_links


def extract_links_from_page(url: str) -> List[Dict[str, str]]:
    """Collect one metadata record per search result, following "Next" links.

    Starting at ``url``, each page's ``result`` divs (inside the
    ``results_middle`` div) are turned into dicts with the keys ``doc_id``
    (a fresh UUID4 string), ``link``, ``title``, ``docsource``, ``cited_to``
    and ``cited_by``. The two citation counts are ints and default to 0 when
    the corresponding ``cite_tag`` anchor is absent.

    A falsy ``url`` yields an empty list without any request being made.
    """
    documents = []
    while url:
        page = fetch_soup(url)
        result_divs = page.find("div", class_="results_middle").find_all("div", class_="result")

        for entry in result_divs:
            citations = entry.find_all("a", class_="cite_tag")

            # The last whitespace-separated token of each cite tag is the number.
            cited_to = 0
            if citations:
                cited_to = int(citations[0].text.split()[-1])
            cited_by = 0
            if len(citations) > 1:
                cited_by = int(citations[1].text.split()[-1])

            record = {
                "doc_id": str(uuid.uuid4()),
                "link": BASE_URL + entry.a["href"],
                "title": entry.a.text,
                "docsource": entry.span.text,
                "cited_to": cited_to,
                "cited_by": cited_by,
            }
            documents.append(record)

        # Continue with the page behind the "Next" anchor, or stop if there is none.
        next_anchor = page.find("div", class_="bottom").find("a", string="Next")
        if next_anchor:
            url = BASE_URL + next_anchor['href']
        else:
            url = None

    return documents


def organize_links(links: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Scrape each search link once and group the results under its year.

    The year key is the last four characters of the link. If two links share
    the same key, only the first one is scraped.
    """
    by_year = {}
    for search_url in links:
        year_key = search_url[-4:]
        if year_key in by_year:
            continue
        by_year[year_key] = extract_links_from_page(search_url)
    return by_year


def attach_document_count_to_metadata(
    metadata: Dict[str, List[Dict[str, str]]],
    years: List[Tuple[str, int]]
) -> Dict[str, List[Dict[str, str]]]:
    """Store the expected per-year counts under ``metadata["document_count"]``.

    ``metadata`` is modified in place and also returned. The stored value is
    a ``{year: count}`` dict built from ``years``; a later pair with the same
    year overwrites an earlier one.
    """
    counts_by_year = {}
    for year, count in years:
        counts_by_year[year] = count
    metadata["document_count"] = counts_by_year
    return metadata


def verify_document_count_in_metadata(metadata: Dict[str, List[Dict[str, str]]]) -> None:
    """Print, for each year, whether the scraped count equals the expected count.

    Expected counts are read from ``metadata["document_count"]``; a year that
    is missing there is treated as expecting 0 documents. The
    ``"document_count"`` entry itself is skipped.
    """
    for year, documents in metadata.items():
        if year != "document_count":
            # Looked up per year (not hoisted) so an empty dict never touches the key.
            expected = metadata["document_count"].get(year, 0)
            found = len(documents)
            if found != expected:
                print(f"❌ Mismatch for {year}: expected {expected}, found {found}")
            else:
                print(f"✅ Document count matches for {year}")


def attach_data_to_metadata(
    metadata: Dict[str, List[Dict[str, str]]],
    years: List[str]
) -> Dict[str, List[Dict[str, str]]]:
    """Download each document's text and store it under ``doc["data"]``.

    For every year in ``years`` that is present in ``metadata``, each
    document's ``link`` is requested up to three times. The text of the
    ``akn-akomaNtoso`` div (joined with single spaces) is stored on success;
    if no attempt yields that div, ``doc["data"]`` is set to ``""`` and a
    failure line is printed. ``metadata`` is modified in place and returned.
    """
    max_attempts = 3
    for year in years:
        for doc in metadata.get(year, []):
            link = doc["link"]
            print(f"Fetching data for {doc['title']} , year: {year}, link: {link}")

            text = ""
            for attempt in range(max_attempts):
                try:
                    page_response = requests.get(link, headers=HEADERS, timeout=10)
                    page_response.raise_for_status()
                    sleep(2)  # pause after every successful request

                    parsed = BeautifulSoup(page_response.text, "html.parser")
                    container = parsed.find("div", class_="akn-akomaNtoso")
                    if container:
                        text = container.get_text(" ", strip=True)
                        break
                except Exception as e:
                    # Best effort: report the failure, wait, then try again.
                    print(f"Attempt {attempt+1} failed for {doc['title']} ({year}) -> {e}")
                    sleep(3)
            else:
                # Reached only when no attempt hit the break above.
                print(f"❌ Could not fetch data for {doc['title']} , year: {year}")

            doc["data"] = text

    return metadata


In [2]:
# Search-URL prefix; years_to_links appends each year string to it.
base_url="https://indiankanoon.org/search/?formInput=doctypes:treaty-act%20year:"
# Browse index page from which the (year, document count) pairs are read.
url = "https://indiankanoon.org/browse/treaty-act/"
years = get_years_of_union_act(url)
links = years_to_links(years, base_url)
# One paginated crawl per link; results are keyed by the last four characters of the link.
metadata = organize_links(links)
# Adds a "document_count" entry to metadata (mutated in place), then prints a
# per-year comparison of the scraped count against the count listed on the index page.
attach_document_count_to_metadata(metadata, years)
verify_document_count_in_metadata(metadata)

✅ Document count matches for 1800
✅ Document count matches for 1883
✅ Document count matches for 1886
✅ Document count matches for 1939
✅ Document count matches for 1960
✅ Document count matches for 1963
✅ Document count matches for 1971
✅ Document count matches for 1972
✅ Document count matches for 1975
✅ Document count matches for 1981
✅ Document count matches for 1992
✅ Document count matches for 1993
✅ Document count matches for 1994
✅ Document count matches for 1996
✅ Document count matches for 1998
✅ Document count matches for 1999
✅ Document count matches for 2000
✅ Document count matches for 2002
✅ Document count matches for 2003
✅ Document count matches for 2004
✅ Document count matches for 2005
✅ Document count matches for 2006
✅ Document count matches for 2007
✅ Document count matches for 2008
✅ Document count matches for 2009
✅ Document count matches for 2010
✅ Document count matches for 2011
✅ Document count matches for 2012
✅ Document count matches for 2013
✅ Document cou

In [3]:
# Downloads the text of every document and stores it under doc["data"]; metadata
# is mutated in place. The function also returns metadata, so as the cell's last
# expression the whole dict is echoed as the cell output.
attach_data_to_metadata(metadata, [year for year, _ in years])

Fetching data for Agreement between the Government of the Republic of India and the Government of the Russian Federation for the avoidance of double taxation with respect to taxes on income , year: 1800, link: https://indiankanoon.org/doc/12252702/
Fetching data for Paris Convention for the Protection of Industrial Property , year: 1883, link: https://indiankanoon.org/doc/107184611/
Fetching data for The Berne Convention for the Protection of Literary and Artistic Works , year: 1886, link: https://indiankanoon.org/doc/198019798/
Fetching data for The United Kingdom-India Trade Agreement Rules, 1939 , year: 1939, link: https://indiankanoon.org/doc/172177741/
Fetching data for The Geneva Conventions Act, 1960 , year: 1960, link: https://indiankanoon.org/doc/188800991/
Fetching data for The Agreement between the Republic of India and the Republic of Austria for the avoidance of Double Taxation with respect to taxes on income , year: 1963, link: https://indiankanoon.org/doc/50260450/
Fetch

{'1800': [{'doc_id': '5212d3a3-b212-42bf-aac2-19ae3ac22b72',
   'link': 'https://indiankanoon.org/doc/12252702/',
   'title': 'Agreement between the Government of the Republic of India and the Government of the Russian Federation for the avoidance of double taxation with respect to taxes on income',
   'docsource': 'International Treaty - Act',
   'cited_to': 0,
   'cited_by': 0,
   'data': 'Agreement between the Government of the Republic of India and the Government of the Russian Federation for the avoidance of double taxation with respect to taxes on income Published vide Notification No. G.S.R. 507(E), dated 21st, August, 1998 Ministry of Finance (Deportment of Revenue) Central Board of Direct Taxes (Foreign Tax Division) Income-Tax G.S.R. 507(E). - Whereas the annexed agreement between the Government of the Russian Federation and the Government of the Republic of India for the avoidance of double taxation with regard to taxes on income has entered into force on the eleventh day of

In [6]:
# Spot check of a single document page, using the same request and parse steps
# as attach_data_to_metadata.
# NOTE(review): unlike fetch_soup, there is no raise_for_status() here, so an
# HTTP error response would be parsed like a normal page.
response = requests.get('https://indiankanoon.org/doc/136969192/', headers=HEADERS, timeout=10)
soup = BeautifulSoup(response.text, "html.parser")
container = soup.find("div", class_="akn-akomaNtoso")
# NOTE(review): find() gives None when the div is absent, in which case the
# next line raises AttributeError.
data = container.get_text(" ", strip=True)
data

"Agreement for the Surrender of Fugitive Offenders between the Government of the Republic of India and the Government of Hongkong Published vide Notification No. G.S.R. 275(E), dated 20th April, 1999 Ministry of External Affairs Order G.S.R. 275(E). - Who is the Agreement for the Surrender of Fugitive Offenders between the Government of the Republic of India and the Government of Hongkong was signed in Hong Kong on 28th June, 1997, and whereas the Later notification (From Hongkong side) as to the fulfilment of requirements for the entry into force of the Agreement was received on 15th October, 1997 and in terms of Article 17(1) entered into force on 14th November, 1997 and which Agreement provides as follows:- Article 1 Obligation to Surrender The parties agree to surrender to each other, subject to the provisions laid down in this Agreement, any person who is found in the jurisdiction of the requested party and who is wanted by the requesting party for prosecution or for the impositio

[{'doc_id': 'e6de5487-21cf-4a54-8606-a0525f061ff1',
  'link': 'https://indiankanoon.org/doc/55591742/',
  'title': 'International Copyright Order, 1999',
  'docsource': 'International Treaty - Act',
  'cited_to': 0,
  'cited_by': 0,
  'data': "International Copyright Order, 1999 Published vide S.O. 228(E), dated 24.3.1999, published in the Gazette of India, Ext., Pt. II, Section 3(i) dated 6.4.1999. 7. /413 In exercise of the powers conferred by section 40 of the Copyright Act, 1957 (14 of 1957), and in supersession of the International Copyright Order, 1991, the Central Government hereby makes the following Order, namely:\x97 1. (1) This Order may be called The International Copyright Order, 1999. (2) It shall come into force on the date of its publication in the Official Gazette. 2. In this Order, unless the context otherwise requires,\x97 (a) \x93Berne Convention Country\x94 means a country which is a member of the Berne Copyright Union, and includes a country mentioned either in Pa

In [9]:
# Write the metadata dict to a JSON file in the working directory.
# ensure_ascii=False keeps non-ASCII characters as-is (the file is opened as
# UTF-8) instead of \u-escaping them; indent=4 pretty-prints the output.
# Assumes the cells above have already populated `metadata`.
with open("./international_treaty_act.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)