In [None]:
import csv
import sys
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

START_URL = "https://en.adgm.thomsonreuters.com/rulebook/adgm-courts-legislation"   # <-- set this
MAX_DEPTH = 10                     # <-- set this (0 = only the page itself)
SAME_DOMAIN_ONLY = True             # True to restrict to the start domain
TIMEOUT = 10
HEADERS = {"User-Agent": "PDF-Link-Crawler/1.0"}

def in_scope(url, start_netloc):
    try:
        p = urlparse(url)
        if p.scheme not in ("http", "https"):
            return False
        if SAME_DOMAIN_ONLY and p.netloc != start_netloc:
            return False
        return True
    except Exception:
        return False

def is_pdf_url(url):
    # Catch common patterns; you can also HEAD-check content-type if needed
    return url.lower().split("?")[0].endswith(".pdf")

def normalize(url):
    # Drop fragments
    u = url.split("#")[0].strip()
    return u

def crawl(start_url, max_depth):
    start_netloc = urlparse(start_url).netloc
    visited_pages = set()
    found_pdfs = set()

    q = deque([(start_url, 0)])

    while q:
        url, depth = q.popleft()
        if url in visited_pages or depth > max_depth:
            continue
        visited_pages.add(url)

        try:
            resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            content_type = resp.headers.get("Content-Type", "").lower()
            if "pdf" in content_type:
                found_pdfs.add(normalize(url))
                continue
            if "text/html" not in content_type:
                continue
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception:
            continue

        for a in soup.find_all("a", href=True):
            href = urljoin(url, a["href"])
            href = normalize(href)
            if not in_scope(href, start_netloc):
                continue

            if is_pdf_url(href):
                found_pdfs.add(href)
            else:
                if href not in visited_pages and depth < max_depth:
                    q.append((href, depth + 1))

        # Be polite
        time.sleep(0.2)

    return sorted(found_pdfs)

def main():
    pdfs = crawl(START_URL, MAX_DEPTH)
    for u in pdfs:
        print(u)
    # Also save to CSV
    with open("pdf_links.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["pdf_url"])
        for u in pdfs:
            writer.writerow([u])
    print(f"\nSaved {len(pdfs)} links to pdf_links.csv", file=sys.stderr)

if __name__ == "__main__":
    main()


In [1]:
import requests
import os
import csv

def download_pdf(url, save_dir="downloads"):
    try:
        # Create folder if it doesn't exist
        os.makedirs(save_dir, exist_ok=True)
        
        # Extract filename from URL
        file_name = url.split("/")[-1]
        if not file_name.endswith(".pdf"):
            file_name += ".pdf"
        
        file_path = os.path.join(save_dir, file_name)
        
        # Download the PDF
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
        
        print(f"✅ Downloaded: {file_name}")
    
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to download {url} - {e}")

def download_from_csv(csv_file):
    with open(csv_file, newline='', encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if row:  # Skip empty rows
                url = row[0].strip()  # Assuming PDF link is in the first column
                download_pdf(url)

# Example usage
csv_file_path = "pdf_links.csv"  # Replace with your CSV file path
download_from_csv(csv_file_path)


❌ Failed to download pdf_url - Invalid URL 'pdf_url': No scheme supplied. Perhaps you meant https://pdf_url?
✅ Downloaded: ADGM_Court_Procedure_Rules_Amendment_No1_of_2017.pdf
✅ Downloaded: ADGM_Court_Procedure_Rules_Amendment_No_1_of_2018.pdf
✅ Downloaded: ADGM_Court_Procedure_Rules_Amendment_No_1_of_2019.pdf
✅ Downloaded: ADGM_Court_Procedure_Rules_Amendment_No_2_of_2017.pdf
✅ Downloaded: ADGM_Courts_Litigation_Funding_Rules.pdf
✅ Downloaded: ADGM_Courts_Regulations_2015_Amended_18.pdf
✅ Downloaded: ADGM_Courts_Regulations_Amendment_No_1_of_2017.pdf
✅ Downloaded: ADGM_Courts_Regulations_Amendment_No_1_of_2018.pdf
✅ Downloaded: ADGM_Courts_Rules_of_Conduct.pdf
✅ Downloaded: ADGM_REGISTRAR_S_ELECTRONIC_FORM_RULES_2015.pdf
✅ Downloaded: Application_of_English_Law_Amendment_Regulations_2016.pdf
✅ Downloaded: Certification_of_Enforcement_Agents_Rules_17Dec2015.pdf
✅ Downloaded: Companies_Amendment_No_2_Regulations_2016.pdf
✅ Downloaded: Companies_Amendment_Regulations_2015.pdf
✅ Downloade

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings

In [22]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

load_dotenv()
genai.configure(api_key="AIzaSyBXb0FRTMvwRyj2oZ_JjCnbjOqz9tevVS8")

for m in genai.list_models():
    print(m.name)


models/embedding-gecko-001
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash
models/gemini-2.5-flash-lite-preview-06-17
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/ge

In [25]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os

def chatbot_chain():
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0,
        google_api_key="AIzaSyBXb0FRTMvwRyj2oZ_JjCnbjOqz9tevVS8")  # ✅ Correct param name
    
    return llm

if __name__ == "__main__":
    llm = chatbot_chain()
    response = llm.invoke("Hello Gemini, tell me a fact about AI.")
    print(response.content)

Of course. Here's a foundational fact:

The term "Artificial Intelligence" was coined in **1956** by computer scientist John McCarthy for a proposal for the Dartmouth Summer Research Project on Artificial Intelligence. This workshop is widely considered the founding event of AI as a field of research.
