In [None]:
! pip install -q requests beautifulsoup4 selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
import json
import re

In [None]:
# Scrape case metadata from https://digiscr.sci.gov.in/ and write one JSON
# file per (year, volume) into the current working directory.

# Fixed pauses (in seconds) that give the site's AJAX calls time to finish.
# Adjust based on actual site response time.
DROPDOWN_SETTLE_SECONDS = 10
RESULTS_SETTLE_SECONDS = 3

# Captures the first quoted argument of a view_judgment('...') onclick handler.
JUDGMENT_ID_PATTERN = re.compile(r"view_judgment\('([^']+)'")


def _first_or_none(parent, css_selector):
    """Return the first descendant of *parent* matching *css_selector*, or None.

    ``find_element`` raises when nothing matches, so ``find_elements`` is used
    here to turn "element is missing" into an ordinary, checkable result.
    """
    matches = parent.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0] if matches else None


def _extract_case(li, volume_label):
    """Build the record for a single case from its <li> element.

    Args:
        li: The <li> WebElement that holds one case entry.
        volume_label: Value stored under the 'volume' key of the record.

    Returns:
        A dict with the keys 'volume', 'Case Name', 'Case Link', 'Citation',
        'Case Type', 'Date', 'Judge Names' and 'PDF Link'. Optional pieces
        that are absent from the page are replaced by a "No ... available"
        placeholder string.

    Raises:
        selenium NoSuchElementException: if the entry has no
            'div.cite-data a' anchor (the case name cannot be determined).
    """
    case_anchor = li.find_element(By.CSS_SELECTOR, 'div.cite-data a')

    # Start from a placeholder on every call so a missing or unparseable
    # onclick handler can never leak the previous case's link into this one.
    case_link = "No case link available"
    onclick_attribute = case_anchor.get_attribute('onclick')
    if onclick_attribute:
        match = JUDGMENT_ID_PATTERN.search(onclick_attribute)
        if match:
            case_link = f"https://digiscr.sci.gov.in/view_judgment?id={match.group(1)}"

    citation_element = _first_or_none(li, '.cititaion span')
    citation = citation_element.text if citation_element else "No citation available"

    # The first <p> is read as the case type and the second as the date.
    type_and_date = li.find_elements(By.CSS_SELECTOR, '.civil p')
    case_type = type_and_date[0].text if len(type_and_date) > 0 else "No case type available"
    date = type_and_date[1].text if len(type_and_date) > 1 else "No date available"

    judge_elements = li.find_elements(By.CSS_SELECTOR, '.entryjudgment span')
    judge_names = ', '.join(judge.text for judge in judge_elements)

    pdf_link_element = _first_or_none(li, 'a[href*="pdf_viewer"]')
    pdf_link = pdf_link_element.get_attribute('href') if pdf_link_element else "No PDF link available"

    return {
        'volume': volume_label,
        'Case Name': case_anchor.text,
        'Case Link': case_link,
        'Citation': citation,
        'Case Type': case_type,
        'Date': date,
        'Judge Names': judge_names,
        'PDF Link': pdf_link,
    }


# end_year is exclusive: range(1950, 1951) scrapes the year 1950 only.
start_year = 1950
end_year = 1951

# Initialize WebDriver
driver = webdriver.Chrome()
try:
    driver.get("https://digiscr.sci.gov.in/")

    wait = WebDriverWait(driver, 20)
    for desired_year in range(start_year, end_year):
        desired_year_str = str(desired_year)

        # Wait for the year dropdown to be loaded, then pick the year.
        year_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'year')))
        Select(year_dropdown).select_by_value(desired_year_str)

        time.sleep(DROPDOWN_SETTLE_SECONDS)

        # Wait for the volume dropdown to be populated.
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#volume option')))

        # Remember the *positions* of the real volumes within the full option
        # list. Placeholder options (empty value) are skipped, but positions
        # still refer to the full list, so no trailing volume is missed.
        initial_options = driver.find_elements(By.CSS_SELECTOR, '#volume option')
        volume_indexes = [
            index
            for index, option in enumerate(initial_options)
            if (option.get_attribute('value') or '').strip()
        ]

        for index in volume_indexes:
            # Pause first, then re-fetch the options, so the references used
            # below are as fresh as possible (avoids stale elements).
            time.sleep(DROPDOWN_SETTLE_SECONDS)
            volume_dropdown = driver.find_element(By.ID, 'volume')
            option = volume_dropdown.find_elements(By.TAG_NAME, 'option')[index]

            volume_text = option.text.strip()  # Use text to avoid whitespace issues
            print(f"Selecting Volume: {volume_text}")
            option.click()  # Click the option directly

            # Allow the AJAX call triggered by the selection to complete.
            time.sleep(RESULTS_SETTLE_SECONDS)

            # Print the total-records text shown for the selected volume.
            records_span = wait.until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "div.records span"))
            )
            print(records_span.text)

            # Locate the <ul class="linking-section"> and collect its <li> entries.
            ul_element = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "linking-section"))
            )
            li_elements = ul_element.find_elements(By.TAG_NAME, 'li')
            print(f"Total <li> elements within <ul class='linking-section'>: {len(li_elements)}")

            volume_label = desired_year_str + volume_text
            cases_details = [_extract_case(li, volume_label) for li in li_elements]

            # One output file per (year, volume).
            with open(f'{desired_year_str}_{volume_text}.json', 'w', encoding='utf-8') as f:
                json.dump(cases_details, f, indent=4)
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()