In [None]:
import os
import re
import time
from urllib.parse import urljoin, urlsplit

import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [None]:
# Create the WebDriver.
# One Firefox session is opened here and bound to the module-level name
# `driver`; scrape_page() and extract_pdf_link() below use that same session.
driver = webdriver.Firefox()


In [None]:
# Initializing the base URL.
# Site root (no trailing slash); pdf_links() prefixes it to the hrefs it
# collects from the listing tables.
base_url = "https://new.kenyalaw.org"


In [None]:
def scrape_page(url):
    """Open ``url`` in the shared browser and parse what it rendered.

    The module-level ``driver`` navigates to ``url``; the call then waits up
    to 3 seconds for a ``<body>`` element to be present before the page
    source is handed to BeautifulSoup (``html.parser``).

    Args:
        url: Address to load.

    Returns:
        tuple: ``(driver, soup)`` -- the shared WebDriver itself and the
        parsed document.
    """
    driver.get(url)

    body_locator = (By.TAG_NAME, 'body')
    waiter = WebDriverWait(driver, 3)
    waiter.until(EC.presence_of_element_located(body_locator))

    rendered_html = driver.page_source
    return driver, BeautifulSoup(rendered_html, 'html.parser')

In [None]:
# testing_url = "https://new.kenyalaw.org/judgments/court-class/superior-courts/"
# page_driver, page = scrape_page(testing_url)

In [None]:
# page_driver.title


In [None]:
def years_links_extract(url, page):
    """Build the per-year listing URLs from the page's year navigation list.

    Each link's year is taken from the last non-empty path segment of its
    href and appended to ``url`` as ``<url><year>/``.

    Args:
        url: Listing URL the year pages hang off. It is concatenated as-is,
            so it is expected to end with ``/``.
        page: Parsed document (BeautifulSoup-style ``find`` / ``find_all``)
            of the listing at ``url``.

    Returns:
        list[str]: One URL per usable year link, in document order. Empty
        when the page has no ``ul.year-nav`` list.
    """
    ul_element = page.find("ul", class_="year-nav mb-0 ms-2")
    if ul_element is None:
        # Same graceful behaviour as months_links_extract: a page without the
        # navigation list yields no links instead of an AttributeError.
        print(f"Warning: No year navigation 'ul' element found on page: {url}")
        return []

    years_links = []
    for li in ul_element.find_all("li"):
        a_tag = li.find("a")
        if not a_tag or "href" not in a_tag.attrs:
            continue
        # Ignore empty segments so ".../2019/" and ".../2019" both give "2019".
        segments = [segment for segment in a_tag["href"].split("/") if segment]
        if segments:
            years_links.append(url + segments[-1] + "/")
    return years_links

In [None]:
# years = years_links_extract(testing_url, page)
# years

In [None]:
from urllib.parse import urljoin

def months_links_extract(url, page):
    """Return absolute month links taken from the page's second year-nav list.

    The page is searched for every ``ul.year-nav``; the second one found is
    treated as the month list. Each ``<li>`` with an ``<a href>`` contributes
    one link, resolved against ``url`` with ``urljoin``.

    Args:
        url: Address of the page being inspected; base for relative hrefs.
        page: Parsed document exposing ``find_all``.

    Returns:
        list[str]: Month URLs in document order, or an empty list (after
        printing a warning) when fewer than two matching lists exist.
    """
    nav_lists = page.find_all("ul", class_="year-nav mb-0 ms-2")
    found = len(nav_lists)

    # The month list is the second match, so anything less cannot be used.
    if found < 2:
        print(f"Warning: Expected at least 2 'ul' elements, found {found} on page: {url}")
        return []

    anchors = (item.find("a") for item in nav_lists[1].find_all("li"))
    return [
        urljoin(url, anchor["href"])
        for anchor in anchors
        if anchor and anchor.get("href")
    ]


In [None]:
# testing_url2 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/"
# page_driver2, page2 = scrape_page(testing_url2)

In [None]:
# months_links_extract(testing_url2, page2)

In [None]:
# testing_url3 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1"
# page_driver3, page3 = scrape_page(testing_url3)

In [None]:

# Extracting page numbers
def extract_page_numbers_links(url, page):
    """Collect the result-page URLs advertised by the pagination list.

    For every ``<li><a href>`` inside ``ul.pagination``, the href minus its
    first 12 characters is appended to ``url`` as ``<url>&<rest>``.

    Args:
        url: Listing URL that already carries a query string (the remainder
            of each href is joined to it with ``&``).
        page: Parsed document exposing ``find``.

    Returns:
        list[str]: Distinct page URLs in the order they first appear in the
        pagination list. Empty when the page has no pagination list.
    """
    ul_element = page.find("ul", class_="pagination flex-wrap")
    if ul_element is None:
        return []

    page_links = []
    for li in ul_element.find_all("li"):
        a_tag = li.find("a")
        if a_tag and "href" in a_tag.attrs:
            # NOTE(review): 12 == len("?alphabet=a&"), so this presumably
            # strips the alphabet filter that `url` already contains and keeps
            # only the trailing "page=N" -- confirm against the site's hrefs.
            page_links.append(f"{url}&{a_tag['href'][12:]}")

    # dict.fromkeys drops duplicates but, unlike set(), keeps first-seen
    # order, so pages are visited in the same order on every run.
    return list(dict.fromkeys(page_links))

In [None]:

def exctract_alphabetical_links(url):
    """Return ``url`` with ``?alphabet=<letter>`` appended for each of a-z.

    Args:
        url: Listing URL to filter; used verbatim as the prefix.

    Returns:
        list[str]: 26 URLs, ordered from ``a`` to ``z``.
    """
    first_code, last_code = ord('a'), ord('z')
    return [
        f"{url}?alphabet={chr(code)}"
        for code in range(first_code, last_code + 1)
    ]

In [None]:
def pdf_links(page):
    """List the case-page URLs found in a listing table.

    Every ``<tr>`` is checked for a ``td.cell-title``; when that cell holds
    an ``<a href>``, the href is prefixed with the module-level ``base_url``.

    Args:
        page: Parsed document exposing ``find_all``.

    Returns:
        list[str]: ``base_url + href`` for each qualifying row, in row order.
    """
    title_cells = (
        row.find("td", class_="cell-title") for row in page.find_all("tr")
    )
    anchors = (cell.find("a") for cell in title_cells if cell)
    return [
        base_url + anchor["href"]
        for anchor in anchors
        if anchor and "href" in anchor.attrs
    ]

In [None]:
def is_pdf_size_greater_than_zero(text):
    """Tell whether ``text`` advertises a non-zero file size.

    The first ``<number> <unit>`` occurrence in ``text`` is inspected, where
    the unit is ``byte``/``bytes`` or one of ``KB``, ``MB``, ``GB``, ``TB``,
    ``PB``. Only whether the number is above zero matters, so the unit is
    not converted.

    Args:
        text: Label to inspect, e.g. the download button's caption.

    Returns:
        bool: True when a size is found and its number is greater than 0;
        False when the number is 0 or no size is present.
    """
    # Accept every common size unit, not just KB: a label such as "1.2 MB"
    # previously fell through to "no size found" and was treated as empty.
    match = re.search(r'(\d+(\.\d+)?)\s*(?:bytes?|[KMGTP]B)', text)
    if match is None:
        return False
    return float(match.group(1)) > 0

def extract_pdf_link(url):
    """Resolve the download URL for the document page at ``url``.

    The page's primary download button (``a.btn.btn-primary.btn-shrink-sm``)
    is used when its caption reports a non-zero size. Otherwise the link
    inside the last ``<dd>`` element of the page is used instead.

    Args:
        url: Address of a single document page.

    Returns:
        str: The button's ``href``, or the fallback link resolved against
        ``url``.

    Raises:
        IndexError: Fallback needed but the page has no ``<dd>`` elements.
        TypeError: Fallback needed but the last ``<dd>`` holds no ``<a>``.
        Whatever ``find_element`` raises when the button is absent also
        propagates.
    """
    # scrape_page() performs the navigation itself; the extra driver.get()
    # that used to precede it loaded every document page twice.
    pdf_download_page_driver, pdf_download_page = scrape_page(url)
    a_element = pdf_download_page_driver.find_element(By.CSS_SELECTOR, "a.btn.btn-primary.btn-shrink-sm")

    if is_pdf_size_greater_than_zero(a_element.text.strip()):
        return a_element.get_attribute("href")

    dd_elements = pdf_download_page.find_all("dd")
    fallback_anchor = dd_elements[-1].find("a")
    # An href read from the parsed markup keeps whatever form the page used,
    # so make it absolute before handing it to the downloader. urljoin leaves
    # already-absolute links untouched.
    return urljoin(url, fallback_anchor["href"])
    
# extract_pdf_link("https://new.kenyalaw.org/akn/ke/judgment/kehc/1990/87/eng@1990-12-19")



In [None]:
def _pdf_urls_on_listing(listing_page):
    """Resolve the download URL of every case row on one parsed listing page."""
    return [extract_pdf_link(link) for link in pdf_links(listing_page)]


def exctract_all_cases_links_in_a_query(url):
    """Collect the PDF download URLs of every case reachable from ``url``.

    The listing at ``url`` is walked letter by letter (``?alphabet=a`` ..
    ``z``). For each letter, every page named by the pagination list is
    loaded and each case on it is resolved with ``extract_pdf_link``.

    Args:
        url: Listing URL to enumerate (e.g. one month of judgments).

    Returns:
        numpy.ndarray: Flat array of the download URLs found, in visiting
        order. The total is also printed.
    """
    all_pdfs = []
    for alphabet_link in exctract_alphabetical_links(url):
        _, first_page = scrape_page(alphabet_link)
        pages_links = extract_page_numbers_links(alphabet_link, first_page)

        if not pages_links:
            # No pagination list: the page just loaded is the letter's only
            # page. It used to be discarded, dropping all of its cases.
            all_pdfs.extend(_pdf_urls_on_listing(first_page))
            continue

        # NOTE(review): this relies on the pagination list linking to every
        # page, including the first one -- confirm against the site markup.
        for page_link in pages_links:
            _, listing_page = scrape_page(page_link)
            all_pdfs.extend(_pdf_urls_on_listing(listing_page))

    print(f"Total PDFs found: {len(all_pdfs)}")
    return np.array(all_pdfs).flatten()

In [None]:
def _pdf_destination(url, folder_path):
    """Choose the local ``.pdf`` path inside ``folder_path`` for ``url``."""
    stem = url.split('/')[-1].replace("source", "").replace(" ", "_")
    bare = stem[:-4] if stem.lower().endswith(".pdf") else stem
    if not bare.strip("._"):
        # The last segment was only "source" / "source.pdf", so nothing is
        # left to tell documents apart and every file would be written to the
        # same name. Name the file after the rest of the URL path instead.
        parent_segments = [seg for seg in urlsplit(url).path.split('/')[:-1] if seg]
        stem = "_".join(parent_segments).replace(" ", "_") or "download"
    return os.path.join(folder_path, stem + ".pdf")


def download_files(url_list, folder_path):
    """Download every URL in ``url_list`` into ``folder_path`` as a PDF.

    Each URL is attempted once via ``download_file``; the ones that fail are
    retried a single time afterwards. Progress and a final summary (including
    the URLs that still failed) are printed.

    Args:
        url_list: Sized iterable of URL strings.
        folder_path: Target directory; created if it does not exist.

    Returns:
        None
    """
    os.makedirs(folder_path, exist_ok=True)

    total_files = len(url_list)
    downloaded_files = 0
    failed_downloads = []

    print(f"Starting download of {total_files} files.")

    for attempted, url in enumerate(url_list, start=1):
        print(f"Preparing to download: {url}")
        if download_file(url, _pdf_destination(url, folder_path)):
            downloaded_files += 1
        else:
            failed_downloads.append(url)

        # Count attempts, not successes, so a failure does not leave the
        # "remaining" figure stuck.
        print(f"Files remaining: {total_files - attempted}")

    # Attempt to redownload failed files
    if failed_downloads:
        print(f"\nAttempting to redownload {len(failed_downloads)} failed files...")
        still_failed = []
        for url in failed_downloads:
            if download_file(url, _pdf_destination(url, folder_path)):
                downloaded_files += 1
            else:
                still_failed.append(url)
        failed_downloads = still_failed

    print(f"\nDownload complete. {downloaded_files} files downloaded successfully.")
    if failed_downloads:
        print(f"Failed to download {len(failed_downloads)} files:")
        for url in failed_downloads:
            print(url)

def download_file(url, filename):
    """Fetch ``url`` and store the response body at ``filename``.

    The request uses a 30 second timeout. The body is written only when the
    server answers with status 200.

    Args:
        url: Address to download.
        filename: Path of the file to (over)write in binary mode.

    Returns:
        bool: True when the file was written; False on a non-200 status or
        when ``requests`` raised a ``RequestException``.
    """
    try:
        print(f"Downloading: {url}")  # Log the URL being downloaded
        reply = requests.get(url, timeout=30)

        # Guard clause: anything but 200 is reported and treated as failure.
        if reply.status_code != 200:
            print(f"Failed to download {url}: Status code {reply.status_code}")
            return False

        with open(filename, 'wb') as target:
            target.write(reply.content)
        print(f"Successfully downloaded: {filename}")
        return True
    except requests.RequestException as err:
        print(f"Error downloading {url}: {err}")
        return False



In [None]:
def final_page_scrapper(url, download_folder="downloaded_files"):
    """Crawl a court listing year by year and month by month, downloading PDFs.

    The listing at ``url`` is scraped for year links; each year page is
    scraped for month links; each month is enumerated with
    ``exctract_all_cases_links_in_a_query`` and its PDFs are downloaded
    straight away with ``download_files``.

    Args:
        url: Root listing URL for one court.
        download_folder: Directory the PDFs are saved into. Defaults to
            ``"downloaded_files"``.

    Returns:
        set[str]: Every distinct download URL found across all months.
    """
    all_downloadable_links = set()

    # Scrape the main page
    _, scraped_page = scrape_page(url)

    # Year links
    years_links = years_links_extract(url, scraped_page)
    print("****************Got year links****************")

    for year_link in years_links:
        _, year_page = scrape_page(year_link)

        # Month links
        months_links = months_links_extract(year_link, year_page)
        print("****************Got month links****************")

        for month_link in months_links:
            # Extract all the case links for the current month
            downloadable_links = exctract_all_cases_links_in_a_query(month_link)
            # tolist() stores plain str objects rather than numpy.str_ ones.
            all_downloadable_links.update(np.array(downloadable_links).flatten().tolist())

            # Download right after each month so an interrupted crawl keeps
            # what it has fetched so far.
            download_files(downloadable_links, download_folder)

    return all_downloadable_links

# Call the function to start scraping.
# Starts the crawl at the KESC judgments listing; PDFs are downloaded as each
# month is processed.
final_page_scrapper("https://new.kenyalaw.org/judgments/KESC/")

In [None]:
# Shut the shared browser down once scraping has finished.
# NOTE(review): quit() on its own should end the session and close every
# window, which would make the close() call redundant -- confirm before removing.
driver.close()
driver.quit()
