In [21]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import numpy as np
import re
import requests
import os


In [22]:
# Create the WebDriver
driver = webdriver.Firefox()


In [23]:
# Intializing the base url
base_url = "https://new.kenyalaw.org"


In [24]:
def scrape_page(url):
    # Navigate to the page
    driver.get(url)

    # Wait for the page to load
    WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
  
    #  Get the page source and parse it
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    return driver, soup

In [25]:
# testing_url = "https://new.kenyalaw.org/judgments/court-class/superior-courts/"
# page_driver, page = scrape_page(testing_url)

In [26]:
# page_driver.title


In [27]:
def years_links_extract(url, page):
    # Extracting all the years
    ul_element = page.find("ul", class_="year-nav mb-0 ms-2")
    years_links = []
    if ul_element.findAll("li"):
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                years_links.append(url + a_tag["href"].split("/")[-2] + "/")
    
    return years_links

In [28]:
# years = years_links_extract(testing_url, page)
# years

In [29]:
def months_links_extract(url, page):
    # Extracting all the years
    ul_element = page.find_all("ul", class_="year-nav mb-0 ms-2")[1]
    months_links = []
    if ul_element.findAll("li"):
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                month = a_tag["href"].split("/")[-2]
                months_links.append(url + month + "/")
    
    return months_links

In [30]:
# testing_url2 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/"
# page_driver2, page2 = scrape_page(testing_url2)

In [31]:
# months_links_extract(testing_url2, page2)

In [32]:
# testing_url3 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1"
# page_driver3, page3 = scrape_page(testing_url3)

In [33]:
# Extracting page numbers
def extract_page_numbers_links(url, page):
    ul_element = page.find("ul", class_="pagination flex-wrap")
    page_numbers = []
    if ul_element:
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                page_numbers.append(f"{url}&{a_tag["href"][12:]}")
    
    # print("Func 1")
    # print(page_numbers)
    
    return list(set(page_numbers))

In [34]:

def exctract_alphabetical_links(url):
    alphabets = [chr(i) for i in range(ord('a'), ord('z') + 1)]
    links = []
    
    for alphabet in alphabets:
        link = f"{url}?alphabet={alphabet}"
        links.append(link)
    
    # print("Func 2")
    # print(links)
    return links

In [35]:
def pdf_links(page):
    tr_elements = page.find_all("tr")
    links = []
    
    for tr in tr_elements:
        # Find td with class 'cell-title'
        td_title = tr.find("td", class_="cell-title")
        if td_title:
            a_tag = td_title.find("a")
            if a_tag and "href" in a_tag.attrs:
                links.append(base_url + a_tag["href"])
    
    # print("Func 3")
    # print(links)
    
    return links

In [36]:

def is_pdf_size_greater_than_zero(text):
    # Use regex to find the storage size in the format (number + KB)
    match = re.search(r'(\d+(\.\d+)?)\s*KB', text)
    
    if match:
        # Extract the size as a float
        size_in_kb = float(match.group(1))
        # Return True if size is greater than 0, otherwise False
        return size_in_kb > 0
    else:
        # If no size is found, return False
        return False

# Extract pdf
def extract_pdf_link(url):
    driver.get(url)
    pdf_download_page_driver, pdf_download_page = scrape_page(url)
    
    a_element = pdf_download_page_driver.find_element(By.CSS_SELECTOR, "a.btn.btn-primary.btn-shrink-sm")
    a_tag = a_element.get_attribute("href")

    
    pdf_size_greater_than_zero = is_pdf_size_greater_than_zero(a_element.text.strip())
    
    if pdf_size_greater_than_zero:
        return a_tag
    else:
        dd_elements = pdf_download_page.find_all("dd") 
        a_tag = dd_elements[-1].find("a")
        download_link = a_tag["href"]
        return download_link
    
# extract_pdf_link("https://new.kenyalaw.org/akn/ke/judgment/kehc/1990/87/eng@1990-12-19")



In [37]:
# sorta main fun
def exctract_all_cases_links_in_a_query(url):
    all_alphabets_links = exctract_alphabetical_links(url)
    all_pdfs = []
    
    for alphabet_link in all_alphabets_links:
        # driver.get(alphabet_link)
        page_1_driver, page_1 = scrape_page(alphabet_link)
        # will concat alphabet link and page
        # print("Printing alphabetical link", alphabet_link)
        pages_links = extract_page_numbers_links(alphabet_link, page_1)
        
        for page_link in pages_links:           
            page_2_driver, page_2 = scrape_page(page_link)
            pdf_download_page_links = pdf_links(page_2)
            
            for link in pdf_download_page_links:
                pdf_link =  extract_pdf_link(link)
                all_pdfs.append(pdf_link)
            
            print(all_pdfs)
            
    return np.array(all_pdfs).flatten()

In [38]:
# exctract_all_cases_links_in_a_query("https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1")



In [39]:
def final_page_scrapper(url):
    all_downloadable_links = set([])
    # Iterate through years then months
    scraped_page_driver, scraped_page = scrape_page(url)
    
    # Year links
    years_links = years_links_extract(url, scraped_page)
    print("****************Got year links****************")
    
    for year_link in years_links:
        year_page_driver, year_page = scrape_page(year_link)
        # Month links
        months_links = months_links_extract(year_link, year_page)
        print("****************Got month links****************")
        
        for month_link in months_links:
            downloadble_links = exctract_all_cases_links_in_a_query(month_link)
            all_downloadable_links.update(np.array(downloadble_links).flatten())
          
    
    return all_downloadable_links

In [40]:
final_page_scrapper("https://new.kenyalaw.org/judgments/court-class/superior-courts/")

KeyboardInterrupt: 

In [401]:
import os
import requests
import time

def download_files(url_list, folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    total_files = len(url_list)
    downloaded_files = 0
    failed_downloads = []

    print(f"Starting download of {total_files} files.")

    for url in url_list:
        filename = os.path.join(folder_path, url.split('/')[-1])
        
        if download_file(url, filename):
            downloaded_files += 1
        else:
            failed_downloads.append(url)
        
        print(f"Files remaining: {total_files - downloaded_files}")

    if failed_downloads:
        print(f"\nAttempting to redownload {len(failed_downloads)} failed files...")
        for url in failed_downloads[:]:  # Create a copy to iterate over
            filename = os.path.join(folder_path, url.split('/')[-1])
            if download_file(url, filename):
                downloaded_files += 1
                failed_downloads.remove(url)

    print(f"\nDownload complete. {downloaded_files} files downloaded successfully.")
    if failed_downloads:
        print(f"Failed to download {len(failed_downloads)} files:")
        for url in failed_downloads:
            print(url)

def download_file(url, filename):
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded: {filename}")
            return True
        else:
            print(f"Failed to download: {url}")
            return False
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return False


urls = ['https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/407/eng@2024-01-31/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/594/eng@2024-01-31/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/384/eng@2024-01-31/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/5163/eng@2024-01-31/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/455/eng@2024-01-31/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/297/eng@2024-01-31/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/812/eng@2024-01-31/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/928/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/314/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/336/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/597/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/608/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/308/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/504/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/276/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/599/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/2333/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/217/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/309/eng@2024-01-30/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/204/eng@2024-01-29/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/615/eng@2024-01-29/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/434/eng@2024-01-26/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/556/eng@2024-01-26/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/336/eng@2024-01-26/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/135/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/525/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/233/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/488/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelrc/2024/39/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/keca/2024/673/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/409/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelc/2024/169/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/653/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/644/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/254/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/384/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/407/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/692/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/keca/2024/49/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/keelrc/2024/46/eng@2024-01-25/source']

download_folder = "downloaded_files"
download_files(urls, download_folder)
        

Starting download of 40 files.
Downloaded: downloaded_files/source
Files remaining: 39
Downloaded: downloaded_files/source
Files remaining: 38
Downloaded: downloaded_files/source
Files remaining: 37
Downloaded: downloaded_files/source
Files remaining: 36
Downloaded: downloaded_files/source
Files remaining: 35
Downloaded: downloaded_files/source
Files remaining: 34
Downloaded: downloaded_files/source
Files remaining: 33
Downloaded: downloaded_files/source
Files remaining: 32
Downloaded: downloaded_files/source
Files remaining: 31
Downloaded: downloaded_files/source
Files remaining: 30
Downloaded: downloaded_files/source
Files remaining: 29
Downloaded: downloaded_files/source
Files remaining: 28


KeyboardInterrupt: 

In [93]:
driver.close()
driver.quit()
