In [77]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import numpy as np
import re
import requests
import os


In [78]:
# Create the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))


In [79]:
# Intializing the base url
base_url = "https://new.kenyalaw.org"


In [80]:
def scrape_page(url):
    # Navigate to the page
    driver.get(url)

    # Wait for the page to load
    WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
  
    #  Get the page source and parse it
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    return driver, soup

In [81]:
# testing_url = "https://new.kenyalaw.org/judgments/court-class/superior-courts/"
# page_driver, page = scrape_page(testing_url)

In [82]:
# page_driver.title


In [83]:
def years_links_extract(url, page):
    # Extracting all the years
    ul_element = page.find("ul", class_="year-nav mb-0 ms-2")
    years_links = []
    if ul_element.findAll("li"):
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                years_links.append(url + a_tag["href"].split("/")[-2] + "/")
    
    return years_links

In [84]:
# years = years_links_extract(testing_url, page)
# years

In [85]:
def months_links_extract(url, page):
    # Extracting all the years
    ul_element = page.find_all("ul", class_="year-nav mb-0 ms-2")[1]
    months_links = []
    if ul_element.findAll("li"):
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                month = a_tag["href"].split("/")[-2]
                months_links.append(url + month + "/")
    
    return months_links

In [86]:
# testing_url2 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/"
# page_driver2, page2 = scrape_page(testing_url2)

In [87]:
# months_links_extract(testing_url2, page2)

In [88]:
# testing_url3 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1"
# page_driver3, page3 = scrape_page(testing_url3)

In [89]:
# Extracting page numbers
def extract_page_numbers_links(url, page):
    ul_element = page.find("ul", class_="pagination flex-wrap")
    page_numbers = []
    if ul_element:
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                page_numbers.append(f"{url}&{a_tag['href'][12:]}")

    
    # print("Func 1")
    # print(page_numbers)
    
    return list(set(page_numbers))

In [90]:

def exctract_alphabetical_links(url):
    alphabets = [chr(i) for i in range(ord('a'), ord('z') + 1)]
    links = []
    
    for alphabet in alphabets:
        link = f"{url}?alphabet={alphabet}"
        links.append(link)
    
    # print("Func 2")
    # print(links)
    return links

In [91]:
def pdf_links(page):
    tr_elements = page.find_all("tr")
    links = []
    
    for tr in tr_elements:
        # Find td with class 'cell-title'
        td_title = tr.find("td", class_="cell-title")
        if td_title:
            a_tag = td_title.find("a")
            if a_tag and "href" in a_tag.attrs:
                links.append(base_url + a_tag["href"])
    
    # print("Func 3")
    # print(links)
    
    return links

In [92]:

def is_pdf_size_greater_than_zero(text):
    # Use regex to find the storage size in the format (number + KB)
    match = re.search(r'(\d+(\.\d+)?)\s*KB', text)
    
    if match:
        # Extract the size as a float
        size_in_kb = float(match.group(1))
        # Return True if size is greater than 0, otherwise False
        return size_in_kb > 0
    else:
        # If no size is found, return False
        return False

# Extract pdf
def extract_pdf_link(url):
    driver.get(url)
    pdf_download_page_driver, pdf_download_page = scrape_page(url)
    
    a_element = pdf_download_page_driver.find_element(By.CSS_SELECTOR, "a.btn.btn-primary.btn-shrink-sm")
    a_tag = a_element.get_attribute("href")

    
    pdf_size_greater_than_zero = is_pdf_size_greater_than_zero(a_element.text.strip())
    
    if pdf_size_greater_than_zero:
        return a_tag
    else:
        dd_elements = pdf_download_page.find_all("dd") 
        a_tag = dd_elements[-1].find("a")
        download_link = a_tag["href"]
        return download_link
    
# extract_pdf_link("https://new.kenyalaw.org/akn/ke/judgment/kehc/1990/87/eng@1990-12-19")


from urllib.parse import urlparse
from datetime import datetime

def download_pdf(url, folder="downloaded_files"):
    # Create the folder if it doesn't exist
    if not os.path.exists(folder):
        os.makedirs(folder)
    
    # Extract the filename from the URL, default to "downloaded_file" if empty
    filename = os.path.basename(urlparse(url).path) or "downloaded_file"
    
    # If the filename has no extension, assume it's a PDF
    if not filename.endswith(".pdf"):
        filename += ".pdf"
    
    # Create a unique filename if the file already exists
    base_filename, extension = os.path.splitext(filename)
    unique_filename = filename
    counter = 1
    
    while os.path.exists(os.path.join(folder, unique_filename)):
        # If a file with the same name exists, add a counter or timestamp
        unique_filename = f"{base_filename}_{counter}{extension}"
        counter += 1
    
    # Define the final file path
    file_path = os.path.join(folder, unique_filename)
    
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Write the content to a PDF file in the specified folder
            with open(file_path, "wb") as pdf_file:
                pdf_file.write(response.content)
            
            print(f"Downloaded: {file_path}")
        else:
            print(f"Failed to download {url}. Status code: {response.status_code}")
    
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

In [93]:
# sorta main fun
def exctract_all_cases_links_in_a_query(url):
    all_alphabets_links = exctract_alphabetical_links(url)
    all_pdfs = []
    
    for alphabet_link in all_alphabets_links:
        # driver.get(alphabet_link)
        page_1_driver, page_1 = scrape_page(alphabet_link)
        # will concat alphabet link and page
        # print("Printing alphabetical link", alphabet_link)
        pages_links = extract_page_numbers_links(alphabet_link, page_1)
        
        for page_link in pages_links:           
            page_2_driver, page_2 = scrape_page(page_link)
            pdf_download_page_links = pdf_links(page_2)
            
            for link in pdf_download_page_links:
                pdf_link =  extract_pdf_link(link)
                #fn to download file from pdf link
                download_pdf(pdf_link)
                all_pdfs.append(pdf_link)
            
            print(all_pdfs)
            
    return np.array(all_pdfs).flatten()

In [94]:
# exctract_all_cases_links_in_a_query("https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1")



In [95]:
import os
import requests
import numpy as np

#define all_downloadable_links
all_downloadable_links = []


# Existing functions remain the same: `download_files`, `download_file`, etc.

def final_page_scrapper(url, download_folder="downloaded_files"):
    # Ensure download folder exists
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
        
    # Set to store all unique downloadable links
    all_downloadable_links = set()
    
    # Start scraping
    scraped_page_driver, scraped_page = scrape_page(url)
    years_links = years_links_extract(url, scraped_page)
    print("****************Got year links****************")
    
    for year_link in years_links:
        year_page_driver, year_page = scrape_page(year_link)
        months_links = months_links_extract(year_link, year_page)
        print("****************Got month links****************")
        
        for month_link in months_links:
            downloadble_links = exctract_all_cases_links_in_a_query(month_link)
            
            # Convert to list, flatten and iterate through each link
            for link in np.array(downloadble_links).flatten():
                # Skip if link already downloaded
                if link in all_downloadable_links:
                    continue
                #Add link to the set of all downloadable links
                all_downloadable_links.add(link)
                
    return all_downloadable_links
                 


final_page_scrapper("https://new.kenyalaw.org/judgments/KEHC/HCKSI/")

****************Got year links****************
****************Got month links****************
Downloaded: downloaded_files\source.pdf
['https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/407/eng@2024-01-25/source']
Downloaded: downloaded_files\source_1.pdf
['https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/407/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/436/eng@2024-01-25/source']
Downloaded: downloaded_files\source_2.pdf
['https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/407/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/436/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/1364/eng@2024-01-24/source']
Downloaded: downloaded_files\source_3.pdf
Downloaded: downloaded_files\source_4.pdf
['https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/407/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/436/eng@2024-01-25/source', 'https://new.kenyalaw.org/akn/ke/judgment/kehc/2024/

KeyboardInterrupt: 

In [None]:
driver.close()
driver.quit()
