In [56]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time


In [57]:
# Create the WebDriver
driver = webdriver.Firefox()


In [58]:
def scrape_page(url):
    # Navigate to the page
    driver.get(url)

    # Wait for the page to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
  
    #  Get the page source and parse it
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    return driver, soup

In [59]:
testing_url = "https://new.kenyalaw.org/judgments/court-class/superior-courts/"
page_driver, page = scrape_page(testing_url)

In [60]:
page_driver.title


'Superior Courts - Kenya Law'

In [61]:
def years_links_extract(url, page):
    # Extracting all the years
    ul_element = page.find("ul", class_="year-nav mb-0 ms-2")
    years_links = []
    if ul_element.findAll("li"):
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                years_links.append(url + a_tag["href"].split("/")[-2] + "/")
    
    return years_links

In [62]:
years = years_links_extract(testing_url, page)

In [63]:
def months_links_extract(url, page):
    # Extracting all the years
    ul_element = page.find_all("ul", class_="year-nav mb-0 ms-2")[1]
    months_links = []
    if ul_element.findAll("li"):
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                month = a_tag["href"].split("/")[-2]
                months_links.append(url + month + "/")
    
    return months_links

In [64]:
testing_url2 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/"
page_driver2, page2 = scrape_page(testing_url2)

In [65]:
months_links_extract(testing_url2, page2)

['https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/2/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/3/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/4/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/5/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/6/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/7/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/8/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/9/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/10/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/11/',
 'https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/12/']

In [66]:
testing_url3 = "https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1"
page_driver3, page3 = scrape_page(testing_url3)

In [67]:
# Extracting page numbers
def extract_page_numbers_links(url, page):
    ul_element = page.find("ul", class_="pagination flex-wrap")
    page_numbers = []
    if ul_element:
        for li in ul_element.findAll("li"):
            a_tag = li.find("a")
            if a_tag and "href" in a_tag.attrs:
                page_numbers.append(f"{url}&{a_tag["href"][12:]}")
    
    # print("Func 1")
    # print(page_numbers)
    
    return list(set(page_numbers))

In [68]:
def exctract_alphabetical_links(url):
    alphabets = [chr(i) for i in range(ord('a'), ord('z') + 1)]
    links = []
    
    for alphabet in alphabets:
        link = f"{url}?alphabet={alphabet}"
        links.append(link)
    
    # print("Func 2")
    # print(links)
    return links

In [69]:
def pdf_links(page):
    base_url = "https://new.kenyalaw.org"
    tr_elements = page.find_all("tr")
    links = []
    
    for tr in tr_elements:
        # Find td with class 'cell-title'
        td_title = tr.find("td", class_="cell-title")
        if td_title:
            a_tag = td_title.find("a")
            if a_tag and "href" in a_tag.attrs:
                links.append(base_url + a_tag["href"])
    
    # print("Func 3")
    # print(links)
    
    return links

In [70]:
def exctract_all_cases_links_in_a_query(url):
    all_alphabets_links = exctract_alphabetical_links(url)
    all_pdfs = []
    
    for alphabet_link in all_alphabets_links:
        print(alphabet_link)
        driver.get(alphabet_link)
        page_1 = BeautifulSoup(driver.page_source, 'html.parser')
        # will concat alphabet link and page
        # print("Printing alphabetical link", alphabet_link)``
        pages_links = extract_page_numbers_links(alphabet_link, page_1)
        
        for page_link in pages_links:
            if (page_link[-6:] == "page=1"):
                page_2 = BeautifulSoup(driver.page_source, 'html.parser')   
                pdf = pdf_links(page_2) 
                all_pdfs.append(pdf) 
                continue
            
            driver.get(page_link)
            page_2 = BeautifulSoup(driver.page_source, 'html.parser')
            pdf = pdf_links(page_2) 
            all_pdfs.append(pdf)
        
  
    
    print("Func 4")
    print(all_pdfs)
    return all_pdfs

In [71]:
exctract_all_cases_links_in_a_query("https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1")

https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=a
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=b
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=c
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=d
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=e
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=f
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=g
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=h
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=i
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=j
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=k
https://new.kenyalaw.org/judgments/court-class/superior-courts/2019/1?alphabet=l
https://new.kenyalaw.org/jud

KeyboardInterrupt: 

In [35]:
driver.quit()
driver.close()


MaxRetryError: HTTPConnectionPool(host='localhost', port=49266): Max retries exceeded with url: /session/c050c9e0-c60b-4e4c-97fb-589f3a2cf13a/window (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1068b9c70>: Failed to establish a new connection: [Errno 61] Connection refused'))