In [None]:
# Installing libraries
#! pip install requests
#! pip install beautifulsoup4
# !pip install PyPDF2

In [None]:
# loading libraries
from PyPDF2 import PdfReader
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os
import re

In [None]:
def scrape_kenya_gazettes():
    """
    Scrapes all gazette information from the Kenya Law website for the latest date
    and processes each PDF content.
    
    Returns:
        tuple: Contains (date_string, df)
            - date_string (str): Publication date as string
            - df (pd.DataFrame): DataFrame containing all gazette information
    """
    # Specify url and get initial page
    base_url = "https://new.kenyalaw.org/gazettes/"
    try:
        page = requests.get(base_url)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
        
        # Get current year and year page
        year = soup.find(lambda tag: tag.string and tag.string.startswith("20")).get_text()
        url_year = f"{base_url}{year}"
        
        page_year = requests.get(url_year)
        page_year.raise_for_status()
        soup = BeautifulSoup(page_year.text, 'html.parser')
        
        # Extract the latest date
        date_string = soup.find(
            lambda tag: tag.name == 'td' and 
            tag.get('class') == ['cell-date'] and 
            tag.string and 
            tag.string.strip()
        ).get_text()

        date_string = "22 November 2024"

        date = datetime.strptime(date_string, '%d %B %Y')
        
        # Find all rows and extract data for matching date
        rows = soup.find_all('tr')
        data = []
        
        for row in rows:
            date_cell = row.find('td', class_='cell-date')
            
            # Check if date matches and row has title
            if (date_cell and 
                date_cell.text.strip() == date_string and 
                row.find('td', class_='cell-title')):
                
                title_cell = row.find('td', class_='cell-title')
                cells = row.find_all('td')
                
                # Extract data for each gazette
                title = title_cell.find('a').text.strip()
                link = "https://new.kenyalaw.org" + title_cell.find('a')['href']
                category = cells[1].text.strip() or 'Weekly Issue'  # Default if empty
                index = cells[2].text.strip()
                download_link = link + "/source"
                
                data.append({
                    'Date': date,
                    'Issue': category,
                    'Title': title,
                    'Page Link': link,
                    'Download Link': download_link
                })
        
        # Create DataFrame
        df = pd.DataFrame(data)
        
        # Process each PDF and check for EPRA content
        for index, row in df.iterrows():
            try:
                # Download PDF
                response = requests.get(row['Download Link'], stream=True)
                response.raise_for_status()
                
                # Save PDF temporarily
                with open("download.pdf", 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
                
                # Extract and check content
                doc = PdfReader("download.pdf")
                tbl_content = doc.pages[0].extract_text()
                
                # Check for EPRA or "The Energy Act" mention
                has_epra = bool(re.findall(
                    r"Energy and Petroleum Regulatory Authority",
                    tbl_content,
                    flags=re.IGNORECASE
                ))

                has_energy_act = bool(re.findall(
                    r"The Energy Act",
                    tbl_content,
                    flags=re.IGNORECASE
                ))

                # Determine the column value based on mentions
                if has_epra and has_energy_act:
                    df.at[index, 'EPRA/Energy Act'] = "EPRA & The Energy Act"
                elif has_epra:
                    df.at[index, 'EPRA/Energy Act'] = "EPRA"
                elif has_energy_act:
                    df.at[index, 'EPRA/Energy Act'] = "The Energy Act"
                else:
                    df.at[index, 'EPRA/Energy Act'] = ""
                
            except Exception as e:
                print(f"Error processing {row['Download Link']}: {str(e)}")
                df.at[index, 'EPRA/Energy Act'] = f"Error: {str(e)}"
            
            finally:
                # Clean up temporary file
                if os.path.exists("download.pdf"):
                    try:
                        os.remove("download.pdf")
                    except:
                        pass
        
        # # Convert URLs to clickable links with custom text (Page + Index for Page Link, Download + Index for Download Link)
        # df['Page Link'] = df.apply(lambda row: f'<a href="{row["Page Link"]}">Page {row["Title"]}</a>', axis=1)
        # df['Download Link'] = df.apply(lambda row: f'<a href="{row["Download Link"]}">Download {row["Title"]}</a>', axis=1)
        
        return date, df
        
    except Exception as e:
        print(f"Failed to scrape gazette data: {str(e)}")
        return None, None

In [4]:
date, df = scrape_kenya_gazettes()

print(date)
df

2024-11-22 00:00:00


Unnamed: 0,Date,Issue,Title,Page Link,Download Link,EPRA/Energy Act
0,2024-11-22,Weekly Issue,Kenya Gazette Vol. CXXVI-No. 203,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,


In [None]:
# Base URL components
base_url = "https://new.kenyalaw.org/gazettes/"
base_url1 = 'https://new.kenyalaw.org'

# Define the start and end years
start_year = 2006
end_year = 2017

# Initialize an empty list to hold all data
all_data = []

# Loop through each year
for year in range(start_year, end_year + 1):
    url = f"{base_url}{year}"
    page = requests.get(url)

    # Parse the HTML
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Find rows in the table, skipping the header row
    rows = soup.find_all("tr")[1:]

    # Extract data for each row
    for row in rows:
        cells = row.find_all('td')
        row_data = []
        link = None  # Initialize link variable to store href
        
        # Extract data from each <td> element
        for cell in cells:
            # Check for links in cells (if any)
            if cell.find('a'):
                link = cell.find('a').get('href')  # Extract the link (href)
                if link:
                    page_link = f"{base_url1}{link}"
                row_data.append(cell.find('a').get_text(strip=True))  # Add the link text
            else:
                row_data.append(cell.get_text(strip=True))  # Add plain text
        
        # Append the page link as a new column for the row
        row_data.append(page_link if link else None)

        # Create a new column that merges the page link with "/source"
        row_data.append(page_link + "/source" if link else None)
        
        # Add row data to all_data list
        all_data.append(row_data)

# Define column names based on the structure
columns = ['Group/Title', 'Category', 'Index', 'Date', 'Page Link', 'Download Link']

# Create DataFrame
df = pd.DataFrame(all_data, columns=columns)

# Drop rows where 'Download Link' is NaN
df = df.dropna(subset=['Download Link'])

# Convert the 'Date' column to datetime
df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, '%d %B %Y'))

# Replace cells with a single space
df['Category'].replace('', 'Weekly Issue', inplace=True)

for index, row in df.iterrows():
    download_link = row['Download Link']
    download = f"file_{index}.pdf"  # Create a unique filename for each PDF

    try:
        # Send an HTTP GET request to fetch the file
        response = requests.get(download_link, stream=True)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Save the file locally in chunks
        with open("download.pdf", 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

        # Read the PDF and extract text from the first page
        doc = PdfReader("download.pdf")
        tbl_content = doc.pages[0].extract_text()

        # Check for EPRA or "The Energy Act" mention
        has_epra = bool(re.findall(
            r"Energy and Petroleum Regulatory Authority",
            tbl_content,
            flags=re.IGNORECASE
        ))

        has_energy_act = bool(re.findall(
            r"The Energy Act",
            tbl_content,
            flags=re.IGNORECASE
        ))

        # Determine the column value based on mentions
        if has_epra and has_energy_act:
            df.at[index, 'EPRA/Energy Act'] = "EPRA & The Energy Act"
        elif has_epra:
            df.at[index, 'EPRA/Energy Act'] = "EPRA"
        elif has_energy_act:
            df.at[index, 'EPRA/Energy Act'] = "The Energy Act"
        else:
            df.at[index, 'EPRA/Energy Act'] = ""

    except Exception as e:
        # Handle exceptions (e.g., network errors, file issues)
        df.at[index, 'EPRA/Energy Act'] = f"Error: {str(e)}"
        print(f"Failed to process {download_link}: {e}")

# Convert URLs to clickable links with custom text (Page + Index for Page Link, Download + Index for Download Link)
df['Page Link'] = df.apply(lambda row: f'<a href="{row["Page Link"]}">Page {row["Index"]}</a>', axis=1)
df['Download Link'] = df.apply(lambda row: f'<a href="{row["Download Link"]}">Download {row["Index"]}</a>', axis=1)

df = df.drop(columns=['Index'])

# Display DataFrame
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Category'].replace('', 'Weekly Issue', inplace=True)


Failed to process https://new.kenyalaw.org/akn/ke/officialGazette/2012-06-29/59/eng@2012-06-29/source: EOF marker not found
Failed to process https://new.kenyalaw.org/akn/ke/officialGazette/2012-06-01/58/eng@2012-06-29/source: 404 Client Error: Not Found for url: https://new.kenyalaw.org/akn/ke/officialGazette/2012-06-01/58/eng@2012-06-29/source
Failed to process https://new.kenyalaw.org/akn/ke/officialGazette/2015-07-03/70/eng@2015-07-03/source: EOF marker not found


Unnamed: 0,Group/Title,Category,Date,Page Link,Download Link,EPRA/Energy Act
1,Kenya Gazette Vol. CVIII-No. 88,Weekly Issue,2006-12-22,"<a href=""https://new.kenyalaw.org/akn/ke/offic...","<a href=""https://new.kenyalaw.org/akn/ke/offic...",
2,Kenya Gazette Vol. CVIII-No. 86,Special Issue,2006-12-12,"<a href=""https://new.kenyalaw.org/akn/ke/offic...","<a href=""https://new.kenyalaw.org/akn/ke/offic...",
3,Kenya Gazette Vol. CVIII-No. 85,Weekly Issue,2006-12-08,"<a href=""https://new.kenyalaw.org/akn/ke/offic...","<a href=""https://new.kenyalaw.org/akn/ke/offic...",
4,Kenya Gazette Vol. CVIII-No. 84,Special Issue,2006-12-01,"<a href=""https://new.kenyalaw.org/akn/ke/offic...","<a href=""https://new.kenyalaw.org/akn/ke/offic...",
5,Kenya Gazette Vol. CVIII-No. 83,Weekly Issue,2006-12-01,"<a href=""https://new.kenyalaw.org/akn/ke/offic...","<a href=""https://new.kenyalaw.org/akn/ke/offic...",


In [19]:
data = pd.read_csv("gazett notice db1.csv")

#  Function to extract the link from the HTML anchor tag
def extract_link(html):
    soup = BeautifulSoup(html, 'html.parser')
    link = soup.find('a')['href']
    return link

# Apply the function to the 'Link' column and extract the URLs
data['Page Link'] = data['Page Link'].apply(extract_link)
data['Download Link'] = data['Download Link'].apply(extract_link)

data

Unnamed: 0,Date,Issue,Title,Page Link,Download Link,EPRA/Energy Act
0,2024-11-15,Weekly Issue,Kenya Gazette Vol. CXXVI-No. 197,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,EPRA
1,2024-11-08,Weekly Issue,Kenya Gazette Vol. CXXVI-No. 192,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,
2,2024-10-31,Special Issue,Kenya Gazette Vol. CXXVI-No. 188,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,
3,2024-10-31,Weekly Issue,Kenya Gazette Vol. CXXVI-No. 185,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,
4,2024-10-31,Special Issue,Kenya Gazette Vol. CXXVI-No. 184,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,
...,...,...,...,...,...,...
2964,2006-01-27,Weekly Issue,Kenya Gazette Vol. CVIII-No. 7,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,
2965,2006-01-23,Special Issue,Kenya Gazette Vol. CVIII-No. 6,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,
2966,2006-01-20,Weekly Issue,Kenya Gazette Vol. CVIII-No. 5,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,
2967,2006-01-13,Weekly Issue,Kenya Gazette Vol. CVIII-No. 3,https://new.kenyalaw.org/akn/ke/officialGazett...,https://new.kenyalaw.org/akn/ke/officialGazett...,


In [22]:
data.to_csv("gazett notice db.csv")