In [1]:
import warnings
from io import StringIO
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib3.exceptions import InsecureRequestWarning

# Silence two warning categories for the whole notebook session:
#   * InsecureRequestWarning - raised by urllib3 when HTTPS certificate
#     verification is switched off (the scraping cell uses verify=False).
#   * FutureWarning - deprecation notices from libraries.
warnings.simplefilter(action="ignore", category=InsecureRequestWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Pages to scrape. Each page is expected to contain at least one HTML <table>;
# only the first table on each page is used.
urls = [
    "https://statistics.kilimo.go.ke/en/KenyafarmingHH/",
    "https://statistics.kilimo.go.ke/en/2_2/",
    "https://statistics.kilimo.go.ke/en/1_3/",
    "https://statistics.kilimo.go.ke/en/2_3a/",
    "https://statistics.kilimo.go.ke/en/2_3c/",
    "https://statistics.kilimo.go.ke/en/1_7/"
]

# Seconds to wait for the server before giving up, so that an unresponsive
# host cannot hang the kernel indefinitely.
REQUEST_TIMEOUT = 30

# Scraped tables keyed 'df_1', 'df_2', ... in the same order as `urls`.
dfs = {}

# Loop through each URL and store the DataFrames
for i, url in enumerate(urls, start=1):
    # NOTE(security): verify=False disables TLS certificate validation, so the
    # response cannot be authenticated. Kept as in the original notebook;
    # prefer passing a CA bundle via `verify=` if one is available.
    response = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT)

    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Parse the HTML and locate the first <table> element
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # pandas >= 2.1 deprecates passing literal HTML text to read_html;
    # wrapping the markup in StringIO is the supported form.
    df = pd.read_html(StringIO(str(table)))[0]

    # Store the DataFrame in the dictionary
    dfs[f'df_{i}'] = df

# Drop the row labelled 0 from every scraped table and renumber the index
# (presumably a non-data header row - TODO confirm against the source pages).
for key in dfs:
    dfs[key] = dfs[key].drop(0).reset_index(drop=True)

#### Performing EDA on the fetched Data 

#### Converting dataframe names to more meaningful names

In [3]:
# Lookup from the positional keys used while scraping ('df_1' ... 'df_6')
# to the descriptive variable names used in the rest of the notebook.
# This cell only defines the mapping; it does not rename anything itself.
rename_map = dict(
    df_1='kenya_farming_households',
    df_2='agricultural_subsector',
    df_3='agriculture_main_purpose',
    df_4='agriculture_crop_main_purpos',
    df_5='livestock_Farming',
    df_6='agriculture_size',
)

- I noticed that the county column in the DataFrames either had no proper name or was named inconsistently across tables.
- I therefore decided to create a standard column name (`county`) for the county column in every table.

In [4]:
# Publish each scraped table as a notebook-level variable under its
# descriptive name (e.g. dfs['df_1'] -> kenya_farming_households). The
# variables point at the same DataFrame objects held in `dfs`, so the column
# renaming below is visible through both.
globals().update(
    {variable_name: dfs[source_key] for source_key, variable_name in rename_map.items()}
)

# Standardise the county column header: '-' and 'County' both become 'county'.
# NOTE(review): these are substring replacements applied to every header, so a
# hyphen inside a longer name is rewritten as well - this is where the
# '100county499 Acres' column seen after merging comes from.
for frame in dfs.values():
    headers = frame.columns.str.replace('-', 'county', regex=False)
    headers = headers.str.replace('County', 'county', regex=False)
    frame.columns = headers

#### NOTE: 
- After conducting EDA and checking for null/missing values, I noticed that there was not much cleaning to perform other than standardizing the column names.

In [None]:
##

In [5]:
# Add a 'Total Production' column: the element-wise sum of the
# 'Subsistence Crop Farming' and 'Commercial Crop Production' columns.
agriculture_crop_main_purpos['Total Production'] = (
    agriculture_crop_main_purpos['Subsistence Crop Farming']
    .add(agriculture_crop_main_purpos['Commercial Crop Production'])
)

In [6]:
# Add a 'Crop Farming Ratio' column: 'Crop Farming Households' divided
# element-wise by 'Farming Households'.
agriculture_crop_main_purpos['Crop Farming Ratio'] = (
    agriculture_crop_main_purpos['Crop Farming Households']
    .div(agriculture_crop_main_purpos['Farming Households'])
)

In [7]:
# Join all six tables on 'county', starting from agriculture_main_purpose.
# Columns already present on the left keep their name; a clashing column
# coming from the right-hand table receives that table's suffix.
merged_data = agriculture_main_purpose
for right_table, right_suffix in [
    (kenya_farming_households, '_households'),
    (agricultural_subsector, '_subsector'),
    (agriculture_crop_main_purpos, '_crop'),
    (livestock_Farming, '_livestock'),
    (agriculture_size, '_size'),
]:
    merged_data = merged_data.merge(
        right_table, on='county', suffixes=('', right_suffix)
    )

In [8]:
# Preview the first five rows of the merged table to check the joins.
merged_data.head(n=5)

Unnamed: 0,county,Farming Households,Subsistence Farming,Commercial Production,DK,Total,Farming Households_households,Non Farming Households,% of farming Households,Farming Households_subsector,...,1 – 1.99 Acre,2 – 4.99 Acre,5 – 9.99 Acre,10 – 19.99 Acre,20 – 49.99 Acre,50 – 99 Acre,100county499 Acres,500 – 999 Acre,1000+ Acre,Not Stated
0,Mombasa,13171,9891,2176,1104,376212,13171,363041,3.5,13171,...,2280,2877,561,236,86,50,-,-,-,1013
1,Kwale,109040,97816,9788,1436,172767,109040,63727,63.1,109040,...,18299,46609,19716,8767,3329,722,15,1,-,1236
2,Kilifi,162648,149648,10890,2110,297935,162648,135287,54.6,162648,...,46821,74081,13464,4335,1344,297,14,2,3,1903
3,Tana River,34989,29089,5153,747,66964,34989,31975,52.3,34989,...,6259,12821,3619,1714,572,323,9,1,6,447
4,Lamu,20254,13614,4947,1693,34223,20254,13969,59.2,20254,...,2870,6573,3655,2128,472,26,6,-,1,1636


In [9]:
# Columns removed from the merged table. Names ending in '_households',
# '_subsector', '_crop', '_livestock' or '_size' are the suffixed duplicates
# produced by the merges; '100county499 Acres' is spelled that way because the
# earlier '-' -> 'county' header replacement rewrote its hyphen.
columns_to_drop = [
    'DK Farming Households',
    'Non Farming Households',
    'Farming Households_subsector',
    'Farming Households_crop',
    'DK_crop',
    'DK_livestock',
    'Total_size',
    'Not Stated',
    '1000+ Acre',
    '100county499 Acres',
    '500 – 999 Acre',
    'Livestock Farming Households_livestock',
    'Farming Households_households',
    'Farming Households_livestock',
    'Crop Farming Households_crop',
    'DK',
    'Total',
    'Crop Farming Ratio',
]

# errors='ignore' means any listed name that is absent from the table is
# skipped silently rather than raising a KeyError.
merged_data = merged_data.drop(
    labels=columns_to_drop, axis='columns', errors='ignore'
)

In [10]:
# Lift the column display limit so the preview shows every column instead of
# eliding the middle ones with '...'. The setting persists for the session.
pd.options.display.max_columns = None
merged_data.head(n=5)

Unnamed: 0,county,Farming Households,Subsistence Farming,Commercial Production,% of farming Households,Crop Farming Households,Livestock Farming Households,Aquaculture Households,Fishing Households,Subsistence Crop Farming,Commercial Crop Production,Total Production,Subsistence Livestock Farming,Commercial Livestock Production,Less than 1 Acre,1 – 1.99 Acre,2 – 4.99 Acre,5 – 9.99 Acre,10 – 19.99 Acre,20 – 49.99 Acre,50 – 99 Acre
0,Mombasa,13171,9891,2176,3.5,7207,8225,93,1409,6239,954,7193,6465,1704,6068,2280,2877,561,236,86,50
1,Kwale,109040,97816,9788,63.1,100010,70211,212,4452,90652,9278,99930,63266,6798,10346,18299,46609,19716,8767,3329,722
2,Kilifi,162648,149648,10890,54.6,149324,97900,360,6393,139438,9802,149240,89873,7902,20384,46821,74081,13464,4335,1344,297
3,Tana River,34989,29089,5153,52.3,16716,26182,207,2136,12207,4494,16701,23152,2746,9218,6259,12821,3619,1714,572,323
4,Lamu,20254,13614,4947,59.2,15195,13144,63,3162,10561,4624,15185,9744,3356,2887,2870,6573,3655,2128,472,26


In [11]:
# County-level data is read from a local Excel workbook. The path is relative,
# so it is resolved against the kernel's current working directory.
county_data_path = Path('county_data.xlsx')

# Check up front so a missing workbook produces an actionable message that
# names the exact location searched, instead of a bare errno from the reader.
if not county_data_path.is_file():
    raise FileNotFoundError(
        f"County data workbook not found at '{county_data_path.resolve()}'. "
        "Place county_data.xlsx in the notebook's working directory "
        "before running this cell."
    )

county_data = pd.read_excel(county_data_path)

# Summarise column names, dtypes and non-null counts of the loaded sheet.
county_data.info()

FileNotFoundError: [Errno 2] No such file or directory: 'county_data.xlsx'