In [25]:
import pandas as pd

# Load CSV from GitHub
url = "https://raw.githubusercontent.com/ChiriKamau/soma/main/data/distribution-of-population-age-3-years-and-above-by-school-attendance-status-area-of-residence-s.csv"
df = pd.read_csv(url)

# Remove commas from numeric columns
df = df.apply(lambda x: x.str.replace(",", "") if x.dtype == "object" else x)

# Convert numeric columns
for col in df.columns[1:]:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Alphabetical list of counties
counties_alphabetical = [
    "Baringo","Bomet","Bungoma","Busia","Elgeyo Marakwet","Embu",
    "Garissa","Homa Bay","Isiolo","Kajiado","Kakamega","Kericho",
    "Kiambu","Kilifi","Kirinyaga","Kisii","Kisumu","Kitui","Kwale",
    "Laikipia","Lamu","Machakos","Makueni","Mandera","Marsabit",
    "Meru","Migori","Mombasa","Muranga","Nairobi City","Nakuru",
    "Nandi","Narok","Nyamira","Nyandarua","Nyeri","Samburu","Siaya",
    "Taita Taveta","Tana River","Tharaka Nithi","Trans Nzoia","Turkana",
    "Uasin Gishu","Vihiga","Wajir","West Pokot"
]

# Map uppercase county names to numbers
county_number_map = {c.upper(): i+1 for i, c in enumerate(counties_alphabetical)}

# Add County_Number column
county_numbers = []
current_county_number = None
for name in df["County/ Sub-County"]:
    if isinstance(name, str) and name.upper() in county_number_map:
        current_county_number = county_number_map[name.upper()]
        county_numbers.append(current_county_number)
    else:
        county_numbers.append(current_county_number)

df["County_Number"] = county_numbers

# Extract only subcounties
exclude = [x.upper() for x in counties_alphabetical] + ["KENYA","RURAL","URBAN","MALE","FEMALE","INTERSEX"]
df_subcounties = df[~df["County/ Sub-County"].str.upper().isin(exclude)].copy()

# Number subcounties alphabetically within each county (keep as float to avoid NaN issue)
df_subcounties["Subcounty_Number"] = df_subcounties.groupby("County_Number")["County/ Sub-County"] \
                                                   .rank(method="first", ascending=True)

# Optional: convert valid numbers to int, keep NaN as is
df_subcounties["Subcounty_Number"] = df_subcounties["Subcounty_Number"].apply(lambda x: int(x) if pd.notna(x) else x)

# Keep only necessary columns
df_numbered = df_subcounties[["County_Number","Subcounty_Number","County/ Sub-County","At School/ Learning Institution"]]

df_numbered.head(20)


Unnamed: 0,County_Number,Subcounty_Number,County/ Sub-County,At School/ Learning Institution
0,,,,
10,28.0,1.0,CHANGAMWE,39032.0
13,28.0,2.0,JOMVU,51892.0
16,28.0,3.0,KISAUNI,95844.0
19,28.0,4.0,LIKONI,79083.0
22,28.0,5.0,MVITA,46714.0
25,28.0,6.0,NYALI,65858.0
31,19.0,1.0,KINANGO,38765.0
34,19.0,2.0,LUNGA LUNGA,76787.0
37,19.0,3.0,MATUGA,76734.0


In [26]:
# Top 10 subcounties by population at school
top10 = df_numbered.sort_values("At School/ Learning Institution", ascending=False).head(10)
print("Top 10 Subcounties by Population at School:")
print(top10.to_string(index=False))

# Bottom 10 subcounties by population at school
bottom10 = df_numbered.sort_values("At School/ Learning Institution", ascending=True).head(10)
print("\nBottom 10 Subcounties by Population at School:")
print(bottom10.to_string(index=False))


Top 10 Subcounties by Population at School:
 County_Number  Subcounty_Number County/ Sub-County  At School/ Learning Institution
          30.0               2.0           EMBAKASI                         298718.0
          30.0               4.0           KASARANI                         246064.0
          30.0               9.0              NJIRU                         205700.0
          44.0               2.0    ELGEYO/MARAKWET                         203979.0
          26.0              17.0      THARAKA-NITHI                         144933.0
          30.0               1.0          DAGORETTI                         133970.0
          14.0               8.0            MALINDI                         132098.0
           3.0               5.0      BUNGOMA SOUTH                         127072.0
          31.0               5.0           NAIVASHA                         122184.0
          13.0              10.0              RUIRU                         116732.0

Bottom 10 Subcountie

In [27]:
# Get the County_Number for Murang'a
muranga_number = county_number_map["MURANGA"]

# Filter only subcounties in Murang'a
muranga_subcounties = df_subcounties[df_subcounties["County_Number"] == muranga_number]

# Keep only the subcounty names and their numbers
muranga_list = muranga_subcounties[["Subcounty_Number", "County/ Sub-County"]].sort_values("Subcounty_Number")

# Display the list
muranga_list.reset_index(drop=True)


Unnamed: 0,Subcounty_Number,County/ Sub-County
0,1.0,ABERDARE FOREST
1,2.0,GATANGA
2,3.0,KAHURO
3,4.0,KANDARA
4,5.0,KANGEMA
5,6.0,KIGUMO
6,7.0,MATHIOYA
7,8.0,MURANG'A EAST
8,9.0,MURANG'A SOUTH
