
# Uganda Admissions Data Cleaning & Clustering

This notebook loads, cleans, and enriches the admissions dataset with **district clusters** and **regional classifications** for further analysis and visualization.


In [58]:

import pandas as pd

# Load dataset
df = pd.read_excel("cleaned_district_codes.xlsx")
df.head()


Unnamed: 0,no,formid,index_no,name,ge,uace_,dcode,dname,course_code,course_name,tot_wt,program_name
0,1,25APF900350094290048,U2789/905,NIMUSIIMA SANDRAH,F,2024,55.0,WAKISO,ACC,BSC. Accounting (EVE),42.7,BSC. Accounting (EVE)
1,2,25APF561140686250786,U0025/529,BULUKUKU DANIEL EPHRAIM,M,2014,93.0,KIBUKU,ACC,BSC. Accounting (EVE),42.3,BSC. Accounting (EVE)
2,3,25APF530278631275455,U2977/518,MIREMBE ELIZABETH,F,2024,16.0,KAMPALA,ACC,BSC. Accounting (EVE),41.1,BSC. Accounting (EVE)
3,4,25APF1346250173272930,U1664/712,NAMATA MARTHA,F,2024,33.0,MASAKA,ACC,BSC. Accounting (EVE),40.7,BSC. Accounting (EVE)
4,5,25APF1553620758303031,U0004/754,NAKALEMA JOYCE,F,2024,33.0,MASAKA,ACC,BSC. Accounting (EVE),40.3,BSC. Accounting (EVE)


In [59]:
print(df.columns.tolist())

['no', 'formid', 'index_no', 'name', 'ge', 'uace_', 'dcode', 'dname', 'course_code', 'course_name', 'tot_wt', 'program_name']


In [60]:

# Strip column names and whitespace in string fields
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df['dname'] = df['dname'].str.strip().str.upper()
df['name'] = df['name'].str.strip().str.title()
df['ge'] = df['ge'].str.upper()
df.head()


Unnamed: 0,no,formid,index_no,name,ge,uace_,dcode,dname,course_code,course_name,tot_wt,program_name
0,1,25APF900350094290048,U2789/905,Nimusiima Sandrah,F,2024,55.0,WAKISO,ACC,BSC. Accounting (EVE),42.7,BSC. Accounting (EVE)
1,2,25APF561140686250786,U0025/529,Bulukuku Daniel Ephraim,M,2014,93.0,KIBUKU,ACC,BSC. Accounting (EVE),42.3,BSC. Accounting (EVE)
2,3,25APF530278631275455,U2977/518,Mirembe Elizabeth,F,2024,16.0,KAMPALA,ACC,BSC. Accounting (EVE),41.1,BSC. Accounting (EVE)
3,4,25APF1346250173272930,U1664/712,Namata Martha,F,2024,33.0,MASAKA,ACC,BSC. Accounting (EVE),40.7,BSC. Accounting (EVE)
4,5,25APF1553620758303031,U0004/754,Nakalema Joyce,F,2024,33.0,MASAKA,ACC,BSC. Accounting (EVE),40.3,BSC. Accounting (EVE)


In [61]:
cluster_map = {
    "URBAN-CITY": [
        "KAMPALA", "MBARARA", "JINJA", "GULU", "ARUA", 
        "LIRA", "MASAKA", "MBALE", "WAKISO", "HOIMA"
    ],
    "SEMI-URBAN": [
        "BUSHENYI", "IBANDA", "IGANGA", "KASESE", "KIBOGA",
        "KYOTERA", "KIRYANDONGO", "KITGUM", "KAMULI",
        "KAMWENGE", "KAYUNGA", "MASINDI", "MPIGI", "MUKONO",
        "MITYANA", "NEBBI", "NTUNGAMO", "PALLISA", "RUKUNGIRI",
        "SOROTI", "TORORO", "BUSIA", "KASSANDA", "KABAROLE"
    ],
    "RURAL-AGRICULTURAL": [
        "ADJUMANI", "AGAGO", "ALEBTONG", "AMOLATAR", "AMURIA",
        "AMURU", "APAC", "BUDAKA", "BUDUDA", "BUGIRI", "BUGWERI",
        "BUHWEJU", "BUIKWE", "BUKEDEA", "BUKOMANSIMBI", "BULAMBULI",
        "BULIISA", "BUNDIBUGYO", "BUNYANGABU", "BUTALEJA",
        "BUTAMBALA", "BUTEMBO", "BUYENDE", "DOKOLO", "GOMBA",
        "ISINGIRO", "KABERAMAIDO", "KALIRO", "KALUNGU",
        "KANUNGU", "KATAKWI", "KAZO", "KIBAALE", "KIBUKU",
        "KIKUUBE", "KIRUHURA", "KISORO", "KOBOKO", "KOLE",
        "KUMI", "KYANKWANZI", "KYEGEGWA", "KYENJOJO", "LAMWO",
        "LUUKA", "LUWEERO", "LWENGO", "MANAFWA", "MARACHA",
        "MAYUGE", "MITOOMA", "MOROTO", "MOYO", "MUBENDE",
        "NAKASEKE", "NAKASONGOLA", "NAMAYINGO", "NAMISINDWA",
        "NAMUTUMBA", "NGORA", "OBONGI", "OMORO", "OTUKE",
        "OYAM", "PAKWACH", "RAKAI", "RUBANDA", "RUBIRIZI",
        "RUKIGA", "SEMBABULE", "SERERE", "SHEEMA", "SIRONKO",
        "YUMBE", "ZOMBO", 
        "BUTEBO", "KABALE", "KAGADI", "KAKUMILO", "KAKUMIRO", 
        "KAPCHORWA", "KWANIA", "LYANTONDE", "MADI OKOLLO", 
        "NWOYA", "PADER"
    ],
    "REMOTE-UNDERDEVELOPED": [
        "ABIM", "AMUDAT", "BUVUMA", "BUKWO", "KAABONG", "KAPELABYONG",
        "KALANGALA", "KOTIDO", "KWEEN", "MADIOKOLLO", "NAPAK",
        "NAKAPIRIPIRIT", "NABILATUK", "NTOROKO", "TEREGO", 
        "KARENGA", "NON UGANDAN"
    ]
}


region_map = {
    "CENTRAL": [
        "BUIKWE", "BUKOMANSIMBI", "BUTAMBALA", "BUVUMA", "GOMBA", 
        "KALANGALA", "KALUNGU", "KAMPALA", "KAYUNGA", "KIBOGA", 
        "KYANKWANZI", "LUWEERO", "LWENGO", "LYANTONDE", "MASAKA", 
        "MITYANA", "MPIGI", "MUBENDE", "MUKONO", "NAKASEKE", 
        "NAKASONGOLA", "RAKAI", "SEMBABULE", "WAKISO", "KYOTERA", 
        "KASSANDA"
    ],
    "EASTERN": [
        "AMURIA", "BUDAKA", "BUDUDA", "BUGIRI", "BUKEDEA", 
        "BULAMBULI", "BUSIA", "BUTALEJA", "BUTEBO", "BUYENDE", 
        "IGANGA", "JINJA", "KALIRO", "KAMULI", "KAPCHORWA", 
        "KATAKWI", "KUMI", "KABERAMAIDO", "MANAFWA", "MAYUGE", 
        "MBALE", "NAMAYINGO", "NAMUTUMBA", "NGORA", "PALLISA", 
        "SERERE", "SIRONKO", "SOROTI", "TORORO", "NAMISINDWA", 
        "BUKWO", "KIBUKU", "LUUKA"
    ],
    "NORTHERN": [
        "ABIM", "ADJUMANI", "AGAGO", "ALEBTONG", "AMOLATAR", 
        "AMUDAT", "AMURU", "APAC", "ARUA", "DOKOLO", "GULU", 
        "KAABONG", "KITGUM", "KOBOKO", "KOLE", "KOTIDO", 
        "KWANIA", "KWEEN", "LIRA", "MADI OKOLLO", "MARACHA", 
        "MOROTO", "MOYO", "NABILATUK", "NAKAPIRIPIRIT", "NAPAK", 
        "NEBBI", "NWOYA", "NTOROKO", "OBONGI", "OMORO", 
        "OTUKE", "OYAM", "PADER", "PAKWACH", "YUMBE", "ZOMBO"
    ],
    "WESTERN": [
        "BUHWEJU", "BULIISA", "BUNDIBUGYO", "BUNYANGABU", "BUSHENYI", 
        "HOIMA", "IBANDA", "ISINGIRO", "KABALE", "KABAROLE", 
        "KAGADI", "KAKUMIRO", "KAKUMILO", "KAMWENGE", "KANUNGU", 
        "KASESE", "KIBAALE", "KIRUHURA", "KIRYANDONGO", "KISORO", 
        "KYEGEGWA", "KYENJOJO", "LAMWO", "MASINDI", "MBARARA", 
        "MITOOMA", "NTUNGAMO", "RUBANDA", "RUBIRIZI", "RUKIGA", 
        "RUKUNGIRI", "SHEEMA", 'KIKUUBE'
    ]
}


def classify_cluster(district):
    for cluster, districts in cluster_map.items():
        if district in districts:
            return cluster
    return "OTHER"

def classify_region(district):
    for region, districts in region_map.items():
        if district in districts:
            return region
    return "NON UGANDAN"

# Apply mappings
df['cluster'] = df['dname'].apply(classify_cluster)
df['region'] = df['dname'].apply(classify_region)

df[['dname', 'cluster', 'region']].drop_duplicates().head(10)


Unnamed: 0,dname,cluster,region
0,WAKISO,URBAN-CITY,CENTRAL
1,KIBUKU,RURAL-AGRICULTURAL,EASTERN
2,KAMPALA,URBAN-CITY,CENTRAL
3,MASAKA,URBAN-CITY,CENTRAL
5,NTUNGAMO,SEMI-URBAN,WESTERN
6,ARUA,URBAN-CITY,NORTHERN
7,IGANGA,SEMI-URBAN,EASTERN
8,LUWEERO,RURAL-AGRICULTURAL,CENTRAL
9,KASESE,SEMI-URBAN,WESTERN
10,LWENGO,RURAL-AGRICULTURAL,CENTRAL


In [63]:
# Check if any 'NON UGANDAN' or 'OTHER' values remain
df[df['region'].str.upper() == 'NON UGANDAN']
df[df['cluster'].str.upper() == 'OTHER']

Unnamed: 0,no,formid,index_no,name,ge,uace_,dcode,dname,course_code,course_name,tot_wt,program_name,cluster,region
10379,42,25APF1096305957317713,ZZVX90YU,Namuyanja Masituula,F,2024,32.0,LUWERO,BSA,BSC. Accounting,39.2,BSC. Accounting(Day),OTHER,NON UGANDAN


In [62]:
df.to_excel("cleaned_clustered_admissions(10).xlsx", index=False)
print("Files saved successfully!")


Files saved successfully!
