<a href="https://colab.research.google.com/github/Eugikats/student-retention-prediction/blob/main/Group_9_Final_Presentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem Recap and Target/Features

Student retention prediction is a binary classification problem. The goal is to predict whether a student has dropped out of school, using the `ug15` household, school and village survey files.


Target Variable (y): `dropout` – whether the student has dropped out, labeled Yes or No.


Input Features (X): Student attributes (age, gender, grade, disability, parental background), household characteristics (household head's education, gender and age, household size), asset ownership, water and sanitation access, and location (region, district, urban code). Each data point is one row of the merged survey data.

**Group Members**

Eugene Katusiime

Stephen Njuki

Joshua Kayongo

Akwi Tracy Aidah

Higenyi Yurri

Nabuyondo Hamirat Shibah

Alex Wadaba

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer

# Clustering / visualization
from scipy.cluster.hierarchy import linkage, dendrogram

# Imbalanced learning
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [None]:
# Connect to Google Drive: mounts it at /content/drive, the prefix that the
# data paths defined later in this notebook are built on.
# NOTE(review): this cell assumes a Google Colab runtime (`google.colab` is
# imported here rather than in the top import cell) — confirm before running
# the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Locations of the three raw survey CSV files on the mounted Drive.
# base_path ends with a slash, so file names are appended to it directly.
base_path = "/content/drive/MyDrive/Classroom/Refactory Data Science Program 2025 cohort 2/Group 9/Final Group Assignment/"
path_hh = base_path + "ug15_hhld.csv"
path_school = base_path + "ug15_school.csv"
path_village = base_path + "ug15_village.csv"

In [None]:
# Load a reproducible random sample of rows from each survey file.
sample_size = 20000


def _load_sample(csv_path, n_rows, seed=42):
    """Read ``csv_path`` and return at most ``n_rows`` randomly sampled rows.

    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    the frame contains (sampling without replacement), so the request is
    capped at the file's row count. A file with fewer than ``n_rows`` rows is
    therefore returned in full, in shuffled order, instead of failing.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    n_rows : int
        Maximum number of rows to sample.
    seed : int, default 42
        ``random_state`` passed to ``DataFrame.sample`` for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    full = pd.read_csv(csv_path, low_memory=False)
    return full.sample(n=min(n_rows, len(full)), random_state=seed)


# NOTE(review): each file is sampled independently, so the three samples are
# not guaranteed to describe the same households — confirm this is intended
# given that the frames are merged on household keys afterwards.
df_hh = _load_sample(path_hh, sample_size)
df_school = _load_sample(path_school, sample_size)
df_village = _load_sample(path_village, sample_size)

# Quick preview
print(df_hh.shape, df_school.shape, df_village.shape)
print(df_hh.head(), df_school.head(), df_village.head())

(20000, 156) (20000, 156) (20000, 156)
       id_database  id_district id_districtName    id_hh id_regionName  \
40381         UG15          226         Buyende  1478006       Eastern   
20123         UG15          201          Bugiri   739017       Eastern   
40280         UG15          226         Buyende  1493011       Eastern   
134936        UG15          231           Ngora  1626020       Eastern   
155826        UG15          215         Sironko  1164010       Eastern   

        id_village  county_code1          county  parishcode  subcounty_code  \
40381         1478             1         BUDIOPE           9               1   
20123          739             1  BUKOOLI  NORTH           9               8   
40280         1493             1         BUDIOPE           4               6   
134936        1626             1           NGORA          14               1   
155826        1164             1        BUDADIRI           2              19   

       subcounty_name parish_name  

In [None]:
# Merge the three sampled frames on the shared identifier columns.
merge_keys = [
    "id_database",
    "id_district",
    "id_districtName",
    "id_hh",
    "id_regionName",
    "id_village",
]

# Step 1: household x school (inner join). Non-key columns present in both
# frames are disambiguated with the "_hh" / "_sch" suffixes.
# NOTE(review): the recorded output shows 47048 merged rows from 20000-row
# samples, which suggests the keys are not unique per row and matching rows
# are being multiplied — confirm the join keys before modelling.
df_hh_school = pd.merge(df_hh, df_school, on=merge_keys, suffixes=("_hh", "_sch"))

# Step 2: add the village frame (inner join).
# NOTE(review): pandas only applies suffixes to column names that collide.
# If every non-key column was already suffixed in step 1, the village columns
# keep their bare names and "_vill" is never used — verify which source the
# unsuffixed columns selected downstream actually come from.
df_master = pd.merge(df_hh_school, df_village, on=merge_keys, suffixes=("", "_vill"))

print("Merged shape:", df_master.shape)
print(df_master.columns.tolist())

Merged shape: (47048, 456)
['id_database', 'id_district', 'id_districtName', 'id_hh', 'id_regionName', 'id_village', 'county_code1_hh', 'county_hh', 'parishcode_hh', 'subcounty_code_hh', 'subcounty_name_hh', 'parish_name_hh', 'locationcode_hh', 'location_hh', 'ea_code_hh', 'eacode_hh', 'village_estate_hh', 'urban_code_hh', 'no_of_hhs_hh', 'males_hh', 'females_hh', 'sample_no_hh', 'ea_code1_hh', 'validationcode_hh', 'date_hh', 'hhno_hh', 'answering_person_hh', 'hh_gender_hh', 'hh_age_hh', 'hh_edu_raw_hh', 'household_visited_hh', 'whichyear_hh', 'home_language_hh', 'hh_size_hh', 'hh_males_hh', 'hh_females_hh', 'house_wall_hh', 'house_lighting_hh', 'asset_toilet_hh', 'mealsPerDay_hh', 'eat_veg_hh', 'eat_fruit_hh', 'drink_milk_hh', 'asset_tv_hh', 'asset_radio_hh', 'asset_computer_hh', 'asset_phone_hh', 'asset_car_hh', 'asset_motorbike_hh', 'asset_bicycle_hh', 'asset_cattle_hh', 'asset_sheep_goat_hh', 'h107_donkey_hh', 'h107_camel_hh', 'h107_pig_hh', 'h107other_hh', 'water_source_hh', 'wate

In [None]:
# Feature and target selection.
# "dropout" is the target; every other name is a candidate input feature.
# The list order determines the column order of the filtered frame.
selected_cols = [
    "dropout",
    "age", "gender", "grade", "disability", "biological_parents",
    "mothers_edu", "mothers_toschool",
    "hh_edu", "hh_gender", "hh_age", "hh_size", "hh_males", "hh_females",
    "asset_tv", "asset_radio", "asset_computer", "asset_phone", "asset_car",
    "asset_motorbike", "asset_bicycle", "asset_cattle", "asset_sheep_goat",
    "asset_elec", "asset_water", "asset_toilet",
    "water_source", "water_source_distance", "treat_water",
    "id_regionName", "id_districtName", "urban_code",
]

# Requested columns that are absent from the merged frame are skipped
# silently rather than raising.
available_cols = [name for name in selected_cols if name in df_master.columns]

# Work on an independent copy, drop rows with a missing target, then keep
# only rows whose target is exactly "Yes" or "No".
df_filtered = (
    df_master[available_cols]
    .copy()
    .dropna(subset=["dropout"])
    .loc[lambda frame: frame["dropout"].isin(["Yes", "No"])]
)



In [None]:
# Split the labelled rows by target class.
df_yes = df_filtered.loc[df_filtered["dropout"] == "Yes"]
df_no = df_filtered.loc[df_filtered["dropout"] == "No"]

# Downsample both classes to the same size: the smaller class, capped at 1000
# rows per class (raise the cap to keep more data).
n_samples = min(1000, len(df_yes), len(df_no))
df_yes_balanced = df_yes.sample(n=n_samples, random_state=42)
df_no_balanced = df_no.sample(n=n_samples, random_state=42)

# Stack the two balanced halves, then shuffle so the classes are interleaved.
df_final = pd.concat((df_yes_balanced, df_no_balanced)).sample(frac=1, random_state=42)

# Sanity check: overall shape and per-class counts.
print("Final dataset shape:", df_final.shape)
print(df_final["dropout"].value_counts())

# Persist the balanced dataset next to the raw files (index is not written).
df_final.to_csv(base_path + "df_final.csv", index=False)

Final dataset shape: (1948, 32)
dropout
No     974
Yes    974
Name: count, dtype: int64


In [None]:
# Show the column names of the balanced dataset as a plain Python list.
print(list(df_final.columns))

['dropout', 'age', 'gender', 'grade', 'disability', 'biological_parents', 'mothers_edu', 'mothers_toschool', 'hh_edu', 'hh_gender', 'hh_age', 'hh_size', 'hh_males', 'hh_females', 'asset_tv', 'asset_radio', 'asset_computer', 'asset_phone', 'asset_car', 'asset_motorbike', 'asset_bicycle', 'asset_cattle', 'asset_sheep_goat', 'asset_elec', 'asset_water', 'asset_toilet', 'water_source', 'water_source_distance', 'treat_water', 'id_regionName', 'id_districtName', 'urban_code']
