In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer


# Load the merged CSV; the path is relative to the notebook's working directory.
merged_csv_path = "../Datasets/Merged_Data/merged_data.csv"
library_df = pd.read_csv(merged_csv_path)

# Columns carried into the analysis frame, in this exact order:
# four demographic columns first, then each measure immediately followed by
# its companion F_* flag column.
demographic_features = [
    "UNEMPLOYMENT_RATE",
    "POPULATION",
    "MEDIAN_INCOME",
    "BACHELORS_PERCENT",
]

measure_flag_pairs = [
    ("TOTCIR", "F_TOTCIR"),
    ("VISITS", "F_VISITS"),
    ("REGBOR", "F_REGBOR"),
    ("GPTERMS", "F_GPTERM"),
    ("TOTSTAFF", "F_TOTSTF"),
    ("HRS_OPEN", "F_HRS_OP"),
    ("TOTPRO", "F_TOTPRO"),
    ("TOTATTEN", "F_TOTATT"),
    ("TOTINCM", "F_TOTINC"),
    ("TOTOPEXP", "F_TOTOPX"),
]

final_features = demographic_features + [
    column for pair in measure_flag_pairs for column in pair
]

# Work on an independent copy restricted to the selected columns, so later
# cleaning steps never write back into library_df.
analysis_df = library_df[final_features].copy()

In [None]:
# Step 1a: Replace the sentinel for missing income
analysis_df["MEDIAN_INCOME"] = analysis_df["MEDIAN_INCOME"].replace(-666666666.0, np.nan)

# Step 1b: Treat zeros in the other three features as missing
for col in ["POPULATION", "BACHELORS_PERCENT", "UNEMPLOYMENT_RATE"]:
    analysis_df[col] = analysis_df[col].replace(0.0, np.nan)

# Step 1c: Drop rows whose POPULATION is missing. Because 1b turned zeros into
# NaN, this removes both zero-population rows and rows that were already
# missing a population value.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in the following steps do not raise SettingWithCopyWarning.
analysis_df = analysis_df[analysis_df["POPULATION"].notna()].copy()

In [3]:
# Maps each measure column to the column holding its flag.
# Insertion order matters: later cells iterate this mapping to build column lists.
feature_to_flag = dict(
    TOTCIR="F_TOTCIR",
    VISITS="F_VISITS",
    HRS_OPEN="F_HRS_OP",
    GPTERMS="F_GPTERM",
    TOTATTEN="F_TOTATT",
    REGBOR="F_REGBOR",
    TOTPRO="F_TOTPRO",
    TOTSTAFF="F_TOTSTF",
    TOTINCM="F_TOTINC",
    TOTOPEXP="F_TOTOPX",
)

# Flag values whose associated measure is nulled out in Step 3.
drop_flags = ["U_22", "H_22", "L_22"]

In [None]:
# Step 2: Turn the numeric codes -1 and -3 into NaN in every measure column
missing_value_codes = [-1, -3]
for feat in feature_to_flag.keys():
    if feat not in analysis_df:
        continue  # measure column absent from the frame; nothing to clean
    cleaned = analysis_df[feat].replace(missing_value_codes, np.nan)
    analysis_df[feat] = cleaned

In [None]:
# Step 3: Blank out measure values whose flag marks them as suppressed/unusable
for feat, flag in feature_to_flag.items():
    if flag not in analysis_df.columns:
        continue  # no flag column for this measure, so nothing to check
    # Rows whose flag value is one of the entries in drop_flags
    mask = analysis_df[flag].isin(drop_flags)
    analysis_df.loc[mask, feat] = np.nan

In [None]:
# Step 4: Create binary "_imputed" indicator columns
# For every measure with a flag column, add <measure>_imputed, which is True
# where the flag string starts with "I" and False otherwise.
for feat, flag in feature_to_flag.items():
    if flag in analysis_df.columns:
        # na=False: a missing flag yields False instead of NaN, so the
        # indicator is a real boolean column rather than an object column
        # mixing True/False/NaN.
        analysis_df[f"{feat}_imputed"] = analysis_df[flag].str.startswith("I", na=False)

In [None]:
# Step 5: Drop the original F_* flag columns
# errors="ignore" skips flag columns that are not present, matching the
# "if flag in analysis_df.columns" guards used in Steps 3-4 and making this
# cell safe to re-run. Reassigning instead of inplace=True keeps the step a
# plain expression that returns a new frame.
analysis_df = analysis_df.drop(columns=list(feature_to_flag.values()), errors="ignore")

In [None]:
# Step 6a: Columns handed to the imputer: every measure column (in mapping
# order) followed by three demographic columns.
demographic_impute_cols = ["MEDIAN_INCOME", "BACHELORS_PERCENT", "UNEMPLOYMENT_RATE"]
numeric_cols = [*feature_to_flag, *demographic_impute_cols]

# Step 6b: Fit a 5-neighbour KNNImputer on those columns and write the filled
# values back into the same columns.
# NOTE(review): the columns go in on their raw scales (no standardisation) —
# confirm that is intended.
knn = KNNImputer(n_neighbors=5)
imputed_values = knn.fit_transform(analysis_df[numeric_cols])
analysis_df[numeric_cols] = imputed_values