In [4]:
import pandas as pd
import numpy as np
import geopandas as gpd
from itertools import combinations
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statistics as stats
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import sklearn
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import pickle
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
warnings.filterwarnings('ignore')
import sys
from sklearn.metrics import r2_score
sys.path.insert(0, '../src/')


In [5]:
# Colab-only step: pick the engineered dataset (engineered_data.csv) in the
# browser dialog so the next cell can read it by its bare filename.
import google.colab.files as files

uploaded = files.upload()  # opens Colab's file-picker dialog

Saving engineered_data.csv to engineered_data.csv


In [12]:
# === 1️⃣ Load the dataset used for modeling ===
CSV_PATH = "engineered_data.csv"

# Read the file and normalise the identifier columns in one pass:
#   FIPS  -> string, left-padded with zeros to 5 characters
#   STATE -> first two characters of the padded FIPS
#   Year  -> int
df = pd.read_csv(CSV_PATH).assign(
    FIPS=lambda d: d["FIPS"].astype(str).str.zfill(5),
    STATE=lambda d: d["FIPS"].str[:2],
    Year=lambda d: d["Year"].astype(int),
)

# Keep only the rows where the target ("FI Rate") is present, ordered by
# county and then year so that a per-county shift walks backwards in time.
has_label = df["FI Rate"].notna()
dfm = df.loc[has_label].copy().sort_values(["FIPS", "Year"])

# Lag feature: the FI Rate of the preceding labeled row within the same FIPS.
# NOTE(review): this equals "previous year" only when a county has no missing
# years among its labeled rows — confirm that holds for this dataset.
dfm["FI_prev"] = dfm.groupby("FIPS")["FI Rate"].shift(1)

# Time windows: train on [TRAIN_START, TEST_YEAR), evaluate on TEST_YEAR.
TRAIN_START, TEST_YEAR, PREDICT_YEAR = 2010, 2023, 2026
years = dfm["Year"]
train = dfm.loc[years.ge(TRAIN_START) & years.lt(TEST_YEAR)].copy()
test = dfm.loc[years.eq(TEST_YEAR)].copy()


In [7]:
# Colab-only step: pick the pickled model (best_linear_model.pickle) in the
# browser dialog so the next cell can open it by its bare filename.
import google.colab.files as files

uploaded = files.upload()  # opens Colab's file-picker dialog

Saving best_linear_model.pickle to best_linear_model.pickle


In [13]:
# === 2️⃣ Load the best OLS pipeline ===
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
with open("best_linear_model.pickle", "rb") as f:
    m1 = pickle.load(f)

# === 3️⃣ Prepare features for prediction ===
# Identifier and target columns are never used as features.
IGNORE = {"FIPS", "STATE", "Year", "FI Rate"}

# Partition the remaining columns by dtype, preserving their order in dfm:
# object-dtype columns go to cat_all, everything else to num_all.
num_all, cat_all = [], []
for col in dfm.columns:
    if col in IGNORE:
        continue
    bucket = cat_all if dfm[col].dtype == "O" else num_all
    bucket.append(col)

# Compute sparse features dropped in training: any candidate whose share of
# missing values in the training window exceeds DROP_SPARSE.
DROP_SPARSE = 0.60
miss = train[num_all + cat_all].isna().mean()
to_drop_sparse = [col for col, frac in miss.items() if frac > DROP_SPARSE]

numeric_base = [col for col in num_all if col not in to_drop_sparse]
categorical_base = [col for col in cat_all if col not in to_drop_sparse]

# The lag feature is always kept, even if the sparsity filter removed it;
# in that case it is put back at the front of the numeric list.
if "FI_prev" in num_all and "FI_prev" not in numeric_base:
    numeric_base.insert(0, "FI_prev")

X_cols_base = [*numeric_base, *categorical_base]

# Feature matrices and targets for the two splits
X_train_full, X_test_full = train[X_cols_base].copy(), test[X_cols_base].copy()
y_train, y_test = train["FI Rate"].values, test["FI Rate"].values

In [14]:
# === 4️⃣ Make predictions ===
# Score the training window first, then the held-out test year, with the
# loaded pipeline.
y_train_pred, y_test_pred = (
    m1.predict(features) for features in (X_train_full, X_test_full)
)

# === 5️⃣ Evaluate model ===
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between observed and predicted values.

    Both inputs are converted to flat float arrays before subtracting, so the
    comparison is always element-by-element in positional order. This makes
    plain lists work, avoids index-label alignment when pandas Series are
    passed, and prevents an (n,) target from being broadcast against an
    (n, 1) prediction into an (n, n) matrix.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must hold the same number of elements as ``y_true``.

    Returns
    -------
    float
        Square root of the mean squared difference.

    Raises
    ------
    ValueError
        If the two inputs contain a different number of elements.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements "
            f"(got {actual.size} and {predicted.size})"
        )
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

# Report R2 and RMSE for the training window and the held-out test year.
for split_name, actual, predicted in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} R2:", r2_score(actual, predicted))
    print(f"{split_name} RMSE:", rmse(actual, predicted))

# === 6️⃣ Create prediction dataframes ===
# The index is reset so each row lines up positionally with its prediction,
# which is appended as a new "Predicted FI Rate" column.
df_train_preds = pd.concat([train.reset_index(drop=True),
                            pd.Series(y_train_pred, name="Predicted FI Rate")], axis=1)
df_test_preds  = pd.concat([test.reset_index(drop=True),
                            pd.Series(y_test_pred, name="Predicted FI Rate")], axis=1)

# Example: show FI Rate vs Predicted FI Rate
# A fixed random_state makes the preview identical on every run, and the
# sample size is capped so a test split with fewer than 10 rows does not raise.
preview = df_test_preds[['FI Rate', 'Predicted FI Rate']]
print(preview.sample(n=min(10, len(preview)), random_state=42))




Train R2: 0.9886119403754191
Train RMSE: 0.004421655524221281
Test R2: 0.9949121193221729
Test RMSE: 0.002500297568029097
      FI Rate  Predicted FI Rate
1421    0.213           0.209136
2722    0.162           0.160665
3072    0.119           0.120129
176     0.243           0.235967
199     0.216           0.211453
1066    0.200           0.196746
2906    0.150           0.147499
3003    0.227           0.221312
1306    0.114           0.114835
55      0.182           0.179007


In [11]:
import plotly.express as px

def choropleth(df, value_col,  color_scale, title, year=2023):
    """Draw a US county-level choropleth of ``value_col`` for a single year.

    The input frame is left untouched: filtering and the FIPS/Year
    normalisation are applied to a copy, so calling this function has no
    side effect on the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'FIPS', 'Year', 'FI Rate' and ``value_col`` columns.
    value_col : str
        Column that drives the fill colour.
    color_scale : str
        Value passed to plotly's ``color_continuous_scale``.
    title : str
        Figure title.
    year : int, default 2023
        Only rows whose 'Year' equals this value are plotted.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Select the requested year without writing back into the caller's frame.
    year_values = df['Year'].astype(int)
    df_plot = df.loc[year_values == year].copy()

    # County ids must be zero-padded 5-character strings to be used as locations.
    df_plot['FIPS'] = df_plot['FIPS'].astype(str).str.zfill(5)
    df_plot['Year'] = df_plot['Year'].astype(int)

    # Avoid listing the same column twice when value_col is 'FI Rate' itself.
    hover_cols = list(dict.fromkeys(['FI Rate', value_col]))

    fig = px.choropleth(df_plot,
                        geojson='https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json',
                        locations='FIPS',
                        color=value_col,
                        color_continuous_scale=color_scale,
                        scope="usa",
                        hover_data=hover_cols)
    fig.update_layout(title_text=title)
    fig.show()

# Map the model's predictions for the held-out test rows.
# NOTE(review): 'twilight' is one of plotly's cyclical colour scales; a
# sequential scale may read better for a rate — confirm this is intended.
choropleth(
    df=df_test_preds,
    value_col='Predicted FI Rate',
    color_scale='twilight',
    title='Model Predicted Food Insecurity Rates',
)