In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import (
    GroupKFold,
    GroupShuffleSplit,
    GridSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error

import joblib

# Single seed shared by every component in this notebook that accepts `random_state`.
RND: int = 42


In [2]:
# Load the table that the models below are fitted on.
# NOTE(review): absolute, machine-specific Windows path -- the notebook cannot run
# on another machine without editing this line.
train_csv_path = "C:\\Users\\dipes\\Desktop\\election-analytics-nepal-2082\\data\\processed\\turnout_train_data.csv"
df_train = pd.read_csv(filepath_or_buffer=train_csv_path)

In [3]:
# Load the 2082 base table that the final model is later asked to predict on.
# NOTE(review): absolute, machine-specific Windows path -- the notebook cannot run
# on another machine without editing this line.
base_2082_csv_path = "C:\\Users\\dipes\\Desktop\\election-analytics-nepal-2082\\data\\processed\\turnout_prediction_base_2082.csv"
df_82 = pd.read_csv(filepath_or_buffer=base_2082_csv_path)

In [4]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,district_id,Absent rate within country 2078,Absent rate abroad 2078,Literacy Rate 2078,Primary Education % 2078,Lower Secondary % 2078,Upper Secondary % 2078,SLC or SEE % 2078,Intermediate & equivalent % 2078,Graduate & equivalent % 2078,...,age_30-39_ratio,age_40-49_ratio,age_50-59_ratio,age_60-69_ratio,age_70+_ratio,female_ratio,registered_voters_lag,valid_pr_votes_lag,registered_voters,valid_pr_votes
0,1,7.14,5.17,82.21,30.53,21.99,19.39,9.24,9.82,2.58,...,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,78397,47056,88285,46269
1,2,9.86,6.79,82.29,28.29,21.87,19.58,10.85,10.29,2.31,...,0.263347,0.17499,0.139309,0.12099,0.115286,0.471929,125219,68837,138932,71182
2,3,6.89,6.74,83.37,27.97,21.81,19.62,9.34,11.42,3.16,...,0.253653,0.198879,0.149377,0.120988,0.109776,0.484367,204046,126160,224877,123874
3,4,3.86,9.15,82.83,25.65,19.71,18.09,12.22,12.89,3.88,...,0.227545,0.220231,0.165927,0.113042,0.114078,0.50091,585166,385735,663311,416005
4,5,12.11,5.0,79.7,33.99,22.99,16.17,8.64,8.9,2.29,...,0.253139,0.188556,0.140592,0.116902,0.104288,0.482246,105046,63344,117554,62102


In [5]:
# Preview the first five rows of the 2082 prediction base.
df_82.head(n=5)

Unnamed: 0,district_id,Absent rate within country 2078,Absent rate abroad 2078,Literacy Rate 2078,Primary Education % 2078,Lower Secondary % 2078,Upper Secondary % 2078,SLC or SEE % 2078,Intermediate & equivalent % 2078,Graduate & equivalent % 2078,...,age_18-29_ratio,age_30-39_ratio,age_40-49_ratio,age_50-59_ratio,age_60-69_ratio,age_70+_ratio,female_ratio,registered_voters_lag,valid_pr_votes_lag,registered_voters
0,1,7.14,5.17,82.21,30.53,21.99,19.39,9.24,9.82,2.58,...,0.208919,0.26093,0.179692,0.141929,0.113764,0.094767,0.465442,88285,46269,90327
1,2,9.86,6.79,82.29,28.29,21.87,19.58,10.85,10.29,2.31,...,0.186077,0.263347,0.17499,0.139309,0.12099,0.115286,0.4695,138932,71182,143408
2,3,6.89,6.74,83.37,27.97,21.81,19.62,9.34,11.42,3.16,...,0.167327,0.253653,0.198879,0.149377,0.120988,0.109776,0.483838,224877,123874,231809
3,4,3.86,9.15,82.83,25.65,19.71,18.09,12.22,12.89,3.88,...,0.159176,0.227545,0.220231,0.165927,0.113042,0.114078,0.497778,663311,416005,713537
4,5,12.11,5.0,79.7,33.99,22.99,16.17,8.64,8.9,2.29,...,0.196523,0.253139,0.188556,0.140592,0.116902,0.104288,0.480298,117554,62102,119630


In [6]:
TARGET = "valid_pr_votes"   # column the regressors are trained to predict
GROUP = "district_id"       # key used for group-aware splitting; never a feature

# Pull out the grouping key and the label first, then keep everything else as features.
groups = df_train[GROUP]
y = df_train[TARGET]
X = df_train.drop(columns=[GROUP, TARGET])

In [7]:
# Single group-aware hold-out split: rows sharing a district_id always land on the
# same side, and test_size=0.2 asks for roughly a fifth of the groups in the test set.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RND)

# n_splits=1, so the split generator yields exactly one (train, test) index pair.
[(train_idx, test_idx)] = gss.split(X, y, groups=groups)

# Training side (features, label, and the group key needed later for GroupKFold).
X_train = X.iloc[train_idx]
y_train = y.iloc[train_idx]
groups_train = groups.iloc[train_idx]

# Hold-out side.
X_test = X.iloc[test_idx]
y_test = y.iloc[test_idx]

In [8]:
# Group-aware cross-validation splitter passed as `cv` to both grid searches below.
N_CV_FOLDS = 5
gkf = GroupKFold(n_splits=N_CV_FOLDS)

In [9]:
# Standardise the features, then fit a Ridge regressor.
# The step names ("scaler", "model") are referenced by the "model__*" grid keys.
ridge_steps = [
    ("scaler", StandardScaler()),
    ("model", Ridge(random_state=RND)),
]
ridge_pipe = Pipeline(steps=ridge_steps)

In [10]:
# Standardise the features, then fit an ElasticNet regressor.
# max_iter is raised to 10_000 for the coordinate-descent solver.
# The step names ("scaler", "model") are referenced by the "model__*" grid keys.
enet_regressor = ElasticNet(max_iter=10_000, random_state=RND)
enet_steps = [
    ("scaler", StandardScaler()),
    ("model", enet_regressor),
]
enet_pipe = Pipeline(steps=enet_steps)

In [11]:
# Hyperparameter grids; the "model__" prefix routes each key to the pipeline's "model" step.

# Ridge: 20 log-spaced penalties from 1e-3 to 1e3.
ridge_grid = dict(
    model__alpha=np.logspace(-3, 3, num=20),
)

# ElasticNet: 10 log-spaced penalties from 1e-3 to 1e2, crossed with five L1/L2 mixes.
enet_grid = dict(
    model__alpha=np.logspace(-3, 2, num=10),
    model__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)

In [12]:
# Exhaustive search over the Ridge grid, scored by (negated) mean absolute error.
# `groups` is forwarded to GroupKFold so each district stays inside a single fold.
ridge_gs = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_grid,
    scoring="neg_mean_absolute_error",
    cv=gkf,
    n_jobs=-1,
)

ridge_gs.fit(X_train, y_train, groups=groups_train)


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,{'model__alpha': array([1.0000...00000000e+03])}
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,GroupKFold(n_...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,np.float64(0.001)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [13]:
# Exhaustive search over the ElasticNet grid, scored by (negated) mean absolute error.
# `groups` is forwarded to GroupKFold so each district stays inside a single fold.
enet_gs = GridSearchCV(
    estimator=enet_pipe,
    param_grid=enet_grid,
    scoring="neg_mean_absolute_error",
    cv=gkf,
    n_jobs=-1,
)

enet_gs.fit(X_train, y_train, groups=groups_train)


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__alpha': array([1.0000...00000000e+02]), 'model__l1_ratio': [0.1, 0.3, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,GroupKFold(n_...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,np.float64(0.1668100537200059)
,l1_ratio,0.7
,fit_intercept,True
,precompute,False
,max_iter,10000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42


In [14]:
# Score each search's refitted best pipeline on the held-out districts.
ridge_pred, enet_pred = (
    search.best_estimator_.predict(X_test) for search in (ridge_gs, enet_gs)
)
ridge_mae, enet_mae = (
    mean_absolute_error(y_test, predicted) for predicted in (ridge_pred, enet_pred)
)

print(f"Ridge MAE (district holdout): {ridge_mae:.2f}")
print(f"ElasticNet MAE (district holdout): {enet_mae:.2f}")

Ridge MAE (district holdout): 4763.69
ElasticNet MAE (district holdout): 7861.64


In [15]:
# Keep whichever tuned pipeline has the lower hold-out MAE; ties go to Ridge.
# NOTE(review): the hold-out set is used here for model selection, so the MAE printed
# in the evaluation cell is no longer an untouched estimate for the chosen model --
# consider comparing the searches' cross-validated `best_score_` instead.
ridge_wins = ridge_mae <= enet_mae
final_model = ridge_gs.best_estimator_ if ridge_wins else enet_gs.best_estimator_
model_name = "ridge" if ridge_wins else "elasticnet"

print(f"Selected model: {model_name}")

Selected model: ridge


In [16]:
# Refit the selected configuration on every labelled row before it is saved and used.
# clone() returns an unfitted copy with the same hyperparameters, so the estimator
# still held by the grid search (`best_estimator_`) stays fitted on the training split
# only. Without the copy, re-running the hold-out evaluation cell after this one would
# score a model that has already seen the hold-out rows.
final_model = clone(final_model)
final_model.fit(X, y)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,np.float64(0.001)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [17]:
# Persist the fitted pipeline; the file name records which model family was selected.
artifact_path = f"artifacts/vote_turnout_model_{model_name}.joblib"

# joblib.dump does not create missing parent directories, so make sure the target
# folder exists (a no-op when it is already there).
os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
joblib.dump(final_model, artifact_path)

print(f"Model saved to: {artifact_path}")


Model saved to: artifacts/vote_turnout_model_ridge.joblib


In [18]:
# Predict valid PR votes for the 2082 base table.
# Select exactly the training feature columns, in training order, instead of
# "everything except the group key": the prediction columns this notebook appends to
# df_82 would otherwise be passed to the model when the cell is re-run, and
# scikit-learn rejects inputs whose feature names differ from those seen during fit.
feature_columns = list(X.columns)
df_82["predicted_valid_pr_votes"] = final_model.predict(df_82[feature_columns])

In [20]:
# Show up to the first 20 training rows.
df_train.head(n=20)

Unnamed: 0,district_id,Absent rate within country 2078,Absent rate abroad 2078,Literacy Rate 2078,Primary Education % 2078,Lower Secondary % 2078,Upper Secondary % 2078,SLC or SEE % 2078,Intermediate & equivalent % 2078,Graduate & equivalent % 2078,...,age_30-39_ratio,age_40-49_ratio,age_50-59_ratio,age_60-69_ratio,age_70+_ratio,female_ratio,registered_voters_lag,valid_pr_votes_lag,registered_voters,valid_pr_votes
0,1,7.14,5.17,82.21,30.53,21.99,19.39,9.24,9.82,2.58,...,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,78397,47056,88285,46269
1,2,9.86,6.79,82.29,28.29,21.87,19.58,10.85,10.29,2.31,...,0.263347,0.17499,0.139309,0.12099,0.115286,0.471929,125219,68837,138932,71182
2,3,6.89,6.74,83.37,27.97,21.81,19.62,9.34,11.42,3.16,...,0.253653,0.198879,0.149377,0.120988,0.109776,0.484367,204046,126160,224877,123874
3,4,3.86,9.15,82.83,25.65,19.71,18.09,12.22,12.89,3.88,...,0.227545,0.220231,0.165927,0.113042,0.114078,0.50091,585166,385735,663311,416005
4,5,12.11,5.0,79.7,33.99,22.99,16.17,8.64,8.9,2.29,...,0.253139,0.188556,0.140592,0.116902,0.104288,0.482246,105046,63344,117554,62102
5,6,10.16,6.51,81.94,27.0,20.59,20.14,12.24,10.57,2.96,...,0.255917,0.17902,0.145768,0.120924,0.094822,0.488496,66667,41255,71933,40657
6,7,15.12,5.77,78.87,32.51,22.23,18.71,9.79,7.73,1.87,...,0.244724,0.176216,0.154048,0.130798,0.133542,0.480252,113745,56893,123379,58743
7,8,12.58,6.37,81.36,28.56,21.65,18.84,10.61,10.25,2.74,...,0.246308,0.191556,0.157593,0.123243,0.106354,0.499568,105188,63112,116991,66320
8,9,3.79,6.61,78.61,26.96,19.12,16.83,12.43,12.81,4.3,...,0.231441,0.214057,0.163345,0.119722,0.115782,0.491068,644782,417601,735525,458853
9,10,3.08,6.7,78.1,27.98,18.84,16.83,12.22,12.58,3.72,...,0.241735,0.215905,0.167618,0.112844,0.104719,0.495896,474831,292993,544941,327498


In [21]:
# Predicted valid PR votes as a percentage of registered voters.
predicted_share = df_82["predicted_valid_pr_votes"].div(df_82["registered_voters"])
df_82["predicted_rate"] = predicted_share.mul(100)

In [24]:
# Total predicted valid PR votes across all rows of the 2082 table.
df_82.loc[:, "predicted_valid_pr_votes"].sum()

np.float64(10404605.122769851)