In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import (
    GroupKFold,
    GroupShuffleSplit,
    GridSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error

import joblib

# Single seed reused as `random_state` for the holdout splitter and both
# linear models so that splits and fits are repeatable.
RND = 42



In [2]:
# Project root. Defaults to the original author's checkout so existing runs are
# unchanged; set the ELECTION_ANALYTICS_ROOT environment variable to run the
# notebook from another machine or directory.
PROJECT_ROOT = Path(
    os.environ.get(
        "ELECTION_ANALYTICS_ROOT",
        r"C:\Users\dipes\Desktop\election-analytics-nepal-2082",
    )
)
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

df_train = pd.read_csv(PROCESSED_DIR / "training_pr_base.csv")  # rows for 2079
df_pred = pd.read_csv(PROCESSED_DIR / "base_pr_2082.csv")  # 2082 skeleton

In [3]:
# Peek at the first five raw training rows (one row per district/party pair).
df_train.head(n=5)

Unnamed: 0,district_id,party_id,election_year,vote_share,vote_share_lag,vote_share_change,is_new_party,Absent rate within country 2078,Absent rate abroad 2078,Literacy Rate 2078,...,age_30-39_ratio,age_40-49_ratio,age_50-59_ratio,age_60-69_ratio,age_70+_ratio,female_ratio,valid_turnout,valid_turnout_lag,facebook_log,leader_following_log
0,1,P001,2079,42.369621,44.075145,-1.705523,0,7.14,5.17,82.21,...,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,52.41,60.02,5.860786,6.47851
1,1,P002,2079,34.007651,31.449762,2.557889,0,7.14,5.17,82.21,...,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,52.41,60.02,5.70711,6.43294
2,1,P003,2079,11.156498,13.475434,-2.318936,0,7.14,5.17,82.21,...,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,52.41,60.02,5.081404,6.552508
3,1,P004,2079,2.113726,0.0,2.113726,1,7.14,5.17,82.21,...,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,52.41,60.02,5.525453,7.09091
4,1,P005,2079,1.376732,1.232574,0.144158,0,7.14,5.17,82.21,...,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,52.41,60.02,3.713572,6.746412


In [4]:
# Remove the party-level / vote-share columns so that only district-level
# columns remain for the turnout model.
#
# Reassigning (instead of `inplace=True`) together with `errors="ignore"` makes
# this cell safe to re-run: on a second execution the columns are already gone
# and the call is a no-op rather than a KeyError.
df_train = df_train.drop(
    columns=[
        "party_id",
        "election_year",
        "vote_share",
        "vote_share_lag",
        "vote_share_change",
        "is_new_party",
        "IS_ALTERNATIVE_FORCE?",
        "GenZ_and_Youth_Favored?",
        "Was_Part_Of_Ousted_Government?",
        "IS_Major?",
        "Facebook_Presence (In Thousands)",
        "Top_leader_fb (in Thousands)",
        "facebook_log",
        "leader_following_log",
    ],
    errors="ignore",
)

In [6]:
# Remove the party-level columns from the 2082 prediction skeleton so that only
# district-level columns remain.
#
# Reassigning (instead of `inplace=True`) together with `errors="ignore"` makes
# this cell safe to re-run: on a second execution the columns are already gone
# and the call is a no-op rather than a KeyError.
df_pred = df_pred.drop(
    columns=[
        "party_id",
        "election_year",
        "vote_share_lag",
        "is_new_party",
        "IS_ALTERNATIVE_FORCE?",
        "GenZ_and_Youth_Favored?",
        "Was_Part_Of_Ousted_Government?",
        "IS_Major?",
        "Facebook_Presence (In Thousands)",
        "Top_leader_fb (in Thousands)",
        "facebook_log",
        "leader_following_log",
    ],
    errors="ignore",
)

In [6]:
# With the party-level columns gone, rows that share a district are exact
# duplicates; collapse them and renumber the index from zero in both frames.
df_train, df_pred = (
    frame.drop_duplicates().reset_index(drop=True)
    for frame in (df_train, df_pred)
)

In [7]:
# Sanity check: first five rows after column removal and de-duplication.
df_train.head(n=5)

Unnamed: 0,district_id,Absent rate within country 2078,Absent rate abroad 2078,Literacy Rate 2078,Primary Education % 2078,Lower Secondary % 2078,Upper Secondary % 2078,SLC or SEE % 2078,Intermediate & equivalent % 2078,Graduate & equivalent % 2078,Post graduate equivalent & above % 2078,age_18-29_ratio,age_30-39_ratio,age_40-49_ratio,age_50-59_ratio,age_60-69_ratio,age_70+_ratio,female_ratio,valid_turnout,valid_turnout_lag
0,1,7.14,5.17,82.21,30.53,21.99,19.39,9.24,9.82,2.58,1.07,0.208919,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,52.41,60.02
1,2,9.86,6.79,82.29,28.29,21.87,19.58,10.85,10.29,2.31,0.98,0.186077,0.263347,0.17499,0.139309,0.12099,0.115286,0.471929,51.24,54.97
2,3,6.89,6.74,83.37,27.97,21.81,19.62,9.34,11.42,3.16,1.3,0.167327,0.253653,0.198879,0.149377,0.120988,0.109776,0.484367,55.09,61.83
3,4,3.86,9.15,82.83,25.65,19.71,18.09,12.22,12.89,3.88,1.54,0.159176,0.227545,0.220231,0.165927,0.113042,0.114078,0.50091,62.72,65.92
4,5,12.11,5.0,79.7,33.99,22.99,16.17,8.64,8.9,2.29,1.11,0.196523,0.253139,0.188556,0.140592,0.116902,0.104288,0.482246,52.83,60.3


In [8]:
# Column roles: what we predict, and the key used to keep districts together
# when splitting.
GROUP = "district_id"
TARGET = "valid_turnout"

# Grouping key and response are pulled out first; everything else is a feature.
groups = df_train[GROUP]
y = df_train[TARGET]
X = df_train.drop(columns=[TARGET, GROUP])

In [9]:
# One grouped 80/20 shuffle split: all rows of a district land on the same
# side, and the seed keeps the partition repeatable.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RND)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# Training side (features, target, and the group labels needed for grouped CV).
X_train = X.iloc[train_idx]
y_train = y.iloc[train_idx]
groups_train = groups.iloc[train_idx]

# Held-out side, used only for the final comparison.
X_test = X.iloc[test_idx]
y_test = y.iloc[test_idx]



In [10]:
# 5-fold grouped cross-validation splitter; it is passed as `cv` to both grid
# searches, which receive the district ids through `groups=` at fit time.
gkf = GroupKFold(n_splits=5)

In [11]:
# Ridge regression on standardized features. Scaling lives inside the pipeline
# so it is re-fitted on each training fold during cross-validation.
ridge_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", Ridge(random_state=RND)),
    ]
)


In [12]:
# Elastic-net regression on standardized features. The iteration cap is raised
# to 10,000 for the coordinate-descent solver.
enet_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", ElasticNet(max_iter=10_000, random_state=RND)),
    ]
)

In [13]:
# Hyper-parameter grids; the `model__` prefix routes each value to the "model"
# step of the corresponding pipeline.

# Ridge: 20 log-spaced penalties from 1e-3 to 1e3.
ridge_grid = {"model__alpha": np.logspace(-3, 3, num=20)}

# Elastic net: 10 log-spaced penalties from 1e-3 to 1e2, crossed with five
# L1/L2 mixing ratios.
enet_grid = {
    "model__alpha": np.logspace(-3, 2, num=10),
    "model__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
}


In [14]:
# Tune the ridge penalty with grouped 5-fold CV on the training districts,
# scoring by (negated) mean absolute error and using all available cores.
ridge_gs = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_grid,
    scoring="neg_mean_absolute_error",
    cv=gkf,
    n_jobs=-1,
)

# `groups` is forwarded to the GroupKFold splitter.
ridge_gs.fit(X_train, y_train, groups=groups_train)


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,{'model__alpha': array([1.0000...00000000e+03])}
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,GroupKFold(n_...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,np.float64(2.976351441631316)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [15]:
# Tune the elastic-net penalty and mixing ratio with grouped 5-fold CV on the
# training districts, scoring by (negated) mean absolute error.
enet_gs = GridSearchCV(
    estimator=enet_pipe,
    param_grid=enet_grid,
    scoring="neg_mean_absolute_error",
    cv=gkf,
    n_jobs=-1,
)

# `groups` is forwarded to the GroupKFold splitter.
enet_gs.fit(X_train, y_train, groups=groups_train)


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__alpha': array([1.0000...00000000e+02]), 'model__l1_ratio': [0.1, 0.3, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,GroupKFold(n_...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,np.float64(0.1668100537200059)
,l1_ratio,0.9
,fit_intercept,True
,precompute,False
,max_iter,10000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42


In [16]:
# Score each tuned pipeline on the held-out districts.
ridge_pred, enet_pred = (
    search.best_estimator_.predict(X_test) for search in (ridge_gs, enet_gs)
)
ridge_mae, enet_mae = (
    mean_absolute_error(y_test, predicted) for predicted in (ridge_pred, enet_pred)
)

print(f"Ridge MAE (district holdout): {ridge_mae:.2f}")
print(f"ElasticNet MAE (district holdout): {enet_mae:.2f}")


Ridge MAE (district holdout): 2.70
ElasticNet MAE (district holdout): 2.49


In [17]:
# Keep whichever tuned pipeline has the lower holdout MAE; ties go to ridge.
# NOTE(review): the holdout set is used to choose the model here, so the
# holdout MAE of the chosen model is an optimistic estimate of its error.
ridge_wins = ridge_mae <= enet_mae
model_name = "ridge" if ridge_wins else "elasticnet"
final_model = (ridge_gs if ridge_wins else enet_gs).best_estimator_

print(f"Selected model: {model_name}")


Selected model: elasticnet


In [18]:
# Refit the selected pipeline (with its tuned hyper-parameters) on every
# labelled district before it is used for forecasting.
#
# The fit is done on a clone rather than on `best_estimator_` itself: fitting in
# place would overwrite the estimator held by the grid-search object, so
# re-running the holdout evaluation cell afterwards would score a model that
# had already seen the holdout districts.
final_model = clone(final_model).fit(X, y)
final_model


0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,np.float64(0.1668100537200059)
,l1_ratio,0.9
,fit_intercept,True
,precompute,False
,max_iter,10000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42


In [21]:
# Persist the refitted pipeline; the file name records which model family won.
artifact_path = f"artifacts/turnout_model_{model_name}.joblib"

# joblib.dump does not create missing directories, so make sure the target
# folder exists (a no-op when it is already there).
Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, artifact_path)

print(f"Model saved to: {artifact_path}")


Model saved to: artifacts/turnout_model_elasticnet.joblib


In [22]:
# First five rows of the de-duplicated 2082 prediction frame.
df_pred.head(n=5)

Unnamed: 0,district_id,Absent rate within country 2078,Absent rate abroad 2078,Literacy Rate 2078,Primary Education % 2078,Lower Secondary % 2078,Upper Secondary % 2078,SLC or SEE % 2078,Intermediate & equivalent % 2078,Graduate & equivalent % 2078,Post graduate equivalent & above % 2078,female_ratio,age_18-29_ratio,age_30-39_ratio,age_40-49_ratio,age_50-59_ratio,age_60-69_ratio,age_70+_ratio,total_voters,valid_turnout_lag
0,1,7.14,5.17,82.21,30.53,21.99,19.39,9.24,9.82,2.58,1.07,0.465442,0.208919,0.26093,0.179692,0.141929,0.113764,0.094767,90327,52.41
1,2,9.86,6.79,82.29,28.29,21.87,19.58,10.85,10.29,2.31,0.98,0.4695,0.186077,0.263347,0.17499,0.139309,0.12099,0.115286,143408,51.24
2,3,6.89,6.74,83.37,27.97,21.81,19.62,9.34,11.42,3.16,1.3,0.483838,0.167327,0.253653,0.198879,0.149377,0.120988,0.109776,231809,55.09
3,4,3.86,9.15,82.83,25.65,19.71,18.09,12.22,12.89,3.88,1.54,0.497778,0.159176,0.227545,0.220231,0.165927,0.113042,0.114078,713537,62.72
4,5,12.11,5.0,79.7,33.99,22.99,16.17,8.64,8.9,2.29,1.11,0.480298,0.196523,0.253139,0.188556,0.140592,0.116902,0.104288,119630,52.83


In [26]:
# First five rows of the training feature matrix, to compare its columns with
# the prediction frame.
X.head(n=5)

Unnamed: 0,Absent rate within country 2078,Absent rate abroad 2078,Literacy Rate 2078,Primary Education % 2078,Lower Secondary % 2078,Upper Secondary % 2078,SLC or SEE % 2078,Intermediate & equivalent % 2078,Graduate & equivalent % 2078,Post graduate equivalent & above % 2078,age_18-29_ratio,age_30-39_ratio,age_40-49_ratio,age_50-59_ratio,age_60-69_ratio,age_70+_ratio,female_ratio,valid_turnout_lag
0,7.14,5.17,82.21,30.53,21.99,19.39,9.24,9.82,2.58,1.07,0.208919,0.26093,0.179692,0.141929,0.113764,0.094767,0.466727,60.02
1,9.86,6.79,82.29,28.29,21.87,19.58,10.85,10.29,2.31,0.98,0.186077,0.263347,0.17499,0.139309,0.12099,0.115286,0.471929,54.97
2,6.89,6.74,83.37,27.97,21.81,19.62,9.34,11.42,3.16,1.3,0.167327,0.253653,0.198879,0.149377,0.120988,0.109776,0.484367,61.83
3,3.86,9.15,82.83,25.65,19.71,18.09,12.22,12.89,3.88,1.54,0.159176,0.227545,0.220231,0.165927,0.113042,0.114078,0.50091,65.92
4,12.11,5.0,79.7,33.99,22.99,16.17,8.64,8.9,2.29,1.11,0.196523,0.253139,0.188556,0.140592,0.116902,0.104288,0.482246,60.3


In [27]:
# Build the prediction matrix from exactly the columns the model was trained
# on, in the same order. Taking them from `X` instead of a hand-typed list
# means the two cannot drift apart, and a feature missing from `df_pred`
# raises a KeyError here rather than surfacing later in `predict`.
x_pred = df_pred[list(X.columns)]

In [31]:
# Forecast turnout for every row of the prediction frame and store it next to
# the district attributes.
turnout_forecast = final_model.predict(x_pred)
df_pred['predicted_turnout'] = turnout_forecast

In [33]:
# Write the predictions under <project root>/data/results. The root defaults to
# the original author's checkout so existing runs are unchanged; set the
# ELECTION_ANALYTICS_ROOT environment variable to write elsewhere.
results_path = (
    Path(
        os.environ.get(
            "ELECTION_ANALYTICS_ROOT",
            r"C:\Users\dipes\Desktop\election-analytics-nepal-2082",
        )
    )
    / "data"
    / "results"
    / "turnout_predictions_using_rate_2082.csv"
)

# to_csv does not create missing directories; make sure the folder exists.
results_path.parent.mkdir(parents=True, exist_ok=True)
df_pred.to_csv(results_path, index=False)