In [68]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor

In [69]:
# Read the historical training table and the row(s) to forecast.
# Both files are plain comma-separated CSVs, so pandas' default separator applies.
df = pd.read_csv("../data/all_v2_data.csv")
df_pred = pd.read_csv("../data/september_to_predict.csv")

In [70]:
# Discard every row that has a missing value in any column.
# Rebinding (instead of inplace=True) keeps the cell idempotent and avoids
# silently mutating a frame that other cells may still reference.
df = df.dropna()

In [71]:
# Display the training frame after rows with missing values were dropped.
df

Unnamed: 0,Mnd,Ap,Hoyre,Frp,SV,SP,KrF,Venstre,MDG,Rodt,...,Rodt_reg_lag3,Rodt_reg_lag6,Andre_reg_lag3,Andre_reg_lag6,ledighet,ledig_trend3,ledig_trend6,styringsrente,styringsrente_trend3,styringsrente_trend6
5,6/30/2008,26.0,17.6,28.9,6.9,5.3,6.1,5.9,0.0,1.8,...,0.0,0.0,0.0,0.0,2.9,0.067,0.050000,5.54,0.076667,0.048333
6,7/31/2008,26.9,17.2,30.2,6.6,4.7,6.1,5.4,0.0,1.9,...,0.0,0.0,0.0,0.0,2.9,0.033,0.050000,5.75,0.083333,0.083333
7,8/31/2008,26.6,16.2,30.9,6.7,5.2,5.5,6.0,0.0,1.7,...,0.0,0.0,0.0,0.0,3.0,0.033,0.066667,5.75,0.070000,0.083333
8,9/30/2008,28.9,15.2,29.9,6.3,5.0,5.7,5.9,0.0,1.5,...,0.0,0.0,0.0,0.0,3.1,0.067,0.066667,5.75,0.000000,0.073333
9,10/31/2008,30.6,16.1,26.7,7.3,5.3,5.6,5.5,0.0,1.2,...,0.0,0.0,0.0,0.0,3.1,0.033,0.050000,5.45,-0.100000,-0.008333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,12/31/2024,17.2,21.2,25.4,9.4,5.3,3.9,5.3,3.1,5.8,...,0.0,0.0,0.0,0.0,4.1,0.033,0.016667,4.50,0.000000,0.000000
204,1/31/2025,19.1,22.2,24.1,8.0,5.9,3.4,4.5,3.3,5.6,...,0.0,0.0,0.0,0.0,4.1,0.000,0.016667,4.50,0.000000,0.000000
205,2/28/2025,24.8,18.4,24.6,7.4,6.6,3.0,4.2,2.4,5.4,...,0.0,0.0,0.0,0.0,4.1,0.000,0.016667,4.50,0.000000,0.000000
206,3/31/2025,28.1,18.8,22.7,7.2,4.9,3.1,4.3,2.7,4.6,...,0.0,0.0,0.0,0.0,4.0,-0.033,0.000000,4.50,0.000000,0.000000


In [72]:
# Display the frame holding the feature row(s) to predict for.
df_pred

Unnamed: 0,Mnd,Ap,Hoyre,Frp,SV,SP,KrF,Venstre,MDG,Rodt,...,Rodt_reg_lag3,Rodt_reg_lag6,Andre_reg_lag3,Andre_reg_lag6,ledighet,ledig_trend3,ledig_trend6,styringsrente,styringsrente_trend3,styringsrente_trend6
0,9/30/2025,,,,,,,,,,...,0,0,0,0,4,0,0,4,0,-0.033333


In [73]:
# Target columns, modelled one at a time; this order is the row order of the results.
parties = [
    'Ap', 'Hoyre', 'Frp', 'SV', 'SP',
    'KrF', 'Venstre', 'MDG', 'Rodt', 'Andre',
]
# Collects one result dict per party from the training loop.
predictions = list()

In [74]:
def _build_pipeline():
    """Return a fresh, unfitted feature-selection + stacking pipeline."""
    base_learners = [
        ('rf', RandomForestRegressor(n_estimators=1000, random_state=42)),
        # NOTE: despite the 'xgb' label this is scikit-learn's gradient
        # boosting implementation, not the XGBoost library.
        ('xgb', GradientBoostingRegressor(n_estimators=1000, random_state=42)),
        ('ridge', RidgeCV()),
    ]
    stacking_model = StackingRegressor(
        estimators=base_learners,
        final_estimator=RidgeCV(),
    )
    # Keep only the features whose random-forest importance reaches the median.
    selector = SelectFromModel(
        RandomForestRegressor(n_estimators=100, random_state=42),
        threshold="median",
    )
    return Pipeline([
        ("feature_selection", selector),
        ("regressor", stacking_model),
    ])


def train_and_evaluate(X, y, X_pred, n_splits=5):
    """Cross-validate the stacking pipeline on a time series, then forecast.

    The model is scored with ``TimeSeriesSplit`` (each fold trains on earlier
    rows and validates on the rows that follow), the per-fold and average R²
    are printed, and finally a fresh pipeline is fitted on all of ``X``/``y``
    to predict ``X_pred``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows in chronological order (indexed positionally via ``.iloc``).
    y : pandas.DataFrame or pandas.Series
        Target values aligned with ``X``; flattened to 1-D before fitting.
    X_pred : pandas.DataFrame
        Feature rows to predict for, with the same columns as ``X``.
    n_splits : int, default 5
        Number of time-series folds.

    Returns
    -------
    tuple
        ``(avg_r2, prediction)`` where ``avg_r2`` is the mean R² over the folds
        and ``prediction`` is the prediction for the first row of ``X_pred``.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    r2_scores = []

    # Evaluate on each time-ordered fold with a newly built pipeline.
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        X_tr, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[test_idx]

        fold_pipeline = _build_pipeline()
        fold_pipeline.fit(X_tr, y_tr.to_numpy().ravel())
        y_val_pred = fold_pipeline.predict(X_val)

        r2 = r2_score(y_val, y_val_pred)
        print(f"  Fold {fold} R²: {r2:.3f}")
        r2_scores.append(r2)

    avg_r2 = np.mean(r2_scores)
    print(f"  → Average R² over {n_splits} folds: {avg_r2:.3f}")

    # Final model: an explicitly fresh pipeline trained on all available data,
    # rather than re-using whatever object the last CV fold left behind.
    final_pipeline = _build_pipeline()
    final_pipeline.fit(X, y.to_numpy().ravel())
    p = final_pipeline.predict(X_pred)
    # Only the first prediction is returned, matching the scalar the caller expects.
    return avg_r2, p[0]

In [75]:
# Start from an empty list every time this cell runs. Without the reset,
# re-running the cell appends a second set of rows and the later
# normalisation to 100 % would be computed over duplicated parties.
predictions = []

# Macro-economic features shared by every party model.
macro_feature_cols = [
    'ledighet', 'ledig_trend3', 'ledig_trend6',
    'styringsrente', 'styringsrente_trend3', 'styringsrente_trend6',
]

for party in parties:
    # Target: the party's own column, kept as a one-column DataFrame.
    y = df[[party]]
    # Party-specific features first, then the shared macro features.
    feature_cols = [
        f'{party}_var', f'{party}_skandale',
        f'{party}_skandale_lag3', f'{party}_skandale_lag6',
        f'{party}_reg', f'{party}_reg_lag3', f'{party}_reg_lag6',
    ] + macro_feature_cols
    X = df[feature_cols]
    X_pred = df_pred[feature_cols]

    r2, p = train_and_evaluate(X, y, X_pred)

    print(f"{party}: {p}")
    predictions.append({'party': party, 'prediction': p, "r2_score": r2})

  Fold 1 R²: 0.686
  Fold 2 R²: 0.247
  Fold 3 R²: 0.844
  Fold 4 R²: 0.343
  Fold 5 R²: 0.718
  → Average R² over 5 folds: 0.568
Ap: 30.192375291869446
  Fold 1 R²: 0.604
  Fold 2 R²: -0.148
  Fold 3 R²: 0.804
  Fold 4 R²: 0.811
  Fold 5 R²: 0.910
  → Average R² over 5 folds: 0.596
Hoyre: 16.232678384915324
  Fold 1 R²: -3.204
  Fold 2 R²: 0.772
  Fold 3 R²: 0.696
  Fold 4 R²: 0.563
  Fold 5 R²: 0.914
  → Average R² over 5 folds: -0.052
Frp: 19.046962835618412
  Fold 1 R²: -19.265
  Fold 2 R²: -0.757
  Fold 3 R²: 0.843
  Fold 4 R²: 0.648
  Fold 5 R²: 0.671
  → Average R² over 5 folds: -3.572
SV: 6.806704804791157
  Fold 1 R²: -2.984
  Fold 2 R²: -1.165
  Fold 3 R²: -2.411
  Fold 4 R²: 0.830
  Fold 5 R²: -0.446
  → Average R² over 5 folds: -1.235
SP: 8.483507607963016
  Fold 1 R²: -3.019
  Fold 2 R²: -0.156
  Fold 3 R²: 0.574
  Fold 4 R²: -1.698
  Fold 5 R²: 0.262
  → Average R² over 5 folds: -0.807
KrF: 3.4267743201241188
  Fold 1 R²: -0.570
  Fold 2 R²: 0.294
  Fold 3 R²: -0.173
  Fo

In [76]:
# Collect the per-party results, then rescale the raw predictions so they
# express a share of 100 %, rounded to two decimals.
pred_df = pd.DataFrame(predictions)
total = pred_df['prediction'].sum()
share_pct = (pred_df['prediction'] / total) * 100
pred_df['prediction'] = share_pct.round(2)

In [77]:
# Display the normalised predictions together with each model's average CV R².
pred_df

Unnamed: 0,party,prediction,r2_score
0,Ap,29.94,0.567759
1,Hoyre,16.1,0.596251
2,Frp,18.89,-0.051947
3,SV,6.75,-3.572006
4,SP,8.41,-1.235136
5,KrF,3.4,-0.80737
6,Venstre,4.02,0.180363
7,MDG,3.58,0.299995
8,Rodt,4.38,-0.987183
9,Andre,4.55,-0.017115
