In [43]:
import pandas as pd
import numpy as np
import pickle as pkl
import lightgbm as lgb
import re
import time

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import  accuracy_score, mean_squared_error
from sklearn.base import clone

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss

class CustomTimeSeriesCV(BaseCrossValidator):
    """Cross-validator whose folds are spelled out as explicit groups of years.

    `years` is a sequence of `(train_years, test_years)` pairs. For each pair,
    `split` yields the positional indices of the rows of `X` whose 'year' value
    belongs to the training years, followed by the positional indices of the
    rows whose 'year' value belongs to the testing years.
    """
    def __init__(self, years):
        self.years = years

    @staticmethod
    def _positions_in_years(X, wanted_years):
        "Positional (0-based) indices of the rows of X whose 'year' is in wanted_years."
        membership = X['year'].isin(wanted_years)
        return np.where(membership)[0]

    def split(self, X, y=None, groups=None):
        "Yields one (train_indices, test_indices) pair per configured fold; y and groups are unused."
        for fold_train_years, fold_test_years in self.years:
            fold_train = self._positions_in_years(X, fold_train_years)
            fold_test = self._positions_in_years(X, fold_test_years)
            yield fold_train, fold_test

    def get_n_splits(self, X=None, y=None, groups=None):
        "Number of folds, i.e. the number of (train_years, test_years) pairs."
        n_folds = len(self.years)
        return n_folds
    
def bootstrap(group, n=None):
    """Draws a sample with replacement from `group`.

    When `n` is not given, the sample has as many rows as `group` itself.
    """
    sample_size = len(group) if n is None else n
    return group.sample(sample_size, replace=True)

# Categorical features that are one-hot encoded by the preprocessor
one_hot_fts = ['office_type']

# Rating is the only ordinal feature
ordinal_fts = ['final_rating']
# Category order handed to OrdinalEncoder: listed from 'Safe R' to 'Safe D'
ordinal_fts_ranking = ['Safe R', 'Likely R', 'Leans R', 'Toss-up', 'Leans D', 'Likely D', 'Safe D']

# Continuous features that are passed through the preprocessor unchanged.
# The order of this list determines the order of the 'num' columns after preprocessing.
cont_fts = [
    "open_seat", "incumbent_differential", "special", "absenteeexcusereq", "pollhours", "avgpollhours", "minpollhours",
    "regdeadlines", "voteridlaws", "novoterid", "noallmailvote", "noearlyvote", "nofelonreg",
    "nofelonsregafterincar", "nonstrictid", "nonstrictphoto", "nopollplacereg", "nopr", "nosamedayreg",
    "nostateholiday", "pr16", "pr17", "pr175", "pr60", "pr90", "strictid", "strictphoto", "covi_num",
    "prev_dem_gen_tp", "prev_gen_margin", "weighted_genpoll", "weighted_genpoll_lower",
    "weighted_genpoll_upper", "unweighted_genpoll", "mean_specials_differential", 
    "house_chamber_margin", "senate_chamber_margin", "previous_cci", "current_cci", "change_cci",
    "previous_gas", "current_gas", "change_gas", "previous_unemployment", "current_unemployment",
    "change_unemployment",  "receipts", "from_committee_transfers", "disbursements",
    "to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
    "individual_contributions", "unconvinced_pct", "phone_unweighted", "online_unweighted", "num_polls",
    "unweighted_estimate", "unweighted_ci_lower", "unweighted_ci_upper", "weighted_estimate",
    "weighted_ci_lower", "weighted_ci_upper", "white_pct", "black_pct", "asian_pct", "hispanic_pct",
    "median_income", "impoverished_pct", "median_age", "renting_pct", "inflation", "isMidterm",
    "genballot_predicted_margin", "genballot_predicted_lower", "genballot_predicted_upper",
    "poll_fundamental_agree",  'receipts_DEM', 'receipts_REP', 'disbursements_DEM', 'disbursements_REP'
]

In [44]:
# Silence numpy's divide-by-zero and invalid-value floating point warnings for the whole session
np.seterr(divide='ignore', invalid='ignore')

def optima_model(mean_model, std_model, mean_param_dict, std_param_dict, X, y, mean_kwargs, std_kwargs):
    """Tunes and fits a point-estimate model plus a model of its absolute error.

    Both models are tuned with Hyperopt (TPE) on the rows with year < 2022, using
    year-based expanding folds, and are then refit on every row with year <= 2022.

    ## Parameters:
    mean_model: sklearnable model class. We use LGBMRegressor. This model trains for the point estimates.
        It is instantiated as mean_model(**params, **mean_kwargs).
    std_model: sklearnable model class. We use LGBMRegressor. This model trains on the absolute residuals
        of the point-estimate model. It is instantiated as std_model(**params, **std_kwargs).
    mean_param_dict: Hyperopt search space (dictionary of hyperparameters) for the mean model
    std_param_dict: Hyperopt search space (dictionary of hyperparameters) for the std model
    X: DataFrame with features. Must contain a 'year' column and every column named in
        one_hot_fts, ordinal_fts and cont_fts.
    y: Series with target variable, indexed like X
    mean_kwargs: dictionary of fixed keyword arguments for the mean model
    std_kwargs: dictionary of fixed keyword arguments for the std model

    ## Returns:
    (mean_best_pipe, std_best_pipe): two fitted Pipelines with steps 'preprocessing' and 'model',
        each owning its own preprocessor, both fitted on the rows with year <= 2022.
    """

    # Rows used for the final fit (<= 2022) and rows used for tuning (< 2022)
    final_rows = X['year'] <= 2022
    tuning_rows = X['year'] < 2022
    X_other, y_other = X.loc[final_rows, :], y.loc[final_rows]
    X_train, y_train = X.loc[tuning_rows, :], y.loc[tuning_rows]

    # Create fold structure so we can make a custom cross-validation for time-series
    folds = [
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])
    ]

    cv = CustomTimeSeriesCV(folds)

    #Preprocessing data: no need to scale data, because we use tree-based models which are monotonic-scale-invariant
    #This instance is only a template: every pipeline below receives its own clone, so fitting one
    #pipeline never refits the preprocessor that another pipeline relies on.
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value',
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])

    def build_pipeline(estimator):
        "Wraps an estimator together with an independent, unfitted copy of the preprocessor."
        return Pipeline(steps = [
            ('preprocessing', clone(preprocessor)),
            ('model', estimator)])

    #--- First, we optimize the mean model ---
    def mean_objective(params):
        "Function that takes in hyperparameters and returns loss, that Hyperopt will minimize."
        testing_loss = []
        accuracies = []
        # Goes through each fold and calculates loss.
        for train_idx, test_idx in cv.split(X_train):
            pipe = build_pipeline(mean_model(**params, **mean_kwargs))
            pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])

            predictions = pipe.predict(X_train.iloc[test_idx])
            # RMSE computed as sqrt(MSE): the `squared=False` argument was deprecated and then
            # removed from scikit-learn, whereas this form works on every version.
            testing_loss.append(np.sqrt(mean_squared_error(y_train.iloc[test_idx], predictions)))
            # Share of rows where the sign of the prediction matches the sign of the target
            accuracies.append(accuracy_score(np.sign(y_train.iloc[test_idx]), np.sign(predictions)))
        print(accuracies)

        return {'loss': np.mean(testing_loss), 'status': STATUS_OK}

    # Hyperopt uses the TPE algorithm to optimize hyperparameters. We use the no_progress_loss function
    # to stop early if we don't see progress.
    mean_best_params = fmin(fn=mean_objective,
                    space=mean_param_dict,
                    algo=tpe.suggest,
                    trials=Trials(),
                    early_stop_fn = no_progress_loss(1))

    mean_best_pipe = build_pipeline(mean_model(**mean_best_params, **mean_kwargs))

    #--- Now, we optimize the standard deviation model ---
    def neg_log_likelihood(y, y_pred, y_std):
        "Mean Gaussian negative log-likelihood of y given means y_pred and standard deviations y_std."
        return np.mean(0.5 * np.log(2 * np.pi * y_std ** 2) + ((y - y_pred) ** 2 / (2 * y_std ** 2)))

    # Smallest spread we accept from the std model. A zero prediction would make the likelihood
    # NaN/inf (which Hyperopt cannot rank) and a negative one would be silently squared away.
    min_std = 1e-6

    #Begin by predicting the training data via the mean model
    # NOTE(review): these are in-sample predictions (the mean model is fitted on all of X_train), so the
    # residuals scored in the folds below come from a model that already saw those rows - confirm this is intended.
    mean_best_pipe.fit(X_train, y_train)
    means_predicted = mean_best_pipe.predict(X_train)
    std_y_train = np.abs(y_train - means_predicted)

    def std_objective(params):
        "Function that takes in hyperparameters and returns loss, that Hyperopt will minimize."
        testing_loss = []
        z_scores = []
        # Goes through each fold and calculates loss.
        for train_idx, test_idx in cv.split(X_train):
            pipe = build_pipeline(std_model(**params, **std_kwargs))
            pipe.fit(X_train.iloc[train_idx], std_y_train.iloc[train_idx])

            std_predictions = np.maximum(pipe.predict(X_train.iloc[test_idx]), min_std)
            testing_loss.append(neg_log_likelihood(y_train.iloc[test_idx], means_predicted[test_idx], std_predictions))
            # Mean absolute residual expressed in units of the predicted spread
            z_scores.append(np.mean(abs(y_train.iloc[test_idx] - means_predicted[test_idx]) / std_predictions))
        print(z_scores)

        return {'loss': np.mean(testing_loss), 'status': STATUS_OK}

    # Same search procedure as for the mean model, this time minimizing the negative log-likelihood.
    std_best_params = fmin(fn=std_objective,
                    space=std_param_dict,
                    algo=tpe.suggest,
                    trials=Trials(),
                    early_stop_fn = no_progress_loss(1))

    #once we get the best params for each, we train each sequentially and then return the fitted versions.
    mean_best_pipe.fit(X_other, y_other)
    mean_predictions = mean_best_pipe.predict(X_other)

    # The std model is trained on the absolute residuals of the refitted mean model
    std_y_other = np.abs(y_other - mean_predictions)
    std_best_pipe = build_pipeline(std_model(**std_best_params, **std_kwargs))
    std_best_pipe.fit(X_other, std_y_other)

    #Returns 2 fitted models
    return mean_best_pipe, std_best_pipe

In [45]:
data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")
# Strip every character that is not a letter, digit or underscore from the column names
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

X = data.drop(columns = ['margin'])
y = data['margin']


# This preprocessor is fitted only to learn the names and order of the transformed columns,
# which is the order the monotone constraint list has to follow.
preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value', 
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])

names_for_monotonicity = preprocessor.fit(X).get_feature_names_out()
before_processing_monotonic_columns = ['incumbent_differential', "receipts", "from_committee_transfers", "disbursements",'genballot_predicted_margin', 
                                       'specials_predicted_margin', 'unweighted_estimate', 'weighted_estimate', 'receipts_genballot_interaction',
                                       'disbursements_genballot_interaction', 'poll_fundamental_average', 'genballot_predicted_lower', 
                                       'genballot_predicted_upper']

monotonic_columns = ['num__' + name for name in before_processing_monotonic_columns] + ['ord__final_rating']

# +1 (increasing) for the monotonic columns, 0 (unconstrained) for everything else
monotone_constraints = [1 if name in monotonic_columns else 0 for name in names_for_monotonicity]

# A requested column that is not among the transformed features silently gets no constraint,
# so report those names instead of letting them go unnoticed.
transformed_feature_names = set(names_for_monotonicity)
unmatched_monotonic_columns = [name for name in monotonic_columns if name not in transformed_feature_names]
if unmatched_monotonic_columns:
    print(f"Warning: no monotone constraint applied, column not among the model features: {unmatched_monotonic_columns}")


def lgbm_search_space():
    """Returns a fresh Hyperopt search space for the LGBMRegressor hyperparameters.

    The mean model and the std model are tuned over the same space; each gets its own copy.
    """
    return {
        'num_leaves': hp.randint('num_leaves', 20, 70),
        'n_estimators': hp.randint('n_estimators', 50, 200),
        'learning_rate': hp.loguniform('learning_rate', -5, -2),  # exp(-5) to exp(-2), i.e. about 0.0067 to 0.135
        'subsample_for_bin': hp.randint('subsample_for_bin', 20000, 200000),
        'min_data_in_bin': hp.randint('min_data_in_bin', 1, 10),
        # NOTE(review): LightGBM documents min_child_samples as an alias of min_data_in_leaf, so the two
        # entries below target the same setting - confirm which range is the intended one.
        'min_data_in_leaf': hp.randint('min_data_in_leaf', 1, 10),
        'min_child_samples': hp.randint('min_child_samples', 20, 150),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.5),  # L1 regularization
        'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.5),  # L2 regularization
        'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),
        # NOTE(review): no subsample_freq is set anywhere in this notebook; LightGBM's docs say row
        # subsampling needs a non-zero bagging frequency - confirm this entry has any effect.
        'subsample': hp.uniform('subsample', 0.5, 0.8),
        'max_depth': hp.randint('max_depth', 2, 10),
    }


# Define the search space for Hyperopt
mean_param_lgbm = lgbm_search_space()

# Fixed (non-tuned) settings of the mean model; the monotone constraints only apply to this model
mean_kwargs = {
    'boosting_type': 'dart', 
    'monotone_constraints': monotone_constraints,
    'monotone_constraints_method': 'advanced', 
    'verbose': -1
}

std_param_lgbm = lgbm_search_space()

# Fixed (non-tuned) settings of the std model
std_kwargs = {
    'boosting_type': 'dart', 
    'verbose': -1
}

#Optimize the model
mean_model, std_model = optima_model(lgb.LGBMRegressor, lgb.LGBMRegressor, mean_param_lgbm, std_param_lgbm, X, y, mean_kwargs, std_kwargs)


[0.9197465681098205, 0.9424460431654677, 0.9126730564430245]           
[0.9503695881731784, 0.9532374100719424, 0.9531416400425985]                                                      
[0.9165786694825766, 0.9436450839328537, 0.9148029818956337]                                                      
  0%|          | 3/9223372036854775807 [00:29<25404194780371717:41:20,  9.92s/trial, best loss: 8.572827609743522]
[1.5550199287260107, 1.6407764381597234, 1.373680168895199]            
[0.9836432329693462, 1.0664403983683504, 0.8865585514169935]                                                      
[1.0912418658899485, 1.223466440065979, 0.9636387921025459]                                                       
  0%|          | 3/9223372036854775807 [00:10<8608863906351750:15:28,  3.36s/trial, best loss: 3.132743630226644]


In [50]:
from scipy.stats import norm

# Row masks used throughout this cell: training years and the 2022 hold-out year
pre_2022 = X['year'] < 2022
is_2022 = X['year'] == 2022

# Refit both pipelines on the years before 2022 only, so that 2022 is scored out of sample.
# Note that this refits mean_model and std_model in place.
mean_model.fit(X.loc[pre_2022], y.loc[pre_2022])
mean_preds_for_training = mean_model.predict(X.loc[pre_2022])
std_model.fit(X.loc[pre_2022], np.abs(y.loc[pre_2022] - mean_preds_for_training))

mean_preds = mean_model.predict(X.loc[is_2022])
std_preds = std_model.predict(X.loc[is_2022])

# A race is "wrong" when the sign of the predicted margin differs from the sign of the actual margin
wrong_indices = np.not_equal(np.sign(y.loc[is_2022]), np.sign(mean_preds))

# .copy() so the columns added below go onto an independent frame rather than a slice of X
wrong_results_2022 = X.loc[is_2022].loc[wrong_indices].copy()
wrong_results_2022['mean_prediction'] = mean_preds[wrong_indices]
wrong_results_2022['std_prediction'] = std_preds[wrong_indices]
# Distance of the predicted margin from zero, in units of the predicted spread
wrong_results_2022['z-value'] = np.abs(wrong_results_2022['mean_prediction']) / std_preds[wrong_indices]
# Upper-tail probability of a standard normal beyond that z-value
wrong_results_2022['probability of being wrong'] = 1 - norm.cdf(wrong_results_2022['z-value'])

print(f"Wrong results_2022, {wrong_results_2022}")
        


Wrong results_2020,       Unnamed0  year state  district office_type open_seat  \
3726      3727  2022    CA        13       House      True   
3767      3768  2022    CO         8       House      True   
3880      3881  2022    MI         7       House      True   
3921      3922  2022    NC        13       House      True   
3941      3942  2022    NM         2       House     False   
3945      3946  2022    NV         3       House     False   
3949      3950  2022    NY         3       House      True   
3950      3951  2022    NY         4       House      True   
3961      3962  2022    NY        17       House      True   
3971      3972  2022    OH         1       House     False   
4002      4003  2022    PA         7       House     False   
4003      4004  2022    PA         8       House      True   
4010      4011  2022    PA        17       House      True   
4064      4065  2022    VA         2       House     False   
4077      4078  2022    WA         3       House  