In [28]:
import pandas as pd
import numpy as np
#import shap
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, BaseCrossValidator
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, accuracy_score, median_absolute_error, make_scorer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss
from hyperopt.pyll.base import Apply


import xgboost
import lightgbm
from scipy.stats import loguniform, randint, uniform
from scipy.spatial.distance import mahalanobis
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence
from hpsklearn import HyperoptEstimator, any_regressor, any_preprocessing
from hyperopt import tpe

def incorrect_penalizer(y_true, y_pred, multiplier=1):
    """
    Scorer function where wrong predictions are penalized more than correct predictions.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Calculate whether the signs of the true and predicted values are different
    sign_diff = np.sign(y_true) != np.sign(y_pred)
    
    # Compute absolute errors
    abs_errors = np.abs(y_true - y_pred)
    
    # Multiply the error where the signs are different
    penalized_errors = abs_errors * (1 + (multiplier - 1) * sign_diff.astype(int))
    
    return np.mean(penalized_errors)

#Creating a custom time series cross-validator
class CustomTimeSeriesCV(BaseCrossValidator):
    """Creates an iterator that contains the indices from each dataset based on the years given"""
    def __init__(self, years):
        self.years = years

    def split(self, X, y=None, groups=None):
        for train_years, test_years in self.years:
            train_indices = np.where(X['year'].isin(train_years))[0]
            test_indices = np.where(X['year'].isin(test_years))[0]
            yield train_indices, test_indices
        
    def get_n_splits(self, X=None, y=None, groups=None):
        return len(self.years)
    

Custom Time Series Cross-Validator -- Every year has a different number of races, so we can't use the regular time series CV

In [267]:
def custom_model(model, param_dict, X, y, iterations = 50):
   "Creating a custom CV to look through errors in a detailed manner"
   X_other, X_test, y_other, y_test = (X.loc[X['year'] < 2022, :], X.loc[X['year'] == 2022, :], 
                                    y.loc[X['year'] < 2022], y.loc[X['year'] == 2022])

   models = []
   test_scores = []
   for random_state in range(1, iterations):
      
      one_hot_fts = ['office_type', 'final_rating', 'open_seat']
      std_fts = ['midterm', 'incumbent_margin', 'covi_num','special', 'prev_gb_margin', 'prev2_gb_margin',
         'mean_specials_differential', 'pvi', 'previous_cci', 'current_cci',
         'previous_gas', 'current_gas',  'previous_unemployment',
         'current_unemployment', 'absenteeexcusereq', 'pollhours',
         'avgpollhours', 'maxpollhours', 'minpollhours', 'regdeadlines',
         'voteridlaws', 'novoterid', 'noallmailvote', 'noearlyvote',
         'nofelonreg', 'nofelonsregafterincar', 'nonstrictid', 'nonstrictphoto',
         'noonlineregistration', 'nopermanentabsentee', 'nopollplacereg', 'nopr',
         'nosamedayreg', 'nostateholiday', 'pr16', 'pr17', 'pr175', 'pr60',
         'pr90', 'strictid', 'strictphoto', 'house_chamber_margin',
         'senate_chamber_margin', 'change_cci', 'change_unemployment']
         
      preprocessor = ColumnTransformer([
      ('cat', OneHotEncoder(), one_hot_fts), 
      ('num', 'passthrough', std_fts)])
      
      parameters = {key: value.rvs(random_state=random_state) for key, value in param_dict.items()}
      model = model.set_params(**parameters)
      
      pipe = make_pipeline(preprocessor, model)
      models.append(pipe)
      
      folds = [(range(2002, 2006, 2), [2006, 2008]),
         (range(2002, 2010, 2), [2010, 2012]),
         (range(2002, 2014, 2), [2014, 2016]),
         (range(2002, 2018, 2), [2018, 2020])]
      fold_scores = []
      for train, test in folds:
            X_train, X_val = X_other.loc[X['year'].isin(train), :], X_other.loc[X['year'].isin(test), :]
            y_train, y_val = y_other[X_other['year'].isin(train)], y_other[X_other['year'].isin(test)]
            
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_val)
            fold_scores.append(median_absolute_error(y_val, y_pred))
            print(f'Mean absolute error for {train} is {mean_absolute_error(y_val, y_pred)}')
            print(f'Mean squared error for {train} is {mean_squared_error(y_val, y_pred)}')
            print(f'Max error for {train} is {max_error(y_val, y_pred)}')
            print(f'Median absolute error for {train} is {median_absolute_error(y_val, y_pred)}')
            print('---------------------------------------------------------')
      test_scores.append(np.mean(fold_scores))
      
   val_score = test_scores[np.argmin(test_scores)]
   best_model = models[np.argmin(test_scores)]
   best_model.fit(X_other, y_other)

   print(f"training score is {mean_absolute_error(y_other, best_model.predict(X_other))}")
   print(f"validation score is {val_score}")
   print(f"test_score is {mean_absolute_error(y_test, best_model.predict(X_test))}")
   return best_model, val_score   


In [23]:
data = pd.read_csv("../cleaned_data/Finalized Dataset.csv")
filtered_data = data.drop(columns = ['district']).assign(pvi = lambda x: x['pvi'] * 2, 
                                                         midterm = lambda x: x['year'] % 4 != 0) 
                                                        
X = filtered_data.drop(columns=['margin'])
y = filtered_data['margin']

param_dist_xgb = {
    'n_estimators': randint(10, 251),  # Discrete uniform distribution
    'max_depth': randint(3, 16),  # Discrete uniform distribution
    'learning_rate': uniform(0.001, 0.199),  # Continuous uniform distribution
    'subsample': uniform(0.3, 0.7),  # Continuous uniform distribution
    'colsample_bytree': uniform(0.3, 0.7),  # Continuous uniform distribution
    'min_child_weight': randint(5, 16),  # Discrete uniform distribution
    'gamma': uniform(0.01, 99.99),  # Continuous uniform distribution
    'reg_alpha': uniform(0.01, 99.99),  # Continuous uniform distribution
    'reg_lambda': uniform(0.01, 99.99)  # Continuous uniform distribution
}

#xgb = xgboost.XGBRegressor(n_jobs = -1)
#custom_model(xgb, param_dist_xgb, X, y, iterations = 300)

In [256]:
def run_model(model, param_dict, X, y, iterations = 75):
    """Runs through a given model to get the best estimator of that model, as well as the train/test score values."""
    X_train, X_test, y_train, y_test = (X.loc[X['year'] < 2022, :], X.loc[X['year'] == 2022, :], 
                                        y.loc[X['year'] < 2022], y.loc[X['year'] == 2022])
    
    folds = [(range(2002, 2006, 2), [2006, 2008]),
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])]

    cv = CustomTimeSeriesCV(folds)
        
    one_hot_fts = ['office_type', 'final_rating', 'open_seat']
    std_fts = ['midterm', 'incumbent_margin', 'covi_num','special', 'prev_gb_margin', 'prev2_gb_margin',
       'mean_specials_differential', 'pvi', 'previous_cci', 'current_cci',
       'previous_gas', 'current_gas',  'previous_unemployment',
       'current_unemployment', 'absenteeexcusereq', 'pollhours',
       'avgpollhours', 'maxpollhours', 'minpollhours', 'regdeadlines',
       'voteridlaws', 'novoterid', 'noallmailvote', 'noearlyvote',
       'nofelonreg', 'nofelonsregafterincar', 'nonstrictid', 'nonstrictphoto',
       'noonlineregistration', 'nopermanentabsentee', 'nopollplacereg', 'nopr',
       'nosamedayreg', 'nostateholiday', 'pr16', 'pr17', 'pr175', 'pr60',
       'pr90', 'strictid', 'strictphoto', 'house_chamber_margin',
       'senate_chamber_margin', 'change_cci', 'change_unemployment']
        
    preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(), one_hot_fts), 
    ('num', 'passthrough', std_fts)])
    
    model_name = model.__class__.__name__
        
    param_dict = {f"{model_name.lower()}__{key}": value for key, value in param_dict.items()}
    
    pipe = make_pipeline(preprocessor, model)
    
    bayes = RandomizedSearchCV(pipe, param_dict, n_iter=iterations, scoring='neg_median_absolute_error', cv = cv, verbose = 1)
    #bayes = BayesSearchCV(pipe, param_dict, n_iter=iterations, scoring=custom_scorer, cv = cv, verbose = 1)
    
    np.int = int # bayes uses np.int, which is deprecated -- this removes the error!
    bayes.fit(X_train, y_train)
    
    #Code only used if I want to debug and see how good the model is doing
    train_score_mae = mean_absolute_error(y_train, bayes.predict(X_train))
    train_score_mse = mean_squared_error(y_train, bayes.predict(X_train))
    test_score_mae = mean_absolute_error(y_test, bayes.predict(X_test))
    test_score_med_ae = median_absolute_error(y_test, bayes.predict(X_test))
    pct_right = np.mean(np.sign(y_test) == np.sign(bayes.predict(X_test)))
    max_test_score = max_error(y_test, bayes.predict(X_test))

    print(f"Train MAE is {train_score_mae}, Train MSE is {train_score_mse}")
    print(f"Test MAE is {test_score_mae}, Test MedAE is {test_score_med_ae}")
    print(f"Maximum error is {max_test_score}")
    print(f"The pct of correct predictions is {pct_right}")
    print(f"Score of estimator on non-2022 data is {-1*bayes.best_score_}")
    
    return (bayes, train_score_mae, test_score_mae)

In [257]:
data = pd.read_csv("../cleaned_data/Finalized Dataset.csv")
filtered_data = data.drop(columns = ['district']).assign(pvi = lambda x: x['pvi'] * 2, 
                                                         midterm = lambda x: x['year'] % 4 != 0)
X = filtered_data.drop(columns=['margin'])
y = filtered_data['margin']

"""param_dict_xgb = {
    'n_estimators': Integer(10, 250),
    'max_depth': Integer(3, 15),  # Reduced from 15
    'learning_rate': Real(0.001, 0.2, prior = 'log-uniform'),  # Reduced upper limit
    'subsample': Real(0.3, 1, prior = 'uniform'),  # Decreased
    'colsample_bytree': Real(0.3, 1, prior = 'uniform'),  # Decreased
    'min_child_weight': Integer(5, 15),  # Increased lower limit
    'gamma': Real(0.01, 100, prior = 'log-uniform'),  # Regularization
    'reg_alpha': Real(0.01, 100, prior = 'log-uniform'),  # Regularization
    'reg_lambda': Real(0.01, 100, prior = 'log-uniform')  # Regularization
}"""


param_dist_xgb = {
    'n_estimators': randint(10, 251),  # Discrete uniform distribution
    'max_depth': randint(3, 16),  # Discrete uniform distribution
    'learning_rate': uniform(0.001, 0.199),  # Continuous uniform distribution
    'subsample': uniform(0.3, 0.7),  # Continuous uniform distribution
    'colsample_bytree': uniform(0.3, 0.7),  # Continuous uniform distribution
    'min_child_weight': randint(5, 16),  # Discrete uniform distribution
    'gamma': uniform(0.01, 99.99),  # Continuous uniform distribution
    'reg_alpha': uniform(0.01, 99.99),  # Continuous uniform distribution
    'reg_lambda': uniform(0.01, 99.99)  # Continuous uniform distribution
}

xgb = xgboost.XGBRegressor(n_jobs = -1)
(bayes_model, train_score, test_score) = run_model(xgb, param_dist_xgb, X, y, iterations = 50)


param_dict_gbr = {
    'loss': Categorical(['squared_error', 'absolute_error']), 
    'learning_rate': Real(0.01, 1, prior='log-uniform'), 
    'max_iter': Integer(10, 200), 
    'max_leaf_nodes': Integer(10, 100), 
    'max_depth': Integer(5, 100), 
    'min_samples_leaf': Integer(5, 100), 
    'l2_regularization': Real(0.001, 1000, prior='log-uniform'), 
    'interaction_cst': Categorical(['pairwise', 'no_interactions'])
}

gbr = HistGradientBoostingRegressor()


param_dict_gbm = {
    'boosting_type': Categorical(['dart']),  # Stick to traditional to reduce complexity.
    'max_depth': Integer(2, 6),  # Lower max depth to control overfitting.
    'num_leaves': Integer(2, 8),  # Lower number of leaves to control overfitting.
    'learning_rate': Real(0.01, 0.1, prior='log-uniform'),  # Lower learning rates can lead to better generalization.
    'min_data_in_leaf': Integer(20, 40),  # Increase to provide a more conservative approach.
    'min_sum_hessian_in_leaf': Real(0.001, 0.1),  # Increasing this value can help with overfitting.
    'n_estimators': Integer(100, 300),  # Reducing the upper limit to prevent overfitting.
    'subsample_for_bin': Integer(20000, 200000),  # Adjust based on your data size and feature.
    'class_weight': Categorical([None]),  # Unless you have imbalanced classes, stick to None.
    'min_split_gain': Real(0.1, 1.0),  # Increase the minimum gain to reduce complex tree structures.
    'min_child_weight': Real(0.01, 1),  # Increase to add more constraints on the tree.
    'min_child_samples': Integer(20, 50),  # Increase to ensure more samples inform each split.
    'subsample': Real(0.5, 0.8),  # Decrease to add more randomness and reduce overfitting.
    'subsample_freq': Integer(1, 10),  # Ensure subsampling happens more regularly.
    'colsample_bytree': Real(0.5, 0.8),  # Decrease to add more randomness and reduce overfitting.
    'reg_alpha': Real(0.1, 10, prior='log-uniform'),  # Increase L1 regularization.
    'reg_lambda': Real(0.1, 10, prior='log-uniform')  # Increase L2 regularization.
}


gbm = lightgbm.LGBMRegressor(n_jobs = -1)

#run_model(gbr, param_dict_gbr, X, y, iterations=75)
#run_model(gbm, param_dict_gbm, X, y, iterations=40)


Fitting 4 folds for each of 50 candidates, totalling 200 fits
Train MAE is 4.947835442945541, Train MSE is 49.251908424160355
Test MAE is 6.820283865911831, Test MedAE is 5.522969699239104
Maximum error is 25.350468935011072
The pct of correct predictions is 0.94
Score of estimator on non-2022 data is 9.97431184110659


Testing HyperOpt

In [33]:
def HyperOptimize(X, y, multiplier = 1):

    X_train, X_test, y_train, y_test = (X.loc[X['year'] < 2022, :], X.loc[X['year'] == 2022, :], 
                                        y.loc[X['year'] < 2022], y.loc[X['year'] == 2022])

    # Create fold structure so we can make a custom cross-validation for time-series
    folds = [
        (range(2002, 2006, 2), [2006, 2008]),
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])
    ]

    cv = CustomTimeSeriesCV(folds)
        
    one_hot_fts = ['office_type', 'final_rating', 'open_seat']
    std_fts = ['incumbent_margin', 'covi_num','special', 'prev_gb_margin', 'prev2_gb_margin',
        'mean_specials_differential', 'pvi', 'previous_cci', 'current_cci',
        'previous_gas', 'current_gas',  'previous_unemployment',
        'current_unemployment', 'absenteeexcusereq', 'pollhours',
        'avgpollhours', 'maxpollhours', 'minpollhours', 'regdeadlines',
        'voteridlaws', 'novoterid', 'noallmailvote', 'noearlyvote',
        'nofelonreg', 'nofelonsregafterincar', 'nonstrictid', 'nonstrictphoto',
        'noonlineregistration', 'nopermanentabsentee', 'nopollplacereg', 'nopr',
        'nosamedayreg', 'nostateholiday', 'pr16', 'pr17', 'pr175', 'pr60',
        'pr90', 'strictid', 'strictphoto', 'house_chamber_margin',
        'senate_chamber_margin', 'change_cci', 'change_unemployment']
        
    preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(), one_hot_fts), 
    ('num', 'passthrough', std_fts)])

    n_states = 2
    accuracies = np.zeros(n_states)
    errors = np.zeros(n_states)
    
    #Goes through each random state, and finds the best parameters and error for each one
    for random_state in range(0, n_states):
        def objective(params):
            clf = xgboost.XGBRegressor(**params, random_state = random_state, n_jobs=-1)
            pipe = make_pipeline(preprocessor, clf)

            cv_scores = []
            for train_idx, test_idx in cv.split(X_train):
                pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
                predictions = pipe.predict(X_train.iloc[test_idx])
                max_year = X_train.iloc[test_idx]['year'].max()
                score = (max_year - 2002) / 16 * incorrect_penalizer(y_train.iloc[test_idx], predictions, multiplier=multiplier)
                cv_scores.append(score)

            print(cv_scores)
            avg_mae = np.mean(cv_scores)
            return {'loss': avg_mae, 'status': STATUS_OK}


        space = {
            'n_estimators': hp.randint('n_estimators', 10, 251),
            'max_depth': hp.randint('max_depth', 2, 10),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.2)),
            'subsample': hp.uniform('subsample', 0.3, 1),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
            'min_child_weight': hp.randint('min_child_weight', 5, 10),
            'gamma': hp.loguniform('gamma', np.log(5), np.log(20)),
            'reg_alpha': hp.loguniform('reg_alpha', np.log(0.1), np.log(10)),
            'reg_lambda': hp.loguniform('reg_lambda', np.log(0.1), np.log(10))
        }


        trials = Trials()
        best_params = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        trials=trials, 
                        early_stop_fn=no_progress_loss(iteration_stop_count=5), 
                        rstate=np.random.Generator(np.random.PCG64(random_state)))

        print("Best parameters:", best_params)
        acc_xgb = xgboost.XGBRegressor(**best_params, n_jobs=-1)
        pipe = make_pipeline(preprocessor, acc_xgb)
        pipe.fit(X_train, y_train)
        predictions = pipe.predict(X_test)
        errors[random_state] = median_absolute_error(y_test, predictions)
        accuracies[random_state] = np.mean(np.sign(y_test) == np.sign(predictions))

    print("Mean accuracy:", np.mean(accuracies))
    print("Mean error:", np.mean(errors))
    return np.mean(accuracies), np.mean(errors)

HyperOptimize(X, y, multiplier = 1)

[4.133133176073309, 6.509490945257898, 8.27693674326877, 7.450457470262208]
[9.018630506572016, 14.582932055926348, 20.772794212389055, 24.435695357558117]                                  
[7.1021192763455385, 11.93936630018348, 16.372471657337062, 18.599882900339047]                                  
[5.154632127337833, 9.261376521503752, 10.831056858367742, 13.218816728108353]                                   
[9.464046167150084, 15.06618171029456, 21.63934676656595, 25.69906711780782]                                     
  0%|          | 5/9223372036854775807 [00:07<4000616549456252:01:04,  1.56s/trial, best loss: 6.592504583715546]
Best parameters: {'colsample_bytree': 0.4547867536333304, 'gamma': 12.010375674360388, 'learning_rate': 0.02202605484969771, 'max_depth': 5, 'min_child_weight': 9, 'n_estimators': 142, 'reg_alpha': 0.24043206714908386, 'reg_lambda': 0.2391480567734706, 'subsample': 0.7775845345223572}
[4.569284647179207, 6.261917112804594, 8.43490559978272, 7.653201245

(0.9459770114942528, 5.824779026270237)

In [32]:
multipliers = np.linspace(1, 5, 10)
accuracy_scores = []
median_errors = []

for multiplier in multipliers:
    HyperOptimize(X, y, multiplier)
    
    

[4.171101380878282, 6.5865965443073, 8.275412054249987, 7.338798270486502]
[9.010894916669361, 14.530545147834024, 20.742816081308618, 24.381620082214035]                                  
[7.101596134192656, 11.941281858551243, 16.37303221004745, 18.594731796331853]                                   
[4.721484304061629, 7.974138844969804, 10.426546883203311, 10.314629735929726]                                   
  0%|          | 4/9223372036854775807 [00:07<5032655455056182:02:40,  1.96s/trial, best loss: 6.592977062480518]


KeyboardInterrupt: 