In [10]:
import pandas as pd
import numpy as np
#import shap
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, BaseCrossValidator
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, accuracy_score, median_absolute_error, make_scorer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss
import xgboost
import lightgbm
from scipy.stats import loguniform, randint, uniform
from scipy.spatial.distance import mahalanobis
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence
from hpsklearn import HyperoptEstimator, any_regressor, any_preprocessing
from hyperopt import tpe


#Creating a custom time series cross-validator
class CustomTimeSeriesCV(BaseCrossValidator):
    """Year-based cross-validation splitter.

    ``years`` is a sequence of ``(train_years, test_years)`` pairs. Every pair
    becomes one fold whose row positions are looked up in ``X['year']``.
    """

    def __init__(self, years):
        self.years = years

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds, i.e. the number of year pairs given."""
        return len(self.years)

    def split(self, X, y=None, groups=None):
        """Yield one ``(train_indices, test_indices)`` pair of positional index arrays per fold."""
        for fit_years, holdout_years in self.years:
            in_fit = X['year'].isin(fit_years)
            in_holdout = X['year'].isin(holdout_years)
            # Positions (not labels) of the rows whose year belongs to each side.
            fit_rows = np.flatnonzero(in_fit)
            holdout_rows = np.flatnonzero(in_holdout)
            yield fit_rows, holdout_rows

In [11]:
def no_sklearn_model(model, param_dict, X, y, iterations = 50):
    """Random hyper-parameter search using a hand-rolled, year-based cross-validation.

    Rows with ``year < 2022`` are used for the search and the final fit; rows
    with ``year == 2022`` are held out as the test set. Each candidate is
    scored by the mean, over four year-based folds, of the median absolute
    error on the fold's validation years. The best candidate is then refit on
    all pre-2022 rows.

    Parameters
    ----------
    model : estimator
        Unfitted template estimator supporting ``get_params``/``set_params``.
        It is cloned for every candidate and is never modified.
    param_dict : dict
        Maps a parameter name either to an object with an
        ``rvs(random_state=...)`` method (e.g. a frozen ``scipy.stats``
        distribution) or to a sequence of values to choose from.
    X : pandas.DataFrame
        Must contain a ``year`` column and every feature column listed below.
    y : pandas.Series
        Target values, aligned with ``X``.
    iterations : int, default 50
        Number of candidates to evaluate (seeds ``1 .. iterations``).

    Returns
    -------
    tuple
        ``(best_model, train_score, val_score, test_score)`` where
        ``best_model`` is the fitted pipeline, ``train_score`` and
        ``test_score`` are mean absolute errors and ``val_score`` is the
        cross-validated median absolute error of the winning candidate.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from sklearn.base import clone

    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    is_other = X['year'] < 2022
    is_test = X['year'] == 2022
    X_other, X_test = X.loc[is_other, :], X.loc[is_test, :]
    y_other, y_test = y.loc[is_other], y.loc[is_test]

    one_hot_fts = ['office_type', 'final_rating', 'open_seat']
    std_fts = ['midterm', 'incumbent_margin', 'covi_num','special', 'prev_gb_margin', 'prev2_gb_margin',
               'mean_specials_differential', 'pvi', 'previous_cci', 'current_cci',
               'previous_gas', 'current_gas',  'previous_unemployment',
               'current_unemployment', 'absenteeexcusereq', 'pollhours',
               'avgpollhours', 'maxpollhours', 'minpollhours', 'regdeadlines',
               'voteridlaws', 'novoterid', 'noallmailvote', 'noearlyvote',
               'nofelonreg', 'nofelonsregafterincar', 'nonstrictid', 'nonstrictphoto',
               'noonlineregistration', 'nopermanentabsentee', 'nopollplacereg', 'nopr',
               'nosamedayreg', 'nostateholiday', 'pr16', 'pr17', 'pr175', 'pr60',
               'pr90', 'strictid', 'strictphoto', 'house_chamber_margin',
               'senate_chamber_margin', 'change_cci', 'change_unemployment']

    # (training years, validation years) for each fold.
    folds = [(range(2002, 2006, 2), [2006, 2008]),
             (range(2006, 2010, 2), [2010, 2012]),
             (range(2010, 2014, 2), [2014, 2016]),
             (range(2014, 2018, 2), [2018, 2020])]

    # The row masks do not depend on the candidate, so build them once.
    # Both masks are derived from X_other so they share its index exactly.
    fold_masks = [(X_other['year'].isin(train_years), X_other['year'].isin(val_years))
                  for train_years, val_years in folds]

    def build_pipeline(parameters):
        """Return a fresh, unfitted preprocessing + model pipeline for one parameter set."""
        preprocessor = ColumnTransformer([
            # handle_unknown='ignore': a category that only shows up in the
            # validation/test years is encoded as all zeros instead of raising.
            ('cat', OneHotEncoder(handle_unknown='ignore'), one_hot_fts),
            ('num', 'passthrough', std_fts)])
        # clone() gives every pipeline its own estimator; set_params on the
        # shared `model` would silently overwrite earlier candidates.
        return make_pipeline(preprocessor, clone(model).set_params(**parameters))

    def sample_parameters(seed):
        """Draw one value per parameter from a single random stream seeded with `seed`."""
        # One RandomState shared by all parameters: handing the same integer
        # seed to every rvs() call would make all draws come from the same
        # underlying random number, so the parameters would move in lockstep.
        rng = np.random.RandomState(seed)
        sampled = {}
        for key, dist in param_dict.items():
            if hasattr(dist, 'rvs'):
                sampled[key] = dist.rvs(random_state=rng)
            else:
                sampled[key] = dist[rng.randint(len(dist))]
        return sampled

    candidates = []
    val_scores = []
    for random_state in range(1, iterations + 1):
        parameters = sample_parameters(random_state)

        fold_scores = []
        for in_train, in_val in fold_masks:
            pipe = build_pipeline(parameters)
            pipe.fit(X_other.loc[in_train, :], y_other[in_train])
            y_pred = pipe.predict(X_other.loc[in_val, :])
            fold_scores.append(median_absolute_error(y_other[in_val], y_pred))

        candidates.append(parameters)
        val_scores.append(np.mean(fold_scores))

    # Refit the winning configuration on every pre-2022 row.
    best_index = int(np.argmin(val_scores))
    best_model = build_pipeline(candidates[best_index])
    best_model.fit(X_other, y_other)
    train_score = mean_absolute_error(y_other, best_model.predict(X_other))
    val_score = val_scores[best_index]
    test_score = mean_absolute_error(y_test, best_model.predict(X_test))

    print("Results for Manual Cross-Validation:")
    print(f"training score is {train_score}")
    print(f"validation score is {val_score}")
    print(f"test_score is {test_score}")
    return best_model, train_score, val_score, test_score


In [12]:
def sklearn_model(model, param_dict, X, y, iterations = 75, random_state = None):
    """Tune ``model`` with ``RandomizedSearchCV`` on year-based folds and report its errors.

    Rows with ``year < 2022`` are used for the search; rows with
    ``year == 2022`` are held out as the test set. Candidates are ranked by
    median absolute error over the four folds defined below, and the search
    refits the best candidate on all pre-2022 rows.

    Parameters
    ----------
    model : estimator
        Estimator placed as the last step of the pipeline.
    param_dict : dict
        Search space keyed by the model's own parameter names; the pipeline
        step prefix is added here. The dict passed in is not modified.
    X : pandas.DataFrame
        Must contain a ``year`` column and every feature column listed below.
    y : pandas.Series
        Target values, aligned with ``X``.
    iterations : int, default 75
        Number of parameter settings sampled by the search.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``RandomizedSearchCV`` so the sampled candidates can be
        reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(grid, train_score_mae, val_score_mae, test_score_mae)``. ``grid``
        is the fitted search object. The train and test scores are mean
        absolute errors. The validation score is the best cross-validated
        median absolute error, reported as a positive error.
    """
    is_other = X['year'] < 2022
    is_test = X['year'] == 2022
    X_other, X_test = X.loc[is_other, :], X.loc[is_test, :]
    y_other, y_test = y.loc[is_other], y.loc[is_test]

    # (training years, validation years) for each fold.
    folds = [(range(2002, 2006, 2), [2006, 2008]),
             (range(2006, 2010, 2), [2010, 2012]),
             (range(2010, 2014, 2), [2014, 2016]),
             (range(2014, 2018, 2), [2018, 2020])]
    cv = CustomTimeSeriesCV(folds)

    one_hot_fts = ['office_type', 'final_rating', 'open_seat']
    std_fts = ['midterm', 'incumbent_margin', 'covi_num','special', 'prev_gb_margin', 'prev2_gb_margin',
               'mean_specials_differential', 'pvi', 'previous_cci', 'current_cci',
               'previous_gas', 'current_gas',  'previous_unemployment',
               'current_unemployment', 'absenteeexcusereq', 'pollhours',
               'avgpollhours', 'maxpollhours', 'minpollhours', 'regdeadlines',
               'voteridlaws', 'novoterid', 'noallmailvote', 'noearlyvote',
               'nofelonreg', 'nofelonsregafterincar', 'nonstrictid', 'nonstrictphoto',
               'noonlineregistration', 'nopermanentabsentee', 'nopollplacereg', 'nopr',
               'nosamedayreg', 'nostateholiday', 'pr16', 'pr17', 'pr175', 'pr60',
               'pr90', 'strictid', 'strictphoto', 'house_chamber_margin',
               'senate_chamber_margin', 'change_cci', 'change_unemployment']

    preprocessor = ColumnTransformer([
        # handle_unknown='ignore': a category that only shows up in the
        # validation/test years is encoded as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), one_hot_fts),
        ('num', 'passthrough', std_fts)])

    pipe = make_pipeline(preprocessor, model)

    # Read the model's step name from the pipeline itself rather than
    # re-deriving it from the class name, then prefix every parameter with it.
    model_step = pipe.steps[-1][0]
    search_space = {f"{model_step}__{key}": value for key, value in param_dict.items()}

    grid = RandomizedSearchCV(pipe, search_space, n_iter=iterations,
                              scoring='neg_median_absolute_error', cv=cv,
                              verbose=1, random_state=random_state)
    grid.fit(X_other, y_other)

    train_score_mae = mean_absolute_error(y_other, grid.predict(X_other))
    # best_score_ is the *negated* median absolute error (sklearn maximises
    # scores); flip the sign so it is a positive error like the other two.
    val_score_mae = -grid.best_score_
    test_score_mae = mean_absolute_error(y_test, grid.predict(X_test))

    print("Results for Sklearn Cross-Validation:")
    print(f"training score is {train_score_mae}")
    print(f"validation score is {val_score_mae}")
    print(f"test_score is {test_score_mae}")

    return (grid, train_score_mae, val_score_mae, test_score_mae)

In [None]:
# Load the cleaned dataset, drop the district column, double `pvi`, and flag
# every year that is not a multiple of four as a midterm.
data = pd.read_csv("../cleaned_data/Finalized Dataset.csv")
filtered_data = data.drop(columns=['district']).assign(
    pvi=lambda frame: frame['pvi'] * 2,
    midterm=lambda frame: frame['year'] % 4 != 0,
)

# Split into the target (`margin`) and everything else.
y = filtered_data['margin']
X = filtered_data.drop(columns='margin')

# XGBoost search space.
# scipy's randint(low, high) draws integers in [low, high - 1];
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist_xgb = dict(
    n_estimators=randint(10, 251),        # integers 10..250
    max_depth=randint(3, 16),             # integers 3..15
    learning_rate=uniform(0.001, 0.199),  # floats in [0.001, 0.2]
    subsample=uniform(0.3, 0.7),          # floats in [0.3, 1.0]
    colsample_bytree=uniform(0.3, 0.7),   # floats in [0.3, 1.0]
    min_child_weight=randint(5, 16),      # integers 5..15
    gamma=uniform(0.01, 99.99),           # floats in [0.01, 100]
    reg_alpha=uniform(0.01, 99.99),       # floats in [0.01, 100]
    reg_lambda=uniform(0.01, 99.99),      # floats in [0.01, 100]
)

xgb = xgboost.XGBRegressor(n_jobs=-1)

# Alternative manual search, currently disabled:
# no_sklearn_model(xgb, param_dist_xgb, X, y, iterations = 50)
grid, _, _, _ = sklearn_model(xgb, param_dist_xgb, X, y, iterations=50)
print(grid.best_estimator_.get_params())

In [None]:
# Tuned XGBoost configurations, transcribed from printed estimator reprs.
#
# The pasted reprs were not valid Python: one ended in a literal `...` after
# keyword arguments (a SyntaxError) and used a bare `nan`. Only the values that
# were explicitly set are passed here. Keywords the repr showed as `None` are
# omitted, which leaves them at the library defaults.
#
# NOTE(review): both reprs stop at `random_state`, so any tuned values that
# would have been printed after it (e.g. reg_alpha, reg_lambda, subsample) are
# not captured here — TODO recover them from the original search objects.
# The variable names are kept as they were in case later cells refer to them.
hyperopt = xgboost.XGBRegressor(
    colsample_bytree=0.8037356348820666,
    enable_categorical=False,
    gamma=7.392364374366169,
    learning_rate=0.04199619532047469,
    max_depth=5,
    min_child_weight=7,
    missing=np.nan,
    n_estimators=68,
    n_jobs=-1,
)

sklearn = xgboost.XGBRegressor(
    colsample_bytree=0.7724255609402491,
    enable_categorical=False,
    gamma=14.095496265247299,
    learning_rate=0.152283064794593,
    max_depth=6,
    min_child_weight=10,
    n_estimators=107,
    n_jobs=-1,
)
