In [7]:
import pickle as pkl
import re
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import  accuracy_score, median_absolute_error, make_scorer

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from hyperopt.early_stop import no_progress_loss

class CustomTimeSeriesCV(BaseCrossValidator):
    """Cross-validator whose folds are given explicitly as (train_years, test_years) pairs.

    Rows are assigned to a fold by looking up their value in the ``year``
    column of ``X``; the yielded arrays hold positional row indices.
    """

    def __init__(self, years):
        # Iterable of (train_years, test_years) pairs, one pair per fold.
        self.years = years

    def split(self, X, y=None, groups=None):
        """Yield (train_positions, test_positions) for every configured fold."""
        for fit_years, holdout_years in self.years:
            fit_mask = X['year'].isin(fit_years).to_numpy()
            holdout_mask = X['year'].isin(holdout_years).to_numpy()
            yield np.flatnonzero(fit_mask), np.flatnonzero(holdout_mask)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds, i.e. the number of year pairs supplied."""
        return len(self.years)


In [8]:
# Silence NumPy floating-point warnings for division by zero and invalid operations.
np.seterr(divide='ignore', invalid='ignore')

# Categorical columns handed to the one-hot encoder.
one_hot_fts = ["state", "office_type"]

# The single ordinal column, together with the order of its levels (first = lowest code).
ordinal_fts = ["final_rating"]
ordinal_fts_ranking = [
    "Safe R", "Likely R", "Leans R", "Toss-up", "Leans D", "Likely D", "Safe D",
]

# Numeric columns that the column transformer passes through unchanged.
cont_fts = [
    "open_seat", "incumbent_differential", "special", "absenteeexcusereq",
    "pollhours", "avgpollhours", "minpollhours", "regdeadlines",
    "voteridlaws", "novoterid", "noallmailvote", "noearlyvote",
    "nofelonreg", "nofelonsregafterincar", "nonstrictid", "nonstrictphoto",
    "nopollplacereg", "nopr", "nosamedayreg", "nostateholiday",
    "pr16", "pr17", "pr175", "pr60",
    "pr90", "strictid", "strictphoto", "covi_num",
    "prev_dem_gen_tp", "prev_gen_margin", "weighted_genpoll", "weighted_genpoll_lower",
    "weighted_genpoll_upper", "unweighted_genpoll", "mean_specials_differential", "house_chamber_margin",
    "senate_chamber_margin", "previous_cci", "current_cci", "change_cci",
    "previous_gas", "current_gas", "change_gas", "previous_unemployment",
    "current_unemployment", "change_unemployment", "receipts", "from_committee_transfers",
    "disbursements", "to_committee_transfers", "beginning_cash", "ending_cash",
    "candidate_contributions", "individual_contributions", "unconvinced_pct", "phone_unweighted",
    "online_unweighted", "num_polls", "unweighted_estimate", "unweighted_ci_lower",
    "unweighted_ci_upper", "weighted_estimate", "weighted_ci_lower", "weighted_ci_upper",
    "white_pct", "black_pct", "asian_pct", "hispanic_pct",
    "median_income", "impoverished_pct", "median_age", "renting_pct",
    "inflation", "isMidterm", "genballot_predicted_margin", "genballot_predicted_lower",
    "genballot_predicted_upper", "poll_fundamental_agree", "receipts_DEM", "receipts_REP",
    "disbursements_DEM", "disbursements_REP",
]

def optima_model(model, param_dict, X, y, penalizing_factor = 4, max_time = 120):
    """Tune hyperparameters with Hyperopt on one (bootstrapped) dataset, then fit a final model.

    The search minimises a sign-penalised mean absolute error, averaged over
    three expanding-window folds built from the ``year`` column. Only rows
    with ``year < 2022`` take part in the search; the final model is refit on
    all rows with ``year <= 2022`` using the best configuration found.

    ## Parameters:
    model: estimator class (not an instance), called as ``model(**params)``. We use LGBMRegressor
    param_dict: Hyperopt search space; may mix ``hp.*`` expressions with fixed values
    X: DataFrame with features; must contain ``year`` and every column named in
       ``one_hot_fts``, ``ordinal_fts`` and ``cont_fts``
    y: Series with target variable, sharing X's index
    penalizing_factor: extra weight applied to the absolute error of a row whose
       predicted sign differs from the actual sign
    max_time: wall-clock budget for the search, in seconds

    ## Returns:
    The fitted estimator only (the ``model`` step of the pipeline). It was trained on
    the output of the column transformer, so callers must apply the same
    preprocessing before calling ``predict``.
    """

    def penalize_wrong(y, y_pred, penalty):
        """Mean absolute error in which wrong-sign rows are weighted by (1 + penalty)."""
        wrong_sign = np.sign(y_pred) != np.sign(y)
        return np.mean(abs(y_pred - y) * (1 + penalty * wrong_sign))

    # Search on years before 2022; the final fit additionally includes 2022.
    tuning_rows = X['year'] < 2022
    final_rows = X['year'] <= 2022
    X_train, y_train = X.loc[tuning_rows, :], y.loc[tuning_rows]
    X_other, y_other = X.loc[final_rows, :], y.loc[final_rows]

    # Create fold structure so we can make a custom cross-validation for time-series:
    # each fold trains on every even year from 2002 up to the test window.
    folds = [
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])
    ]

    cv = CustomTimeSeriesCV(folds)

    #Preprocessing data: no need to scale data, because we use tree-based models which are monotonic-scale-invariant
    #Because we don't need to scale data, we don't have to include the column transformer in the final saved model
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value',
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])

    def objective(params):
        "Function that takes in hyperparameters and returns loss, that Hyperopt will minimize."
        fold_losses = []
        for train_idx, test_idx in cv.split(X_train):
            # The pipeline refits the preprocessor on each fold's training rows.
            pipe = Pipeline(steps = [
                ('preprocessing', preprocessor),
                ('model', model(**params))])
            pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])

            # Score only the held-out years of this fold.
            predictions = pipe.predict(X_train.iloc[test_idx])
            fold_losses.append(penalize_wrong(y_train.iloc[test_idx], predictions, penalizing_factor))

        return {'loss': np.mean(fold_losses), 'status': STATUS_OK}

    start_time = time.time()

    def stop(trials, *args):
        # Hyperopt calls this after every trial; stop once the wall-clock budget is spent.
        # The second element is the argument list Hyperopt passes to the next call (none needed).
        return time.time() - start_time > max_time, []

    # Hyperopt uses the TPE algorithm to optimize hyperparameters. No max_evals is given,
    # so the time-based early-stop callback above is what ends the search.
    best_assignment = fmin(fn=objective,
                           space=param_dict,
                           algo=tpe.suggest,
                           trials=Trials(),
                           early_stop_fn = stop)

    print("Best parameters:", best_assignment)

    # fmin only returns the values of the labelled hp.* expressions. Map them back through
    # the search space so the fixed entries of param_dict (e.g. boosting type, monotone
    # constraints) are also applied to the final model, exactly as they were in each trial.
    best_params = space_eval(param_dict, best_assignment)
    best_model = model(**best_params)
    pipe = Pipeline(steps = [
        ('preprocessing', preprocessor),
        ('model', best_model)])

    #Returns a fitted ML algortithm with those hyperparameters
    pipe.fit(X_other, y_other)
    #Returns the final model
    return pipe.named_steps['model']

In [10]:
#Bootstraps X and y, and then runs the optimization function
def bootstrap(group, n=None):
    """Resample ``group`` with replacement.

    Draws ``n`` rows; when ``n`` is None the sample has as many rows as ``group``.
    """
    sample_size = len(group) if n is None else n
    return group.sample(n=sample_size, replace=True)

data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")
# Drop every character that is not a letter, digit or underscore from the column names.
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Same column layout that optima_model builds internally. It is fitted here only to recover
# the post-encoding feature names, so the monotone constraints can be matched by name.
preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value',
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])

names_for_monotonicity = preprocessor.fit(data.drop(columns=['margin'])).get_feature_names_out()
# Columns that should receive a +1 (non-decreasing) monotone constraint.
# Names that are not listed in cont_fts never match an encoded feature name below and are
# therefore silently skipped (currently: pvi, receipts_ratio, disbursements_ratio,
# specials_predicted_margin, receipts_genballot_interaction,
# disbursements_genballot_interaction, poll_fundamental_average).
before_processing_monotonic_columns = ['incumbent_differential', 'pvi', 'receipts_ratio', 'disbursements_ratio',
                                       'genballot_predicted_margin', 'specials_predicted_margin', 'unweighted_estimate', 'unweighted_ci_lower',
                                       'unweighted_ci_upper','weighted_estimate', 'weighted_ci_lower', 'weighted_ci_upper',
                                       'phone_unweighted', 'online_unweighted', 'receipts_genballot_interaction',
                                       'disbursements_genballot_interaction', 'poll_fundamental_average']

monotonic_columns = ['num__' + name for name in before_processing_monotonic_columns] + ['ord__final_rating']
# One entry per encoded feature, in encoder output order: 1 = constrained, 0 = unconstrained.
# NOTE(review): the names come from the full dataset; a bootstrap sample that lacks a
# state/office_type category would encode to fewer columns than this list has entries —
# confirm this cannot happen in practice.
monotone_constraints = [1 if name in monotonic_columns else 0 for name in names_for_monotonicity]

# Define the search space for Hyperopt.
# hp.randint(label, low, high) draws integers from [low, high).
param_dist_lgbm = {
    'boosting_type': 'dart',
    'num_leaves': hp.randint('num_leaves', 20, 70),
    'n_estimators': hp.randint('n_estimators', 50, 200),
    'learning_rate': hp.loguniform('learning_rate', -5, -2),  # exp(-5) to exp(-2), i.e. about 0.0067 to 0.135
    'subsample_for_bin': hp.randint('subsample_for_bin', 20000, 200000),
    'min_data_in_bin': hp.randint('min_data_in_bin', 1, 10),
    # NOTE(review): LightGBM documents min_child_samples as an alias of min_data_in_leaf, so
    # only one of the next two ranges can take effect — TODO decide which one is intended.
    'min_data_in_leaf': hp.randint('min_data_in_leaf', 1, 10),
    'min_child_samples': hp.randint('min_child_samples', 20, 150),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.5),  # L1 regularization
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.5),  # L2 regularization
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),
    # NOTE(review): LightGBM only applies row subsampling when a bagging frequency
    # (subsample_freq) is also set; none is set here — verify this is intended.
    'subsample': hp.uniform('subsample', 0.5, 0.8),
    'max_depth': hp.randint('max_depth', 2, 10),
    "verbose": -1,  # Keep verbose to -1 to reduce log clutter
    'monotone_constraints': monotone_constraints
}


# Number of bootstrap replicates; each one yields a separately tuned and pickled model.
num_trials = 1000
for idx in range(num_trials):
    # Resample rows with replacement inside every (year, office_type) group, keeping each
    # group's size. GroupBy.sample keeps the grouping columns in the result regardless of
    # pandas version, unlike groupby(...).apply(...), whose handling of them is deprecated.
    bootstrapped_data = (data.groupby(['year', 'office_type'])
                             .sample(frac=1.0, replace=True)
                             .reset_index(drop=True))

    bootstrapped_X = bootstrapped_data.drop(columns=['margin'])
    bootstrapped_y = bootstrapped_data['margin']

    trained_lgbm = optima_model(lgb.LGBMRegressor, param_dist_lgbm, bootstrapped_X, bootstrapped_y)
    file_path = f"../models/Model_{idx}.pkl"

    # Pickle the fitted estimator; the file must be opened in binary mode for pickle.
    with open(file_path, 'wb') as file:
        pkl.dump(trained_lgbm, file)


  0%|          | 9/9223372036854775807 [00:50<14268253935650313:06:08,  5.57s/trial, best loss: 9.292528501630335] 


KeyboardInterrupt: 

In [11]:
# Display the rows of the 2022 election cycle.
data[data['year'] == 2022]

Unnamed: 0,Unnamed0,year,state,district,office_type,open_seat,incumbent_differential,margin,special,absenteeexcusereq,...,genballot_predicted_margin,genballot_predicted_lower,genballot_predicted_upper,specials_predicted_margin,receipts_genballot_interaction,disbursements_genballot_interaction,democrat_in_presidency,gas_democrat_interaction,cci_democrat_interaction,poll_fundamental_agree
3745,3746,2022,AK,1,House,True,0.000000,-0.313342,False,0.0,...,-17.872868,-19.171587,-16.573529,-12.826667,-9.125252,-6.884324,True,3.769,58.6,-1.0
3746,3747,2022,AL,2,House,False,1.081648,-39.930439,False,1.0,...,-35.391220,-36.689939,-34.091882,-30.345019,82.593152,66.754214,True,3.769,58.6,
3747,3748,2022,AL,3,House,False,-2.500547,-46.082056,False,1.0,...,-43.053414,-44.352134,-41.754076,-38.007213,233.573105,210.000879,True,3.769,58.6,
3748,3749,2022,AL,4,House,False,-2.897236,-70.484282,False,1.0,...,-71.210103,-72.508823,-69.910765,-66.163902,300.501716,268.725624,True,3.769,58.6,
3749,3750,2022,AL,5,House,True,0.000000,-37.535854,False,1.0,...,-35.172868,-36.471587,-33.873529,-30.126667,153.421936,159.031281,True,3.769,58.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,5267,2022,TN,0,Governor,False,5.148912,-32.000000,False,1.0,...,-25.241511,-26.540231,-23.942173,-20.195310,,,True,3.769,58.6,1.0
5267,5268,2022,TX,0,Governor,False,-1.701332,-10.900000,False,1.0,...,-14.133995,-15.432714,-12.834656,-9.087794,,,True,3.769,58.6,1.0
5268,5269,2022,VT,0,Governor,False,-72.513675,-47.200000,False,0.0,...,-43.301650,-44.600370,-42.002312,-38.255450,,,True,3.769,58.6,1.0
5269,5270,2022,WI,0,Governor,False,0.455868,3.400000,False,0.0,...,-5.096188,-6.394908,-3.796850,-0.049988,,,True,3.769,58.6,1.0
