In [18]:
import pandas as pd
import numpy as np
import pickle as pkl
import lightgbm as lgb
import re
import time
from quantile_forest import RandomForestQuantileRegressor
from copy import deepcopy

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import  accuracy_score, median_absolute_error, make_scorer, mean_pinball_loss, log_loss, mean_squared_error
from sklearn.base import clone

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss

class CustomTimeSeriesCV(BaseCrossValidator):
    """Creates an iterator that contains the indices from each dataset based on the years given"""
    def __init__(self, years):
        self.years = years

    def split(self, X, y=None, groups=None):
        for train_years, test_years in self.years:
            train_indices = np.where(X['year'].isin(train_years))[0]
            test_indices = np.where(X['year'].isin(test_years))[0]
            yield train_indices, test_indices
        
    def get_n_splits(self, X=None, y=None, groups=None):
        return len(self.years) 

In [19]:
np.seterr(divide='ignore', invalid='ignore')

#Categorical features that need to be one-hot encoded    
one_hot_fts = ['office_type']

#Rating is the only ordinal feature
ordinal_fts = ['final_rating']
ordinal_fts_ranking = ['Safe R', 'Likely R', 'Leans R', 'Toss-up', 'Leans D', 'Likely D', 'Safe D']

#Cont features that should be pass-throughed (aznd later scaled)
cont_fts = [
    "open_seat", "incumbent_differential", "special", "absenteeexcusereq", "pollhours", "avgpollhours", "minpollhours",
    "regdeadlines", "voteridlaws", "novoterid", "noallmailvote", "noearlyvote", "nofelonreg",
    "nofelonsregafterincar", "nonstrictid", "nonstrictphoto", "nopollplacereg", "nopr", "nosamedayreg",
    "nostateholiday", "pr16", "pr17", "pr175", "pr60", "pr90", "strictid", "strictphoto", "covi_num",
    "prev_dem_gen_tp", "prev_gen_margin", "weighted_genpoll", "weighted_genpoll_lower",
    "weighted_genpoll_upper", "unweighted_genpoll", "mean_specials_differential", 
    "house_chamber_margin", "senate_chamber_margin", "previous_cci", "current_cci", "change_cci",
    "previous_gas", "current_gas", "change_gas", "previous_unemployment", "current_unemployment",
    "change_unemployment",  "receipts", "from_committee_transfers", "disbursements",
    "to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
    "individual_contributions", "unconvinced_pct", "phone_unweighted", "online_unweighted", "num_polls",
    "unweighted_estimate", "unweighted_ci_lower", "unweighted_ci_upper", "weighted_estimate",
    "weighted_ci_lower", "weighted_ci_upper", "white_pct", "black_pct", "asian_pct", "hispanic_pct",
    "median_income", "impoverished_pct", "median_age", "renting_pct", "inflation", "isMidterm",
    "genballot_predicted_margin", "genballot_predicted_lower", "genballot_predicted_upper",
    "poll_fundamental_agree",  'receipts_DEM', 'receipts_REP', 'disbursements_DEM', 'disbursements_REP'
]


def optima_model(model_mean, model_std, param_dict, X, y, penalizing_factor = 4, **kwargs):
    """Performs hyperparameter optimization for a a given bootstrapped X 
    ## Parameters:
    model: sklearnable model. We use LGBMRegressor 
    param_dict: dictionary of hyperparameters to optimize
    X: DataFrame with features
    y: Series with target variable"""
    
    X_other, y_other = X.loc[X['year'] <= 2022, :], y.loc[X['year'] <= 2022]
    X_train, X_test, y_train, y_test = (X.loc[X['year'] < 2022, :], X.loc[X['year'] == 2022, :], 
                                        y.loc[X['year'] < 2022], y.loc[X['year'] == 2022])
    
    def penalize_wrong(y_true, y_pred, penalizing_factor):
        """Penalizes wrong predictions by penalizing_factor"""
        return np.mean(np.abs(y_true - y_pred) * (1 + penalizing_factor * (y_true != np.sign(y_pred))))
    
    # Create fold structure so we can make a custom cross-validation for time-series
    folds = [
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])
    ]

    cv = CustomTimeSeriesCV(folds)
        
    #Preprocessing data: no need to scale data, because we use tree-based models which are monotonic-scale-invariant
    #Because we don't need to scale data, we don't have to include the column transformer in the final saved model
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value', 
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])
    
    def objective_mean(params):
        "Function that takes in hyperparameters and returns loss, that Hyperopt will minimize."        
        training_loss = []
        testing_loss = []
        accuracies = []
        for train_idx, test_idx in cv.split(X_train):   
            mean_reg = model_mean(**params)
            mean_pipe = Pipeline(steps = [
                ('preprocessing', preprocessor), 
                ('model', mean_reg)])
            
            """Goes through each fold and calculates loss.
            Note: We use median absolute error because it is more robust to outliers than mean absolute error.
            We also expect earlier folds to have higher error, since they have less data to train on."""
            pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
            
            predictions = pipe.predict(X_train.iloc[test_idx])
            training_preds = pipe.predict(X_train.iloc[train_idx])
            testing_loss.append(mean_squared_error(y_train.iloc[test_idx], predictions, squared = False))
            training_loss.append(penalize_wrong(y_train.iloc[train_idx], training_preds, penalizing_factor))
            accuracies.append(accuracy_score(np.sign(y_train.iloc[test_idx]), np.sign(predictions)))
         
        return {'loss': np.mean(testing_loss), 'status': STATUS_OK}


    start_time = time.time()
    max_time = 120 #about two minutes per run-through
    def stop(trial, elapsed_time=0):
        return elapsed_time > max_time, [time.time() - start_time] 
    
    "Hyperopt uses the TPE algorithm to optimize hyperparameters. We use the no_progress_loss function to stop early if we don't see progress."
    best_params = fmin(fn=objective,
                    space=param_dict,
                    algo=tpe.suggest,
                    trials=Trials(),
                    early_stop_fn = stop)
                    
                    
    print("Best parameters pre-placing:", best_params)
    best_model = model(**best_params, **kwargs)
    print("Parameters post-placing:", best_model.get_params())
    pipe = Pipeline(steps = [
        ('preprocessing', preprocessor), 
        ('model', best_model)])
    
    #Returns a fitted ML algortithm with those hyperparameters
    pipe.fit(X_train, y_train) 
    #Returns the final model   
    return pipe

In [20]:
#Bootstraps X and y, and then runs the optimization function
def bootstrap(group, n=None):
    probs = np.random.exponential(1, len(group))
    probs = probs / sum(probs)
    if n is None:
        n = len(group)
    return group.sample(n, replace=True, weights=probs)

data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value', 
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])

names_for_monotonicity = preprocessor.fit(data.drop(columns=['margin'])).get_feature_names_out()
before_processing_monotonic_columns = ['incumbent_differential', 'receipts', 'disbursements', 
                                       'genballot_predicted_margin', 'specials_predicted_margin', 'unweighted_estimate', 'unweighted_ci_lower',
                                       'unweighted_ci_upper','weighted_estimate', 'weighted_ci_lower', 'weighted_ci_upper',
                                       'phone_unweighted', 'online_unweighted', 'receipts_genballot_interaction',
                                       'disbursements_genballot_interaction', 'poll_fundamental_average']

monotonic_columns = ['num__' + name for name in before_processing_monotonic_columns] + ['ord__final_rating']
monotone_constraints = [1 if name in monotonic_columns else 0 for name in names_for_monotonicity]

# Define the search space for Hyperopt
param_dist_lgbm = {
    'boosting_type': 'dart',
    'num_leaves': hp.randint('num_leaves', 20, 70),  # Reduced the upper limit, 
    'n_estimators': hp.randint('n_estimators', 50, 200),  # Increased the range
    'learning_rate': hp.loguniform('learning_rate', -5, -2),  # Equivalent to about 0.0001 to 0.01
    'subsample_for_bin': hp.randint('subsample_for_bin', 20000, 200000),  # Narrowed the range
    'min_data_in_bin': hp.randint('min_data_in_bin', 1, 10), 
    'min_data_in_leaf': hp.randint('min_data_in_leaf', 1, 10),  # Reduced the upper limit
    'min_child_samples': hp.randint('min_child_samples', 20, 150),  # Increased the range for more regularization
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.5),  # Increased upper limit for L1 regularization
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.5),  # Increased upper limit for L2 regularization
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),  # Reduced the upper limit
    'subsample': hp.uniform('subsample', 0.5, 0.8),  # Reduced the upper limit for more randomness
    'max_depth': hp.randint('max_depth', 2, 10),  # Added max_depth for additional control
    "verbose": -1,  # Keep verbose to -1 to reduce log clutter, 
    'monotone_constraints': monotone_constraints, 
    'monotone_constraints_method': 'advanced'
}


X = data.drop(columns=['margin'])
y = data['margin']
trained_lgbm = optima_model(lgb.LGBMRegressor, param_dist_lgbm, X, y, 
                                boosting_type = 'dart', monotone_constraints = monotone_constraints, verbosity = -1, 
                                 monotone_constraints_method = 'advanced')
"""num_trials = 1
for idx in range(num_trials):
    bootstrapped_data = data.groupby(['year', 'office_type']).apply(bootstrap).reset_index(drop=True)
    
    bootstrapped_X = bootstrapped_data.drop(columns=['margin'])
    bootstrapped_y = bootstrapped_data['margin']
    
    trained_lgbm = optima_model(lgb.LGBMRegressor, param_dist_lgbm, bootstrapped_X, bootstrapped_y, 
                                boosting_type = 'dart', monotone_constraints = monotone_constraints, verbosity = -1)
    file_path = f"../models/Model_{idx}.pkl"

    # Open a file to write in binary mode????        
    with open(file_path, 'wb') as file:
        pkl.dump(trained_lgbm, file)"""


  0%|          | 12/9223372036854775807 [02:42<34670257097068311:53:36, 13.53s/trial, best loss: 8.594113325510568]
Best parameters pre-placing: {'colsample_bytree': 0.606575319505533, 'learning_rate': 0.07600029323759298, 'max_depth': 4, 'min_child_samples': 121, 'min_data_in_bin': 5, 'min_data_in_leaf': 4, 'n_estimators': 187, 'num_leaves': 26, 'reg_alpha': 0.20756314283998195, 'reg_lambda': 0.01811235338890299, 'subsample': 0.5547541265803874, 'subsample_for_bin': 198362}
Parameters post-placing: {'boosting_type': 'dart', 'class_weight': None, 'colsample_bytree': 0.606575319505533, 'importance_type': 'split', 'learning_rate': 0.07600029323759298, 'max_depth': 4, 'min_child_samples': 121, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 187, 'n_jobs': None, 'num_leaves': 26, 'objective': None, 'random_state': None, 'reg_alpha': 0.20756314283998195, 'reg_lambda': 0.01811235338890299, 'subsample': 0.5547541265803874, 'subsample_for_bin': 198362, 'subsample_freq': 0, 'm

'num_trials = 1\nfor idx in range(num_trials):\n    bootstrapped_data = data.groupby([\'year\', \'office_type\']).apply(bootstrap).reset_index(drop=True)\n    \n    bootstrapped_X = bootstrapped_data.drop(columns=[\'margin\'])\n    bootstrapped_y = bootstrapped_data[\'margin\']\n    \n    trained_lgbm = optima_model(lgb.LGBMRegressor, param_dist_lgbm, bootstrapped_X, bootstrapped_y, \n                                boosting_type = \'dart\', monotone_constraints = monotone_constraints, verbosity = -1)\n    file_path = f"../models/Model_{idx}.pkl"\n\n    # Open a file to write in binary mode????        \n    with open(file_path, \'wb\') as file:\n        pkl.dump(trained_lgbm, file)'

In [91]:
from scipy.stats import multivariate_normal
def create_noise(data, correlation_matrix, noise_level = 0.000001): 
        fts = [
                "weighted_genpoll", "weighted_genpoll_lower",
                "weighted_genpoll_upper", "unweighted_genpoll", "receipts", "from_committee_transfers", "disbursements",
                "to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
                "individual_contributions", "unconvinced_pct", "phone_unweighted", "online_unweighted",
                "unweighted_estimate", "unweighted_ci_lower", "unweighted_ci_upper", "weighted_estimate",
                "weighted_ci_lower", "weighted_ci_upper", 
                "genballot_predicted_margin", "genballot_predicted_lower", "genballot_predicted_upper"
        ]
        noisy_df = deepcopy(data)[fts]
        #Setting all to zero so we can add noise
        noisy_df.loc[:, :] = 0
        #Calculate variance for each feature that does not have a self-contained correlation (like polls do, given meta-analyses)
       
        """for col in ["receipts", "from_committee_transfers", "disbursements",
                "to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
                "individual_contributions", "unconvinced_pct", "phone_unweighted", "online_unweighted"]:
                standard_deviation = noise_level * np.std(data[col])
                
                print(f"Variance for {col} is {standard_deviation}")
                #We want to create a multivariate normal distribution with the given correlation matrix
                diag_var_matrix = np.eye(len(data)) * standard_deviation
                cov_matrix = diag_var_matrix @ correlation_matrix @ diag_var_matrix
                noise = multivariate_normal.rvs(mean = noisy_df[col], cov = cov_matrix)
                noisy_df.loc[:, col] = noise
                print(noise) """
                
        # Preparing to add noise
        features = ["receipts", "from_committee_transfers", "disbursements",
                "to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
                "individual_contributions", "unconvinced_pct", "phone_unweighted", "online_unweighted"]
        
        for col in features:
                # Generate initial independent noise for each feature
                print(data[col].std())
                initial_noise = np.random.normal(0, noise_level * data[col].std(), data[col].shape)
                print(initial_noise)
                # Reshape noise to (len(data), 1) to apply correlation matrix
                reshaped_noise = initial_noise.reshape(-1, 1)
                
                # Apply the correlation matrix to induce correlated noise across races
                correlated_noise = correlation_matrix @ reshaped_noise
                
                # Flatten the noise back to original shape and add to data
                noisy_df[col] = data[col] + correlated_noise.ravel()
                #print(noisy_df)


        #Calculate variance for each feature that has a self-contained correlation
        var_weighted_genpoll = np.mean((data['weighted_genpoll_upper'] - data['weighted_genpoll'])/1.96)
        std_unweighted_estimate = np.square((data['unweighted_ci_upper'] - data['unweighted_ci_lower'])/1.96)
        std_weighted_estimate = np.square((data['weighted_ci_upper'] - data['weighted_ci_lower'])/1.96)
        
        
        
        
        
        
X_2024 = data.loc[data['year'] == 2024].drop(columns=['margin'])
with open(f"../bootstrapped_models_testing/Model_1.pkl", 'rb') as file:
        model = pkl.load(file)
contributions = model.predict(X_2024, pred_contrib = True)
create_noise(X_2024, np.corrcoef(contributions), noise_level = 0.0001)

4.359549764576093
[ 6.14820848e-01 -2.46447340e-01 -5.12583052e-01  2.76574640e-01
  5.96477574e-01  3.97137379e-01 -3.03227749e-01  9.53090796e-02
 -2.40061491e-01 -2.04705433e-02 -5.85291777e-01 -1.50405477e-01
  3.99367574e-01 -1.05600867e-01 -4.92571650e-01  1.33343361e-01
  2.82649884e-01 -3.72012266e-02  5.63642754e-01 -9.12443043e-02
  3.88965646e-01  1.55414895e-01  1.82312652e-02  1.93531013e-01
  3.43188440e-01 -5.07829002e-01 -4.26024665e-01 -2.03662923e-01
 -6.92821565e-01  6.80344261e-01  3.95218602e-01  3.20396496e-01
  1.45448111e-01 -3.76201423e-01 -6.53113870e-01  3.59082870e-02
  8.46205140e-01  6.54990995e-01 -3.89571676e-01  8.36802144e-01
 -1.29613123e-01 -6.71957596e-01  6.73469356e-01  1.57321921e-01
  6.24007047e-02  6.15191264e-01 -1.08437490e-01  6.99000217e-01
  2.91199542e-01  4.01163728e-01 -1.68532675e-01 -5.40437884e-02
  1.34650596e+00  3.33789199e-01  8.91621472e-02  3.97645509e-01
 -5.05146672e-01  6.65218312e-02 -3.57439554e-02  3.41599952e-01
 -4.721

In [81]:
np.corrcoef(contributions)

array([[ 1.        ,  0.66748909,  0.99578463, ...,  0.92801791,
         0.9603535 , -0.8863052 ],
       [ 0.66748909,  1.        ,  0.67077219, ...,  0.60440783,
         0.67674305, -0.76031295],
       [ 0.99578463,  0.67077219,  1.        , ...,  0.92915821,
         0.96946533, -0.90347179],
       ...,
       [ 0.92801791,  0.60440783,  0.92915821, ...,  1.        ,
         0.96197128, -0.85855456],
       [ 0.9603535 ,  0.67674305,  0.96946533, ...,  0.96197128,
         1.        , -0.93909064],
       [-0.8863052 , -0.76031295, -0.90347179, ..., -0.85855456,
        -0.93909064,  1.        ]])