In [1]:
import pickle as pkl
import re
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from hyperopt.early_stop import no_progress_loss
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import BaseCrossValidator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

#Creating a custom time series cross-validator
class CustomTimeSeriesCV(BaseCrossValidator):
    """Cross-validator whose folds are defined by explicit groups of years.

    ``years`` is a sequence of ``(train_years, test_years)`` pairs, one pair
    per fold. Rows are assigned to a fold by looking up their value in the
    ``'year'`` column of the frame passed to ``split``.
    """

    def __init__(self, years):
        self.years = years

    def split(self, X, y=None, groups=None):
        """Yield one ``(train_indices, test_indices)`` pair per fold.

        Both arrays hold positional row numbers of ``X`` (suitable for
        ``.iloc``), not index labels. ``y`` and ``groups`` are accepted for
        API compatibility and ignored.
        """
        for fold_train_years, fold_test_years in self.years:
            in_train = X['year'].isin(fold_train_years)
            in_test = X['year'].isin(fold_test_years)
            # np.where on a boolean mask returns a 1-tuple; [0] is the position array.
            yield np.where(in_train)[0], np.where(in_test)[0]

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds, i.e. the number of year pairs supplied."""
        return len(self.years)

def penalize_wrong(y_true, y_pred, penalty = 4):
    """Mean absolute error that weighs wrong-sign predictions more heavily.

    Each absolute error is multiplied by ``1 + penalty`` when the sign of the
    prediction differs from the sign of the true value, and by ``1`` otherwise;
    the mean of the weighted errors is returned.
    """
    absolute_error = np.abs(y_true - y_pred)
    sign_mismatch = np.sign(y_true) != np.sign(y_pred)
    # The boolean mask is promoted to 0/1, so the weight is 1 or 1 + penalty.
    weight = 1 + penalty * sign_mismatch
    return np.mean(absolute_error * weight)


In [2]:
# Tell NumPy to stay silent on divide-by-zero and invalid floating-point operations.
# np.seterr changes NumPy's error-handling state, so this affects code run after this cell too.
np.seterr(divide='ignore', invalid='ignore')

# Categorical features that need to be one-hot encoded.
one_hot_fts = ['office_type']

# Numeric features passed through unchanged by the ColumnTransformer 'num' step.
# This model is used twice: once without reverse causality (polls/expert ratings) for campaign finance,
# once with (for the real predictions). This is the former.
# NOTE(review): the list below still contains "weighted_genpoll*" / "unweighted_genpoll" columns --
# confirm these are not among the polls this feature set is meant to exclude.
cont_fts_no_reverse_causality = [
    "open_seat", "incumbent_differential", "special", "absenteeexcusereq", "pollhours", "avgpollhours", "minpollhours",
    "regdeadlines", "voteridlaws", "novoterid", "noallmailvote", "noearlyvote", "nofelonreg",
    "nofelonsregafterincar", "nonstrictid", "nonstrictphoto", "nopollplacereg", "nopr", "nosamedayreg",
    "nostateholiday", "pr16", "pr17", "pr175", "pr60", "pr90", "strictid", "strictphoto", "covi_num",
    "prev_dem_gen_tp", "prev_gen_margin", "weighted_genpoll", "weighted_genpoll_lower",
    "weighted_genpoll_upper", "unweighted_genpoll", "mean_specials_differential", 
    "house_chamber_margin", "senate_chamber_margin", "previous_cci", "current_cci", "change_cci",
    "previous_gas", "current_gas", "change_gas", "previous_unemployment", "current_unemployment",
    "change_unemployment",  "receipts", "from_committee_transfers", "disbursements",
    "to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
    "individual_contributions", "white_pct", "black_pct", "asian_pct", "hispanic_pct",
    "median_income", "impoverished_pct", "median_age", "renting_pct", "inflation", "isMidterm",
    "genballot_predicted_margin", "genballot_predicted_lower", "genballot_predicted_upper",
    'receipts_DEM', 'receipts_REP', 'disbursements_DEM', 'disbursements_REP', 
    'average_genballot', 'genballot_individual_predicted_margin', 'genballot_campaign5_predicted_margin', 
    'genballot_campaign10_predicted_margin', 'genballot_campaign15_predicted_margin', 
    'average_genballot_predicted_margin', 'finance_fundamental_agree'
]

def optima_model(model, param_dict, data, **kwargs):
    """Tunes hyperparameters with Hyperopt on year-based folds, then fits a final pipeline.

    The search minimises the mean ``penalize_wrong`` loss over three
    expanding-window folds built from the rows with ``year < 2022``. The best
    configuration is then refit on every row with ``year <= 2022``.

    ## Parameters:
    model: estimator class (not an instance); it is called as ``model(**params)``
    param_dict: Hyperopt search space, a dict mixing ``hp.*`` expressions and fixed values
    data: DataFrame holding ``'year'``, the target ``'margin'`` and every column named in
        ``one_hot_fts`` and ``cont_fts_no_reverse_causality``
    **kwargs: extra keyword arguments for the final estimator; they override
        same-named entries of the search space

    ## Returns:
    The fitted ``Pipeline`` (preprocessing + model).
    """

    # Tuning only ever sees pre-2022 rows; 2022 enters the final fit below.
    tuning_data = data.loc[data['year'] < 2022]

    # Create fold structure so we can make a custom cross-validation for time-series:
    # each fold trains on all earlier even years and validates on the next two.
    folds = [
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])
    ]

    cv = CustomTimeSeriesCV(folds)

    # Columns not listed in either feature list are dropped by the ColumnTransformer default.
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('num', 'passthrough', cont_fts_no_reverse_causality)])

    def objective(params):
        """Takes one hyperparameter sample and returns the loss Hyperopt will minimize."""
        testing_loss = []
        # Goes through each fold and calculates loss.
        for train_idx, test_idx in cv.split(tuning_data):
            fold_train = tuning_data.iloc[train_idx]
            fold_test = tuning_data.iloc[test_idx]

            fold_pipe = Pipeline(steps = [
                ('preprocessing', preprocessor),
                ('model', model(**params))])
            fold_pipe.fit(fold_train.drop(columns = ['margin']), fold_train['margin'])

            predictions = fold_pipe.predict(fold_test.drop(columns = ['margin']))
            testing_loss.append(penalize_wrong(fold_test['margin'], predictions))

        return {'loss': np.mean(testing_loss), 'status': STATUS_OK}

    # Hyperopt uses the TPE algorithm to optimize hyperparameters. No max_evals is given,
    # so the search runs until no_progress_loss sees 40 trials without improvement.
    best = fmin(fn=objective,
                space=param_dict,
                algo=tpe.suggest,
                trials=Trials(),
                early_stop_fn=no_progress_loss(40))

    # fmin only returns the labelled hyperparameters (and an index for hp.choice).
    # space_eval maps that back onto the full search space, so the final model gets
    # exactly the configuration the objective scored, fixed entries included.
    final_params = {**space_eval(param_dict, best), **kwargs}
    final_model = model(**final_params)
    pipe = Pipeline(steps = [
        ('preprocessing', preprocessor),
        ('model', final_model)])

    # Training final model on data prior to and including 2022, so we get the full extent of the data!
    final_rows = data.loc[data['year'] <= 2022, :]
    pipe.fit(final_rows.drop(columns = ['margin']), final_rows['margin'])

    return pipe

In [3]:
data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")
# Strip every character other than ASCII letters, digits and underscores from the column names.
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Fit the same column layout that optima_model uses, purely to read off the transformed
# feature names ('cat__...' for one-hot columns, 'num__...' for passthrough columns).
preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('num', 'passthrough', cont_fts_no_reverse_causality)])

names_for_monotonicity = preprocessor.fit(data.drop(columns=['margin'])).get_feature_names_out()

# Features whose effect on the predicted margin is constrained to be non-decreasing.
positive_monotonic = ['incumbent_differential', 'receipts', 'disbursements', 'disbursements_DEM', 'receipts_DEM', 'individual_contributions_DEM',
                                       'genballot_predicted_margin', 'specials_predicted_margin', 'genballot_individual_predicted_margin', 
                                       'genballot_campaign5_predicted_margin', 'genballot_campaign10_predicted_margin', 'genballot_campaign15_predicted_margin',
                                       'average_genballot_predicted_margin']

# Features whose effect on the predicted margin is constrained to be non-increasing.
negative_monotonic = ['disbursements_REP', 'receipts_REP', 'individual_contributions_REP']

# Passthrough columns come out of the ColumnTransformer with a 'num__' prefix.
positive_monotonic = ['num__' + name for name in positive_monotonic]
negative_monotonic = ['num__' + name for name in negative_monotonic]

positive_lookup = set(positive_monotonic)
negative_lookup = set(negative_monotonic)

# A requested name that is not an actual model input would otherwise be dropped
# silently (its constraint simply never appears below), so surface it.
unmatched_monotonic = sorted((positive_lookup | negative_lookup).difference(names_for_monotonicity))
if unmatched_monotonic:
    warnings.warn(
        "Monotonic constraints were requested for features that are not model inputs "
        f"and will be ignored: {unmatched_monotonic}"
    )

# One entry per transformed feature, in transformer output order: 1 / -1 / 0 (unconstrained).
monotone_constraints = [1 if name in positive_lookup else -1 if name in negative_lookup else 0 for name in names_for_monotonicity]

# Define the search space for Hyperopt.
# Entries built with hp.* are searched; plain values (boosting_type, verbose,
# monotone_constraints, n_jobs) are constants handed to every trial unchanged.
# hp.randint(label, low, high) draws integers from [low, high) -- the upper bound is exclusive.
param_dist_lgbm = {
    'boosting_type': 'dart',
    'num_leaves': hp.randint('num_leaves', 20, 70),  # integer in [20, 70)
    'n_estimators': hp.randint('n_estimators', 50, 200),  # integer in [50, 200)
    'learning_rate': hp.loguniform('learning_rate', -5, -2),  # exp(-5) to exp(-2), i.e. about 0.0067 to 0.135
    'subsample_for_bin': hp.randint('subsample_for_bin', 20000, 200000),  # integer in [20000, 200000)
    'min_data_in_bin': hp.randint('min_data_in_bin', 1, 10), 
    # NOTE(review): LightGBM's parameter docs list min_child_samples as an alias of
    # min_data_in_leaf, so the next two entries appear to tune the same setting with
    # different ranges -- confirm which one is intended.
    'min_data_in_leaf': hp.randint('min_data_in_leaf', 1, 10),  # integer in [1, 10)
    'min_child_samples': hp.randint('min_child_samples', 20, 150),  # integer in [20, 150)
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.5),  # L1 regularization strength
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.5),  # L2 regularization strength
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),  # fraction of features sampled per tree
    # NOTE(review): LightGBM documents that row subsampling only takes effect with a non-zero
    # bagging frequency (subsample_freq), and none is set here -- confirm this entry does anything.
    'subsample': hp.uniform('subsample', 0.5, 0.8),
    'max_depth': hp.randint('max_depth', 2, 10),  # integer in [2, 10)
    'drop_rate': hp.uniform('drop_rate', 0.05, 0.5),  # dart-specific
    'skip_drop': hp.uniform('skip_drop', 0.1, 0.9),  # dart-specific
    "verbose": -1,  # Keep verbose at -1 to reduce log clutter
    'monotone_constraints': monotone_constraints,  # one 1 / -1 / 0 entry per transformed feature
    'n_jobs': 8
}

# Run the hyperparameter search and fit the final LightGBM pipeline. The fixed
# (non-searched) LightGBM settings are also passed explicitly as keyword arguments,
# which optima_model forwards to the final estimator.
trained_lgbm = optima_model(
    lgb.LGBMRegressor,
    param_dist_lgbm,
    data,
    boosting_type='dart',
    monotone_constraints=monotone_constraints,
    verbosity=-1,
    n_jobs=8,
)

file_path = "../models/CampaignFinanceModel.pkl"

# Pickle the fitted pipeline; pickle output is bytes, hence binary write mode.
with open(file_path, 'wb') as model_file:
    pkl.dump(trained_lgbm, model_file)


  0%|          | 115/9223372036854775807 [06:09<8228537655444915:46:08,  3.21s/trial, best loss: 11.395411634275183] 
