In [None]:
import pandas as pd
import numpy as np
import pickle as pkl
import lightgbm as lgb
import re
import shap

from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import  make_scorer, mean_pinball_loss

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss

class CustomTimeSeriesCV(BaseCrossValidator):
    """Cross-validator whose folds are spelled out as explicit groups of years.

    ``years`` is a sequence of ``(train_years, test_years)`` pairs. For every
    pair, ``split`` yields the positional indices of the rows of ``X`` whose
    ``'year'`` value belongs to ``train_years`` and to ``test_years``.
    """

    def __init__(self, years):
        self.years = years

    @staticmethod
    def _rows_in(X, wanted_years):
        """Return the positions of the rows of X whose 'year' is in wanted_years."""
        return np.flatnonzero(X['year'].isin(wanted_years).to_numpy())

    def split(self, X, y=None, groups=None):
        """Yield one (train_indices, test_indices) pair of integer arrays per fold."""
        for fit_years, holdout_years in self.years:
            yield self._rows_in(X, fit_years), self._rows_in(X, holdout_years)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds, i.e. the number of year pairs."""
        return len(self.years)

In [None]:
# Silence NumPy's divide-by-zero and invalid-value floating-point warnings globally.
np.seterr(divide='ignore', invalid='ignore')

# Categorical features that get one-hot encoded.
one_hot_fts = ['office_type']

# The rating is the only ordinal feature; the list below fixes its category
# order for the OrdinalEncoder, from 'Safe R' (first) to 'Safe D' (last).
ordinal_fts = ['final_rating']
ordinal_fts_ranking = [
    'Safe R',
    'Likely R',
    'Leans R',
    'Toss-up',
    'Leans D',
    'Likely D',
    'Safe D',
]

# Continuous features that are passed through the ColumnTransformer unchanged.
# The order of this list is the order of the passthrough columns, so keep it
# stable. (Lines are chunked for readability only; the chunking is by apparent
# theme of the column names and carries no meaning.)
cont_fts = [
    "open_seat", "incumbent_differential", "special",
    "absenteeexcusereq", "pollhours", "avgpollhours", "minpollhours", "regdeadlines",
    "voteridlaws", "novoterid", "noallmailvote", "noearlyvote", "nofelonreg",
    "nofelonsregafterincar", "nonstrictid", "nonstrictphoto", "nopollplacereg",
    "nopr", "nosamedayreg", "nostateholiday",
    "pr16", "pr17", "pr175", "pr60", "pr90",
    "strictid", "strictphoto", "covi_num",
    "prev_dem_gen_tp", "prev_gen_margin",
    "weighted_genpoll", "weighted_genpoll_lower", "weighted_genpoll_upper",
    "unweighted_genpoll", "mean_specials_differential",
    "house_chamber_margin", "senate_chamber_margin",
    "previous_cci", "current_cci", "change_cci",
    "previous_gas", "current_gas", "change_gas",
    "previous_unemployment", "current_unemployment", "change_unemployment",
    "receipts", "from_committee_transfers", "disbursements", "to_committee_transfers",
    "beginning_cash", "ending_cash", "candidate_contributions", "individual_contributions",
    "unconvinced_pct", "phone_unweighted", "online_unweighted", "num_polls",
    "unweighted_estimate", "unweighted_ci_lower", "unweighted_ci_upper",
    "weighted_estimate", "weighted_ci_lower", "weighted_ci_upper",
    "white_pct", "black_pct", "asian_pct", "hispanic_pct",
    "median_income", "impoverished_pct", "median_age", "renting_pct",
    "inflation", "isMidterm",
    "genballot_predicted_margin", "genballot_predicted_lower", "genballot_predicted_upper",
    "poll_fundamental_agree",
    "receipts_DEM", "receipts_REP", "disbursements_DEM", "disbursements_REP",
]

# Quantile levels to model: num_quantiles evenly spaced values from 0.001 to 0.999.
num_quantiles = 3
quantile_list = np.linspace(0.001, 0.999, num=num_quantiles)

def entropy_loss(y_true, y_pred_quantiles, quantiles=None) -> float:
    """Score a set of quantile predictions against the observed values.

    For every observation, the quantile whose prediction is closest (in
    absolute value) to the true value is selected, and that quantile level
    ``q`` is charged ``-log(q) - log(1 - q)``. The result is the square of the
    mean charge over all observations.

    ## Parameters:
    y_true: 1-D array-like (list, ndarray, Series) of observed values
    y_pred_quantiles: array-like whose last axis holds one prediction per
        quantile level, normally shaped (len(y_true), number of quantiles)
    quantiles: optional 1-D array-like of the quantile levels matching the
        columns of ``y_pred_quantiles``; defaults to the module-level
        ``quantile_list`` so existing two-argument calls keep working

    ## Returns:
    The squared mean loss as a Python float.

    ## Raises:
    ValueError: if the number of prediction columns differs from the number
        of quantile levels.
    """
    # Resolve the quantile levels lazily so the module global is only needed
    # when the caller does not supply its own levels.
    levels = quantile_list if quantiles is None else quantiles
    quantile_array = np.asarray(levels, dtype=float)
    y_true = np.asarray(y_true)
    y_pred_quantiles = np.asarray(y_pred_quantiles)

    # Every prediction column must correspond to exactly one quantile level;
    # otherwise argmin below would index the wrong (or a missing) level.
    if y_pred_quantiles.shape[-1:] != quantile_array.shape:
        raise ValueError(
            f"y_pred_quantiles has shape {y_pred_quantiles.shape}, expected "
            f"{quantile_array.size} columns (one per quantile level)"
        )

    # Distance of each quantile prediction from the truth, per observation.
    abs_diffs = np.abs(y_true[:, np.newaxis] - y_pred_quantiles)

    # Column index of the closest quantile prediction for each observation.
    nearest = np.argmin(abs_diffs, axis=1)
    nearest_levels = quantile_array[nearest]

    # Charge grows as the closest level moves towards 0 or 1.
    losses = -np.log(nearest_levels) - np.log(1 - nearest_levels)

    return float(np.mean(losses) ** 2)

def optima_model(model, param_dict, X, y, **kwargs):
    """Tune hyperparameters with Hyperopt, then fit and pickle one pipeline per quantile.

    Only rows with ``year <= 2022`` are used. Every Hyperopt trial trains one
    quantile model per entry of the module-level ``quantile_list`` on each of
    three year-based folds (train on the even years from 2002 up to the fold's
    test years, test on the next two election years) and is scored with
    ``entropy_loss`` averaged over the folds. The best hyperparameters are then
    used to refit one preprocessing + model pipeline per quantile on all kept
    rows, and each fitted pipeline is pickled to
    ``../modelsv3/Pipe_<quantile>.pkl``.

    ## Parameters:
    model: estimator class (not an instance) that accepts an ``alpha`` keyword.
        We use LGBMRegressor
    param_dict: Hyperopt search space; each sampled point is passed to
        ``model`` as keyword arguments
    X: DataFrame with features; must contain 'year' and the columns named in
        one_hot_fts, ordinal_fts and cont_fts
    y: Series with target variable, aligned with X
    **kwargs: extra keyword arguments for the final estimators. hyperopt.fmin
        reports only the sampled (hp.*) entries of ``param_dict``, so any fixed
        setting the final models need has to be supplied again here.

    ## Returns:
    dict with the best hyperparameter values as returned by hyperopt.fmin
    """
    in_scope = X['year'] <= 2022
    X_other, y_other = X.loc[in_scope, :], y.loc[in_scope]

    # Create fold structure so we can make a custom cross-validation for time-series
    folds = [
        (range(2002, 2012, 2), [2012, 2014]),
        (range(2002, 2016, 2), [2016, 2018]),
        (range(2002, 2020, 2), [2020, 2022])
    ]

    cv = CustomTimeSeriesCV(folds)

    def build_pipeline(estimator):
        """Return an unfitted preprocessing + model pipeline around ``estimator``.

        No scaling step is included because the models used are tree-based.
        A new ColumnTransformer is created on every call so that no fitted
        state is shared between the pipelines of different folds or quantiles.
        """
        preprocessor = ColumnTransformer([
            ('cat', OneHotEncoder(), one_hot_fts),
            ('ord', OrdinalEncoder(categories=[ordinal_fts_ranking], handle_unknown='use_encoded_value',
                                   unknown_value=np.nan), ordinal_fts),
            ('num', 'passthrough', cont_fts)])
        return Pipeline(steps=[('preprocessing', preprocessor),
                               ('model', estimator)])

    def objective(params):
        """Take one set of hyperparameters and return the loss Hyperopt will minimize."""
        fold_losses = []

        # Go through each fold and calculate its loss.
        for train_idx, test_idx in cv.split(X_other):
            X_train, y_train = X_other.iloc[train_idx], y_other.iloc[train_idx]
            X_test, y_test = X_other.iloc[test_idx], y_other.iloc[test_idx]

            quantile_predictions = np.zeros((len(test_idx), num_quantiles))

            for idx, quantile in enumerate(quantile_list):
                # The estimator is configured through ``alpha`` alone. The
                # scikit-learn scorer that used to be passed as
                # ``eval_metric`` has been dropped: it does not have the
                # (y_true, y_pred) signature LightGBM expects from a custom
                # metric, and without an ``eval_set`` it was never evaluated.
                pipe = build_pipeline(model(**params, alpha=quantile))
                pipe.fit(X_train, y_train)
                quantile_predictions[:, idx] = pipe.predict(X_test)

            fold_losses.append(entropy_loss(y_test, quantile_predictions))

        return {'loss': np.mean(fold_losses), 'status': STATUS_OK}

    # Hyperopt uses the TPE algorithm to optimize hyperparameters. We use the
    # no_progress_loss function to stop early if we don't see progress.
    # NOTE(review): no max_evals is given, so no_progress_loss(2) is the only
    # stopping rule and the search ends after very few trials - confirm this
    # is intended.
    best_params = fmin(fn=objective,
                       space=param_dict,
                       algo=tpe.suggest,
                       trials=Trials(),
                       early_stop_fn=no_progress_loss(2))

    print("Best parameters pre-placing:", best_params)

    for quantile in quantile_list:
        # Build a separate estimator per quantile instead of re-fitting one
        # shared instance, so each pickled pipeline owns its model.
        best_model = model(**best_params, **kwargs)
        best_model.set_params(alpha=quantile)

        pipe = build_pipeline(best_model)
        pipe.fit(X_other, y_other)
        file_path = f"../modelsv3/Pipe_{round(quantile, 3)}.pkl"

        # The saved object is the whole pipeline (transformer + model).
        with open(file_path, 'wb') as file:
            pkl.dump(pipe, file)

    # Return the best hyperparameters (the fitted pipelines live on disk).
    return best_params

In [None]:
# Load the engineered dataset and strip every character other than ASCII
# letters, digits and underscores from the column names.
data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# NOTE(review): optima_model builds its own transformer with the same
# definition; this module-level copy is not used by the cells visible here.
preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value', 
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])

# Define the search space for Hyperopt. Entries built with hp.* are sampled;
# the plain values ('boosting_type', 'objective', 'verbose') are fixed.
param_dist_lgbm = {
    'boosting_type': 'dart',
    'objective': 'quantile',
    'num_leaves': hp.randint('num_leaves', 20, 70),  # Reduced the upper limit
    'n_estimators': hp.randint('n_estimators', 50, 200),  # Increased the range
    'learning_rate': hp.loguniform('learning_rate', -5, -2),  # exp(-5) to exp(-2), i.e. about 0.0067 to 0.135
    'subsample_for_bin': hp.randint('subsample_for_bin', 20000, 200000),  # Narrowed the range
    'min_data_in_bin': hp.randint('min_data_in_bin', 1, 10), 
    'min_data_in_leaf': hp.randint('min_data_in_leaf', 1, 10),  # Reduced the upper limit
    # NOTE(review): LightGBM documents min_child_samples as an alias of
    # min_data_in_leaf, so two ranges are being searched for what appears to be
    # one setting - confirm which of the two is meant to take effect.
    'min_child_samples': hp.randint('min_child_samples', 20, 150),  # Increased the range for more regularization
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.5),  # Increased upper limit for L1 regularization
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.5),  # Increased upper limit for L2 regularization
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),  # Reduced the upper limit
    # NOTE(review): subsample_freq is not set anywhere in this file; per the
    # LightGBM docs row subsampling also needs a non-zero bagging frequency,
    # so this entry may have no effect - confirm.
    'subsample': hp.uniform('subsample', 0.5, 0.8),  # Reduced the upper limit for more randomness
    'max_depth': hp.randint('max_depth', 2, 10),  # Added max_depth for additional control
    "verbose": -1,  # Keep verbose to -1 to reduce log clutter, 
}

# Run the search and fit/save the per-quantile pipelines. The fixed settings
# are repeated as keyword arguments because the returned best_params only
# contain the sampled entries (see the printed result below this cell).
best_params = optima_model(lgb.LGBMRegressor, param_dist_lgbm, data.drop(columns=['margin']), data['margin'], 
                            boosting_type = 'dart', verbosity = -1, 
                            objective = 'quantile')

# Persist the chosen hyperparameters next to the saved pipelines.
with open("../modelsv3/best_params_in_general.pkl", "wb") as f:
    pkl.dump(best_params, f)

  0%|          | 3/9223372036854775807 [00:20<17505306707854429:17:52,  6.83s/trial, best loss: 3.456322873813134]
Best parameters pre-placing: {'colsample_bytree': 0.7471697111374278, 'learning_rate': 0.0249855099331071, 'max_depth': 3, 'min_child_samples': 87, 'min_data_in_bin': 4, 'min_data_in_leaf': 8, 'n_estimators': 139, 'num_leaves': 21, 'reg_alpha': 0.09366183305548997, 'reg_lambda': 0.6116565752689745, 'subsample': 0.777189520958069, 'subsample_for_bin': 138426}


In [None]:
# Inspect the per-feature contributions of the saved 0.5-quantile pipeline
# on the 2022 rows.
with open("../modelsv3/Pipe_0.5.pkl", "rb") as f:
    pipe1 = pkl.load(f)

# Feature frame for 2022 only, without the target column.
X_2022 = data[data['year'] == 2022].drop(columns='margin')

# Column labels come from the fitted transformer; pred_contrib=True is passed
# through the pipeline's predict call to the final model.
feature_names = pipe1.named_steps['preprocessing'].get_feature_names_out()
contributions = pipe1.predict(X_2022, pred_contrib=True)

# One column per transformed feature, plus a final column labelled 'expected_value'.
contributions_df = pd.DataFrame(contributions, columns=[*feature_names, 'expected_value'])
print(contributions_df)

     cat__office_type_Governor  cat__office_type_House  \
0                          0.0                     0.0   
1                          0.0                     0.0   
2                          0.0                     0.0   
3                          0.0                     0.0   
4                          0.0                     0.0   
..                         ...                     ...   
457                        0.0                     0.0   
458                        0.0                     0.0   
459                        0.0                     0.0   
460                        0.0                     0.0   
461                        0.0                     0.0   

     cat__office_type_President  cat__office_type_Senate  ord__final_rating  \
0                           0.0                      0.0          -1.476542   
1                           0.0                      0.0          -1.484412   
2                           0.0                      0.0          

In [None]:
# Scratch check: remove the shortest leading "<something>__" prefix (as seen on
# the transformer's output names, e.g. "ord__") while keeping later underscores.
re.sub('^.*?__', '', 'ord__descri_ber')

'descri_ber'