In [1]:
import pandas as pd
import numpy as np
import shap
import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, BaseCrossValidator
from sklearn.metrics import mean_absolute_error, mean_squared_error, log_loss, accuracy_score, median_absolute_error, make_scorer
import xgboost
import lightgbm as lgb
from hebo.optimizers.hebo import HEBO
from hebo.optimizers.bo import BO
from hebo.design_space.design_space import DesignSpace

from scipy.stats import loguniform, randint, uniform

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss
from hyperopt.pyll.base import Apply

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence

from missforest.missforest import MissForest

#Creating a custom time series cross-validator
class CustomTimeSeriesCV(BaseCrossValidator):
    """Creates an iterator that contains the indices from each dataset based on the years given"""
    def __init__(self, years):
        self.years = years

    def split(self, X, y=None, groups=None):
        for train_years, test_years in self.years:
            train_indices = np.where(X['year'].isin(train_years))[0]
            test_indices = np.where(X['year'].isin(test_years))[0]
            yield train_indices, test_indices
        
    def get_n_splits(self, X=None, y=None, groups=None):
        return len(self.years) 
    


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
np.seterr(divide='ignore', invalid='ignore')

#Categorical features that need to be one-hot encoded    
one_hot_fts = ['office_type']

#Rating is the only ordinal feature
ordinal_fts = ['final_rating']
ordinal_fts_ranking = ['Safe R', 'Likely R', 'Leans R', 'Toss-up', 'Leans D', 'Likely D', 'Safe D']

#Cont features that should be pass-throughed (and later scaled)
cont_fts = ["open_seat", "incumbent_differential", "special", "absenteeexcusereq", "pollhours", "avgpollhours", "minpollhours",
    "regdeadlines", "voteridlaws", "novoterid", "noallmailvote", "noearlyvote", "nofelonreg",
    "nofelonsregafterincar", "nonstrictid", "nonstrictphoto", "nopollplacereg", "nopr", "nosamedayreg",
    "nostateholiday", "pr16", "pr17", "pr175", "pr60", "pr90", "strictid", "strictphoto", "covi_num",
    "prev_dem_gen_tp", "prev_gen_margin", "weighted_genpoll", "weighted_genpoll_lower",
    "weighted_genpoll_upper", "unweighted_genpoll", 
    "house_chamber_margin", "senate_chamber_margin", "previous_cci", "current_cci", "change_cci",
    "previous_gas", "current_gas", "change_gas", "previous_unemployment", "current_unemployment",
    "change_unemployment", "receipts_DEM", "receipts_REP", "from_committee_transfers_DEM",
    "from_committee_transfers_REP", "disbursements_DEM", "disbursements_REP", "to_committee_transfers_DEM",
    "to_committee_transfers_REP", "beginning_cash_DEM", "beginning_cash_REP", "ending_cash_DEM",
    "ending_cash_REP", "candidate_contributions_DEM", "candidate_contributions_REP", "individual_contributions_DEM",
    "individual_contributions_REP", "receipts", "from_committee_transfers", "disbursements",
    "to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
    "individual_contributions", "unconvinced_pct", "phone_unweighted", "online_unweighted", "num_polls",
    "unweighted_estimate", "unweighted_ci_lower", "unweighted_ci_upper", "weighted_estimate",
    "weighted_ci_lower", "weighted_ci_upper", "white_pct", "black_pct", "asian_pct", "hispanic_pct",
    "median_income", "impoverished_pct", "median_age", "renting_pct", "inflation", "isMidterm",
    "genballot_predicted_margin", "genballot_predicted_lower", "genballot_predicted_upper",
    "specials_predicted_margin", "receipts_genballot_interaction", "disbursements_genballot_interaction"]


def optima_model(model, param_dict, X, y, penalizing_factor):
    """Performs hyperparameter optimization for a a given model, keeping track of loss. 
    ## Parameters:
    model: sklearnable model, like XGBoost or Linreg
    param_dict: dictionary of hyperparameters to optimize
    X: DataFrame with features
    y: Series with target variable"""
    
    def penalize_scorer(y, y_pred):
        score = penalize_wrong(y, y_pred, penalizing_factor)
        return ("penalize_scorer", score, False)
    
    def penalize_wrong(y, y_pred, base_penalty):
        penalty = base_penalty * abs(y)
        return np.mean((y_pred - y)**2 * (1 + penalty * (np.sign(y_pred) != np.sign(y))))

    
    X_train, X_test, y_train, y_test = (X.loc[X['year'] < 2022, :], X.loc[X['year'] == 2022, :], 
                                        y.loc[X['year'] < 2022], y.loc[X['year'] == 2022])
    

    # Create fold structure so we can make a custom cross-validation for time-series
    folds = [
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])
    ]

    cv = CustomTimeSeriesCV(folds)
        
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value', 
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])
    
    wrong_results = dict()
    all_shap_values = []
    expected_values = []

    def objective(params):
        "Function that takes in hyperparameters and returns loss, that Hyperopt will minimize."        
        training_loss = []
        testing_loss = []
        accuracies = []
        test_start_year = 2002
        for train_idx, test_idx in cv.split(X_train):
            test_start_year += 4
            
            reg = model(**params)
            pipe = Pipeline(steps = [
                ('preprocessing', preprocessor), 
                ('model', reg)])
            
            pipe.named_steps['preprocessing'].fit(X_train.iloc[train_idx])
            transformed_val = pipe.named_steps['preprocessing'].transform(X_train.iloc[test_idx])
            early_stopping = lgb.early_stopping(10, verbose = False)
            
            """Goes through each fold and calculates loss.
            Note: We use median absolute error because it is more robust to outliers than mean absolute error.
            We also expect earlier folds to have higher error, since they have less data to train on."""
            pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx],
                     model__eval_set = [(transformed_val, y_train.iloc[test_idx])], model__eval_metric = penalize_scorer, 
                     model__callbacks = [early_stopping])            
            
            predictions = pipe.predict(X_train.iloc[test_idx])
            training_preds = pipe.predict(X_train.iloc[train_idx])
            testing_loss.append(penalize_wrong(y_train.iloc[test_idx], predictions, penalizing_factor))
            training_loss.append(penalize_wrong(y_train.iloc[train_idx], training_preds, penalizing_factor))
            not_equals = np.not_equal(np.sign(y_train.iloc[test_idx]), np.sign(predictions))
            accuracies.append(accuracy_score(np.sign(y_train.iloc[test_idx]), np.sign(predictions)))
            if test_start_year == 2018:
                preprocessed_X_test = pipe.named_steps['preprocessing'].transform(X_train.iloc[test_idx])
                #scaled_X_test = pipe.named_steps['scaling'].transform(preprocessed_X_test)
                
                "Change this to TreeExplainer if you are using a tree-based model"
                Explainer = shap.TreeExplainer(pipe.named_steps['model'])
                shap_values = Explainer.shap_values(preprocessed_X_test)
                all_shap_values.append(shap_values)
                expected_values.append(Explainer.expected_value)
                for idx, not_equal in enumerate(not_equals):
                    if not_equal:
                        if idx in wrong_results:
                            wrong_results[idx] += 1
                        else:
                            wrong_results[idx] = 1
            
        print(f"Training loss: {training_loss}, Mean: {np.mean(training_loss)}")
        print(f"Validation loss: {testing_loss}, mean: {np.mean(testing_loss)}")
        print(f"Mean Accuracy: {np.mean(accuracies)}")
        print(f"Accuracies: {accuracies}")
        print(f"Overfitting differential: {np.mean(testing_loss) - np.mean(training_loss)}")
        print(f"Parameters: {params}")
        return {'loss': np.mean(testing_loss), 'status': STATUS_OK}


    start_time = time.time()
    max_time = 120
    def stop(trial, elapsed_time=0):
        return elapsed_time > max_time, [time.time() - start_time] 
    
    "Hyperopt uses the TPE algorithm to optimize hyperparameters. We use the no_progress_loss function to stop early if we don't see progress."
    best_params = fmin(fn=objective,
                    space=param_dict,
                    algo=tpe.suggest,
                    trials=Trials(),
                    early_stop_fn=stop)
                    
    print("Best parameters:", best_params)
    model = model(**best_params)
    pipe = Pipeline(steps = [
        ('preprocessing', preprocessor), 
        ('model', model)])
    
    #Currently looking at 2020 wrong results
    pipe.fit(X.loc[X['year'] < 2020], y.loc[X['year'] < 2020])
    preds = pipe.predict(X.loc[X['year'] == 2020])
    wrong_indices = np.not_equal(np.sign(y.loc[X['year'] == 2020]), np.sign(preds))
    wrong_results_2020 = X.loc[X['year'] == 2020].loc[wrong_indices]
    
    print(f"Final loss w/penalty: {penalize_wrong(y_test, pipe.predict(X_test), penalizing_factor)}")
    print(f"Final MAE: {mean_absolute_error(y_test, pipe.predict(X_test))}")
    print(f"Final accuracy: {accuracy_score(np.sign(y_test), np.sign(pipe.predict(X_test)))}")
    
    print(f"Wrong results_2020, {wrong_results_2020}")
    
    #Also, 2022 wrong results
    pipe.fit(X.loc[X['year'] < 2022], y.loc[X['year'] < 2022])
    preds = pipe.predict(X.loc[X['year'] == 2022])
    wrong_indices = np.not_equal(np.sign(y.loc[X['year'] == 2022]), np.sign(preds))
    wrong_results_2022 = X.loc[X['year'] == 2022].loc[wrong_indices]
    
    print(f"Final loss w/penalty: {penalize_wrong(y_test, pipe.predict(X_test), penalizing_factor)}")
    print(f"Final MAE: {mean_absolute_error(y_test, pipe.predict(X_test))}")
    print(f"Final accuracy: {accuracy_score(np.sign(y_test), np.sign(pipe.predict(X_test)))}")
    
    print(f"Wrong results_2020, {wrong_results_2022}")
    
    shap_values = np.mean(all_shap_values, axis = 0)
    shap_df = pd.DataFrame(shap_values, columns = pipe.named_steps['preprocessing'].get_feature_names_out())
    
    return shap_df, np.mean(expected_values), wrong_results_2020, preprocessor



In [3]:
import re

data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

X = data.drop(columns=['margin'])
y = data['margin']

preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts),
        ('ord', OrdinalEncoder(categories = [ordinal_fts_ranking], handle_unknown='use_encoded_value', 
                               unknown_value=np.nan), ordinal_fts),
        ('num', 'passthrough', cont_fts)])

names_for_monotonicity = preprocessor.fit(X).get_feature_names_out()
before_processing_monotonic_columns = ['incumbent_differential', 'pvi', "receipts", "from_committee_transfers", "disbursements",
"to_committee_transfers", "beginning_cash", "ending_cash", "candidate_contributions",
"individual_contributions", 'genballot_predicted_margin', 'specials_predicted_margin', 'unweighted_estimate', 'unweighted_ci_lower',
'unweighted_ci_upper','weighted_estimate', 'weighted_ci_lower', 'weighted_ci_upper',
'phone_unweighted', 'online_unweighted', 'receipts_genballot_interaction',
'disbursements_genballot_interaction', 'poll_fundamental_average', 'genballot_predicted_lower', 
'genballot_predicted_upper']

monotonic_columns = ['num__' + name for name in before_processing_monotonic_columns] + ['ord__final_rating']
print(monotonic_columns)

monotone_constraints = [1 if name in monotonic_columns else 0 for name in names_for_monotonicity]

# Define the search space for Hyperopt
param_dist_lgbm = {
    'boosting_type': 'dart',  # Removed 'goss' to simplify
    'num_leaves': hp.randint('num_leaves', 20, 70),  # Reduced the upper limit, 
    'learning_rate': hp.loguniform('learning_rate', -6, -1),  # Equivalent to about 0.0001 to 0.01
    'subsample_for_bin': hp.randint('subsample_for_bin', 20000, 200000),  # Narrowed the range
    'min_data_in_bin': hp.randint('min_data_in_bin', 1, 10), 
    'min_data_in_leaf': hp.randint('min_data_in_leaf', 1, 10),  # Reduced the upper limit
    'min_child_samples': hp.randint('min_child_samples', 20, 150),  # Increased the range for more regularization
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 5),  # Increased upper limit for L1 regularization
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 5),  # Increased upper limit for L2 regularization
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 0.8),  # Reduced the upper limit
    'subsample': hp.uniform('subsample', 0.2, 0.8),  # Reduced the upper limit for more randomness
    'max_depth': hp.randint('max_depth', 2, 15),  # Added max_depth for additional control
    "verbose": -1,  # Keep verbose to -1 to reduce log clutter, 
    'monotone_constraints': monotone_constraints
}

shap_values, expected_value, wrong_results, preprocessor = optima_model(lgb.LGBMRegressor, param_dist_lgbm, X, y, penalizing_factor = 2)

wrong_results.to_csv("wrong_predictions.csv", index = True)

['num__incumbent_differential', 'num__pvi', 'num__receipts', 'num__from_committee_transfers', 'num__disbursements', 'num__to_committee_transfers', 'num__beginning_cash', 'num__ending_cash', 'num__candidate_contributions', 'num__individual_contributions', 'num__genballot_predicted_margin', 'num__specials_predicted_margin', 'num__unweighted_estimate', 'num__unweighted_ci_lower', 'num__unweighted_ci_upper', 'num__weighted_estimate', 'num__weighted_ci_lower', 'num__weighted_ci_upper', 'num__phone_unweighted', 'num__online_unweighted', 'num__receipts_genballot_interaction', 'num__disbursements_genballot_interaction', 'num__poll_fundamental_average', 'num__genballot_predicted_lower', 'num__genballot_predicted_upper', 'ord__final_rating']
Training loss: [1340.7983276848195, 1152.28174394262, 1347.7783641336175], Mean: 1280.2861452536856
Validation loss: [885.3220935634843, 790.5847142580927, 849.3255020989603], mean: 841.7441033068458
Mean Accuracy: 0.9291955454243778                      
Ac

ValueError: DataFrame constructor not properly called!

In [None]:
indices = [k for k, v in sorted(wrong_results.items(), key=lambda item: -1*item[1]) if v >= max(wrong_results.values())*0.75]

# Reset index after filtering to align indices
training_df = X[X['year'] < 2018].reset_index(drop=True)
validation_df = X[X['year'].isin([2018, 2020])].reset_index(drop=True)
y_validation = y[X['year'].isin([2018, 2020])].reset_index(drop=True)

# Process the training and validation data
preprocessed_training = preprocessor.fit_transform(training_df)
#scaler = StandardScaler().fit(preprocessed_training)
preprocessed_val = preprocessor.transform(validation_df)

# Assuming 'indices' are calculated correctly and correspond to 'validation_df'
# 'wrong_predictions' DataFrame with margins for wrong predictions
wrong_predictions = validation_df.iloc[indices].copy()
wrong_predictions['margin'] = y_validation.iloc[indices]

wrong_predictions.to_csv("wrong_predictions.csv", index = True)

shap.initjs()
# Example: visualizing the fourth wrong prediction's explanation
rank = 790 # Note: 'rank' should be less than the length of 'indices'
wrong_index = 820 #Get correct index from sorted wrong results

# Fetch the corresponding SHAP values and expected value

shap_values_for_wrong = np.array(shap_values.iloc[wrong_index])
print(f"Wrong index: {wrong_index}")
print(f"Actual result: {wrong_predictions['margin'].loc[wrong_index]}")
print(f"Data on race: \n {wrong_predictions.loc[wrong_index]}")

# Generating the force plot for the selected wrong prediction
shap.force_plot(expected_value, shap_values_for_wrong, features=preprocessed_val[wrong_index], feature_names=preprocessor.get_feature_names_out())


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")
data.columns

Index(['Unnamed: 0', 'year', 'state', 'district', 'office_type', 'open_seat',
       'incumbent_differential', 'margin', 'special', 'absenteeexcusereq',
       ...
       'genballot_predicted_margin', 'genballot_predicted_lower',
       'genballot_predicted_upper', 'specials_predicted_margin',
       'receipts_genballot_interaction', 'disbursements_genballot_interaction',
       'democrat_in_presidency', 'gas_democrat_interaction',
       'cci_democrat_interaction', 'poll_fundamental_agree'],
      dtype='object', length=109)