In [91]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
import xgboost
from scipy.stats import loguniform, randint, t, uniform
import pickle as pkl
from sklearn.model_selection import BaseCrossValidator

pd.options.mode.chained_assignment = None 

class CustomTimeSeriesCV(BaseCrossValidator):
    """Creates an iterator that contains the indices from each dataset based on the years given"""
    def __init__(self, years):
        self.years = years

    def split(self, X, y=None, groups=None):
        for train_years, test_years in self.years:
            train_indices = np.where(X['cycle'].isin(train_years))[0]
            test_indices = np.where(X['cycle'].isin(test_years))[0]
            yield train_indices, test_indices
        
    def get_n_splits(self, X=None, y=None, groups=None):
        return len(self.years) 


In [92]:
polls_for_rating = pd.read_csv('../../data/raw_polls.csv')
days_to_rate = 21

office_type_dict = {
    "Pres-G": "President",
    "Sen-G": "Senate",
    "Gov-G": "Governor",
    "House-G": "House"    
}

#This filters out rows we do not want
polls_for_rating = polls_for_rating.query("time_to_election <= @days_to_rate & not @pd.isna(methodology)")
polls_for_rating = polls_for_rating[(polls_for_rating['cand1_party'] == "DEM") & (polls_for_rating['cand2_party'] == "REP") & (polls_for_rating['location'] != "US")]
polls_for_rating = polls_for_rating[polls_for_rating['type_simple'].isin(["Pres-G", "Sen-G", "Gov-G", "House-G"])]

#Adding important columns, X and Y
polls_for_rating['office_type'] = polls_for_rating['type_simple'].map(office_type_dict)
polls_for_rating['bias'] = polls_for_rating['margin_poll'] - polls_for_rating['margin_actual']
polls_for_rating['error'] = np.abs(polls_for_rating['bias']) 
polls_for_rating['greater than 20'] = abs(polls_for_rating['margin_poll']) > 20

polls_for_rating = polls_for_rating[['cycle', 'office_type', 'pollster_rating_id', 'aapor_roper', 'methodology', 
                                     'partisan', 'samplesize', 'greater than 20', 'bias', 'error']]

unique_methods = set()
for methods in polls_for_rating['methodology']:
    unique_methods.update(methods.split('/'))

for method in unique_methods:
    polls_for_rating[method] = polls_for_rating['methodology'].apply(lambda x: 1 if method in x.split('/') else 0)

polls_for_rating = polls_for_rating.drop(columns=['methodology'])


In [93]:
def make_pollster_rating_model(poll_df, before_year, model, param_dist):
    """This creates and saves two models for polls before a given year: one to predict error, one to predict bias.
    It is very important that we do NOT include polls from the before year in the data"""
    poll_df = poll_df[poll_df['cycle'] < before_year]
    X = poll_df.drop(columns=['bias', 'error'])
    error = poll_df['error']
    bias = poll_df['bias']
    
    dummy_creator = OneHotEncoder(sparse_output=False, handle_unknown='ignore', min_frequency=20) #Only choosing pollster with 20 or more previous polls

    preprocessor = ColumnTransformer([('cat', dummy_creator, ['pollster_rating_id', 'partisan', 'office_type'])], remainder='passthrough')

    min_year = poll_df['cycle'].min()
    
    folds = [(range(min_year, year), [year]) for year in range(min_year + 2, before_year, 2)] #Dynamically creating folds based on the before year
    cv = CustomTimeSeriesCV(folds)

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    param_dist = {
    'regressor__' + key: value for key, value in param_dist.items()
    }
    
    grid = RandomizedSearchCV(model, param_dist, n_iter=50, cv=cv, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1, verbose =1)
    grid.fit(X, error)
    
    print(f'Best error model for {before_year}: {model.__class__.__name__} with MAE of {grid.best_score_}')
    
    file_path_error = f'../../models/Polls_{before_year}_error.pkl'
    with open(file_path_error, 'wb') as file:
        pkl.dump(grid, file)

    grid.fit(X, bias)
    file_path_bias = f'../../models/Polls_{before_year}_bias.pkl'
    with open(file_path_bias, 'wb') as file:
        pkl.dump(grid, file)
        
    print(f'Best bias model for {before_year}: {model.__class__.__name__} with MAE of {grid.best_score_}')

    

In [94]:
param_dist_xgb = {
    'n_estimators': randint(50, 1000),  # Number of boosted trees to fit
    'max_depth': randint(2, 15),  # Maximum tree depth for base learners
    'learning_rate': loguniform(0.01, 0.3),  # Boosting learning rate
    'gamma': loguniform(0.001, 5),  # Minimum loss reduction required to make a further partition
    'min_child_weight': loguniform(0.1, 10),  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': uniform(0.5, 0.5),  # Subsample ratio of the training instances
    'colsample_bytree': uniform(0.5, 0.5),  # Subsample ratio of columns when constructing each tree
    'colsample_bylevel': uniform(0.5, 0.5),  # Subsample ratio of columns for each level
    'colsample_bynode': uniform(0.5, 0.5),  # Subsample ratio of columns for each node (split)
    'reg_alpha': loguniform(0.01, 100),  # L1 regularization term on weights
    'reg_lambda': loguniform(0.01, 100),  # L2 regularization term on weights
    'scale_pos_weight': loguniform(0.1, 10),  # Balancing of positive and negative weights
    'max_delta_step': randint(0, 10),  # Maximum delta step we allow each tree's weight estimation to be
}

years = range(2002, 2026, 2)
for year in years:
    make_pollster_rating_model(polls_for_rating, year, xgboost.XGBRegressor(), param_dist_xgb)



Fitting 1 folds for each of 50 candidates, totalling 50 fits
Best error model for 2002: Pipeline with MAE of -20.722353635181666
Fitting 1 folds for each of 50 candidates, totalling 50 fits
Best bias model for 2002: Pipeline with MAE of -43.857672232069504
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best error model for 2004: Pipeline with MAE of -18.65201850806786
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best bias model for 2004: Pipeline with MAE of -45.481264609014815
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best error model for 2006: Pipeline with MAE of -16.75717514518871
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best bias model for 2006: Pipeline with MAE of -40.30218726308426
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Best error model for 2008: Pipeline with MAE of -16.66048200617618
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Best bias model for 2008: Pipeline wi