In [114]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from scipy.stats import loguniform, randint, uniform
import pickle as pkl
from sklearn.model_selection import BaseCrossValidator

pd.options.mode.chained_assignment = None 

class CustomTimeSeriesCV(BaseCrossValidator):
    """Creates an iterator that contains the indices from each dataset based on the years given"""
    def __init__(self, years):
        self.years = years

    def split(self, X, y=None, groups=None):
        for train_years, test_years in self.years:
            train_indices = np.where(X['cycle'].isin(train_years))[0]
            test_indices = np.where(X['cycle'].isin(test_years))[0]
            yield train_indices, test_indices
        
    def get_n_splits(self, X=None, y=None, groups=None):
        return len(self.years) 

In [115]:
past_polls = pd.read_csv('../../data/raw_polls.csv')
days_to_rate = 50

office_type_dict = {
    "Pres-G": "President",
    "Sen-G": "Senate",
    "Gov-G": "Governor",
    "House-G": "House"    
}

#This filters out rows we do not want
past_polls = past_polls.query("time_to_election <= @days_to_rate & not @pd.isna(methodology)") #Filtering out rows with no methodology
past_polls = past_polls[(past_polls['cand1_party'] == "DEM") & (past_polls['cand2_party'] == "REP") & (past_polls['location'] != "US")]
past_polls = past_polls[past_polls['type_simple'].isin(["Pres-G", "Sen-G", "Gov-G", "House-G"])]

#Adding important columns, X and Y
past_polls['office_type'] = past_polls['type_simple'].map(office_type_dict)
past_polls['bias'] = past_polls['margin_poll'] - past_polls['margin_actual']
past_polls['squared_error'] = np.square(past_polls['bias']) 

past_polls = past_polls[['cycle', 'office_type', 'pollster_rating_id', 'time_to_election', 'methodology', 
                                     'partisan', 'samplesize', 'margin_poll', 'bias', 'squared_error']]

#Polls often have multiple methodologies, so we will split them into multiple columns
unique_methods = set()
for methods in past_polls['methodology']:
    unique_methods.update(methods.split('/'))

for method in unique_methods:
    past_polls[method] = past_polls['methodology'].apply(lambda x: 1 if method in x.split('/') else 0)

#Removes methodology column, as it is no longer needed
past_polls = past_polls.drop(columns=['methodology'])

In [116]:
def make_pollster_rating_model(poll_df, before_year, model, param_dist):
    """This creates and saves two models for polls before a given year: one to predict error, one to predict bias.
    It is very important that we do NOT include polls from the before year in the data
    The before year is the year that we will eventually use to run the model on -- our current model is based on 2024"""
    poll_df = poll_df[poll_df['cycle'] < before_year]
    X = poll_df.drop(columns=['bias', 'squared_error'])
    error = poll_df['squared_error']
    bias = poll_df['bias']
    
    pollster_dummies = OneHotEncoder(sparse_output=False, handle_unknown='ignore', min_frequency=20) #Only choosing pollster with 20 or more previous polls

    preprocessor = ColumnTransformer([('cat1', pollster_dummies, ['pollster_rating_id'], ), 
                                      ('cat2', OneHotEncoder(handle_unknown='ignore'), ['partisan', 'office_type'])], remainder='passthrough')

    min_year = poll_df['cycle'].min() #The minimum year in the dataset, we choose to include polls going back to 1998
    
    folds = [(range(min_year, year, 2), [year]) for year in range(min_year + 2, before_year, 2)] #Dynamically creating folds based on the before year
    cv = CustomTimeSeriesCV(folds)

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    param_dist = {
    'regressor__' + key: value for key, value in param_dist.items()
    }
    
    error_grid = RandomizedSearchCV(model, param_dist, n_iter=50, cv=cv, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1, verbose = 1)
    error_grid.fit(X, error)
    print("Percent predictions of error < 0 are: "+ str(np.mean(error_grid.predict(X) < 0)))
    
    #The error model will have significantly higher MSE than the bias model, as the error model is predicting the squared error
    print(f'Best error model for {before_year}: {model[-1].__class__.__name__} with MSE of {error_grid.best_score_}')
    
    file_path_error = f'../../models/Polls_{before_year}_error.pkl'
    with open(file_path_error, 'wb') as file:
        pkl.dump(error_grid, file)

    bias_grid = RandomizedSearchCV(model, param_dist, n_iter=50, cv=cv, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1, verbose = 1)
    bias_grid.fit(X, bias)
    file_path_bias = f'../../models/Polls_{before_year}_bias.pkl'
    with open(file_path_bias, 'wb') as file:
        pkl.dump(bias_grid, file)
        
    print(f'Best bias model for {before_year}: {model[-1].__class__.__name__} with MSE of {bias_grid.best_score_}')

    

In [117]:
years = range(2002, 2026, 2)

param_dist_svr = {
    'C': loguniform(1e-3, 1e3),  # Regularization parameter
    'degree': randint(2, 6),  # Degree of the polynomial kernel function (only relevant if kernel='poly')
    'gamma': loguniform(1e-4, 1e1),  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'coef0': uniform(0, 1),  # Independent term in kernel function (only relevant for 'poly' and 'sigmoid')
    'tol': loguniform(1e-5, 1e-1),  # Tolerance for stopping criterion
}


for year in years:
    make_pollster_rating_model(past_polls, year, SVR(), param_dist_svr)


Fitting 1 folds for each of 50 candidates, totalling 50 fits
Percent predictions of error < 0 are: 0.0
Best error model for 2002: SVR with MSE of -3939.338061169812
Fitting 1 folds for each of 50 candidates, totalling 50 fits
Best bias model for 2002: SVR with MSE of -45.914608298151165
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Percent predictions of error < 0 are: 0.0
Best error model for 2004: SVR with MSE of -4381.136016250903
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best bias model for 2004: SVR with MSE of -57.64040999892533
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Percent predictions of error < 0 are: 0.0
Best error model for 2006: SVR with MSE of -3765.6988462145614
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best bias model for 2006: SVR with MSE of -49.552334112236395
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Percent predictions of error < 0 are: 0.0
Best error model for 2008: S



Percent predictions of error < 0 are: 0.0
Best error model for 2022: SVR with MSE of -6417.210568822541
Fitting 11 folds for each of 50 candidates, totalling 550 fits




Best bias model for 2022: SVR with MSE of -48.12301416698863
Fitting 12 folds for each of 50 candidates, totalling 600 fits




Percent predictions of error < 0 are: 0.0
Best error model for 2024: SVR with MSE of -6542.50547910147
Fitting 12 folds for each of 50 candidates, totalling 600 fits
Best bias model for 2024: SVR with MSE of -47.965269706871545
