In [15]:
import pandas as pd
import numpy as np
#import shap
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, BaseCrossValidator
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, accuracy_score, median_absolute_error, make_scorer
import xgboost
from scipy.stats import loguniform, randint, uniform

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss
from hyperopt.pyll.base import Apply

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence

from missforest.missforest import MissForest

#Creating a custom time series cross-validator
class CustomTimeSeriesCV(BaseCrossValidator):
    """Creates an iterator that contains the indices from each dataset based on the years given"""
    def __init__(self, years):
        self.years = years

    def split(self, X, y=None, groups=None):
        for train_years, test_years in self.years:
            train_indices = np.where(X['year'].isin(train_years))[0]
            test_indices = np.where(X['year'].isin(test_years))[0]
            yield train_indices, test_indices
        
    def get_n_splits(self, X=None, y=None, groups=None):
        return len(self.years)

class MissForestImputer(BaseEstimator, TransformerMixin):
    def __init__(self, max_iter = 100):
        self.model = MissForest(max_iter = max_iter)
    
    def get_params(self, deep: bool = False) -> dict:
        return {'max_iter': self.model.max_iter}
    
    def fit(self, X, y=None):
        self.model.fit(X)
        return self  # Return self to enable chaining

    def transform(self, X):
        # MissForest's transform method is actually predict in most cases
        X_imputed = self.model.transform(X)
        return X_imputed

    def fit_transform(self, X, y=None, **fit_params):
        return self.model.fit_transform(X)
    

In [16]:
def optima_model(model, param_dict, X, y, use_missing = True, impute = False):
    
    X_train, X_test, y_train, y_test = (X.loc[X['year'] < 2022, :], X.loc[X['year'] == 2022, :], 
                                        y.loc[X['year'] < 2022], y.loc[X['year'] == 2022])
    

    # Create fold structure so we can make a custom cross-validation for time-series
    folds = [
        (range(2002, 2006, 2), [2006, 2008]),
        (range(2002, 2010, 2), [2010, 2012]),
        (range(2002, 2014, 2), [2014, 2016]),
        (range(2002, 2018, 2), [2018, 2020])
    ]

    cv = CustomTimeSeriesCV(folds)
        
    #Categorical features that need to be one-hot encoded    
    one_hot_fts = ['office_type', 'open_seat', 'special', 'isMidterm']
    
    #These features we use regardless of whether or not we're using XGBoost or another model that can natively handle missing values
    pass_fts_no_missing = ['incumbent_differential', 'prev_gen_margin', 'prev2_gen_margin',
    'prev_dem_gen_tp', 'mean_specials_differential', 'pvi',
    'house_chamber_margin', 'senate_chamber_margin', 'previous_cci',
    'current_cci', 'change_cci', 'previous_gas', 'current_gas',
    'change_gas', 'previous_unemployment', 'current_unemployment',
    'change_unemployment', 'white_pct',
    'black_pct', 'asian_pct', 'impoverished_pct', 'median_age', 'renting_pct', 'inflation']
    
    missing_fts = ['receipts_DEM', 'receipts_REP',
       'disbursements_DEM', 'disbursements_REP', 'unconvinced_pct',
       'valid_weighted_ba', 'phone_unweighted_ba', 'online_unweighted_ba',
       'all_unweighted_ba', 'all_unweighted', 'num_polls']
    
    #If we are able to use the missing values, we add them to the list of features
    if use_missing:
        pass_fts = pass_fts_no_missing + missing_fts
    else:
        pass_fts = pass_fts_no_missing 
        
    #These features have <1% missing so we impute (all missing is from DC data or one race in PA) 
    simple_impute_features = [
        "absenteeexcusereq", "pollhours", "avgpollhours",
        "minpollhours", "regdeadlines", "voteridlaws", "novoterid",
        "noallmailvote", "noearlyvote", "nofelonreg", "nofelonsregafterincar",
        "nonstrictid", "nonstrictphoto", "nopollplacereg", "nopr",
        "nosamedayreg", "nostateholiday", "pr16", "pr17",
        "pr175", "pr60", "pr90", "strictid",
        "strictphoto", "covi_num", "median_income"
    ]     
    
    #Combine all the features
    if impute: 
        preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts), 
        ('simple_impute', SimpleImputer(strategy='median'), simple_impute_features),
        ('iterative_impute', MissForestImputer(), missing_fts),
        ('num', 'passthrough', pass_fts_no_missing)
        ])
    else:
        preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), one_hot_fts), 
        ('impute', SimpleImputer(strategy='median'), simple_impute_features),
        ('num', 'passthrough', pass_fts)
        ])
        
    
    def objective(params):
        reg = model(**params)
        pipe = Pipeline(steps = [
            ('preprocessing', preprocessor), 
            ('scaling', StandardScaler()), 
            ('model', reg)])
        
        accuracies = []
        for train_idx, test_idx in cv.split(X_train):
            pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
            print(pipe['iterative_impute'].get_params())
            
            predictions = pipe.predict(X_train.iloc[test_idx])
            mae = median_absolute_error(y_train.iloc[test_idx], predictions)
            accuracies.append(accuracy_score(y_train.iloc[test_idx], predictions))
        print(accuracies)
        return {'loss': -1*np.mean(accuracies), 'status': STATUS_OK}


    trials = Trials()
    best_params = fmin(fn=objective,
                    space=param_dict,
                    algo=tpe.suggest,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=20))

    print("Best parameters:", best_params)
    acc_model = model(**best_params)
    pipe = Pipeline(steps = [
            ('preprocessing', preprocessor), 
            ('scaling', StandardScaler()), 
            ('model', acc_model)])
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    print(f"Accuracy: {np.mean(np.sign(y_test) == np.sign(predictions))}")
    return None


In [17]:
data = pd.read_csv("../cleaned_data/Engineered Dataset.csv")

X = data.drop(columns=['margin'])
y = data['margin']
y = y.where(y > 0, 0)
y = y.where(y <= 0, 1)


param_dist_xgb = {
    'n_estimators': hp.randint('n_estimators', 5, 300),
    'max_depth': hp.randint('max_depth', 2, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.2)),
    'subsample': hp.uniform('subsample', 0., 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'min_child_weight': hp.randint('min_child_weight', 1, 20),
    'gamma': hp.loguniform('gamma', np.log(5), np.log(20)),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(100)),
    'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(100))
}


xgb = xgboost.XGBClassifier
#optima_model(xgb, param_dist_xgb, X, y, True)


param_dist_svc = {
    'C': hp.loguniform('C', -5, 5),
    'kernel': hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    'degree': hp.randint('degree', 1, 10),
    'gamma': hp.choice('gamma', ['scale', 'auto'])
}

svc = SVC
optima_model(svc, param_dist_svc, X, y, True, True)

param_dist_logistic = {
    'C': hp.loguniform('C', -5, 5),
    'penalty': 'elasticnet',
    'solver': 'saga',
    'l1_ratio': hp.uniform('l1_ratio', 0, 1),
    'max_iter': 100000,
    'verbose': 0
}

logistic = LogisticRegression
#optima_model(logistic, param_dist_logistic, X, y, False, True)

[LightGBM] [Info] Total Bins 0                                         
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 1220350.875000             
[LightGBM] [Info] Total Bins 0                                         
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 3859955.750000             
[LightGBM] [Info] Total Bins 0                                         
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 1217813.500000             
[LightGBM] [Info] Total Bins 0                                         
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Start training from score 4718851.500000             
[LightGBM] [Info] Total Bins 0                                         


In [14]:
SimpleImputer().get_params()
IterativeImputer().get_params()
MissForest(n_estimators = 100)

TypeError: MissForest.__init__() got an unexpected keyword argument 'n_estimators'