# Registration Type Time Series

- In this notebook I build a model that predicts the number of registered voters of each registration type for the State of Colorado, and then forecast those numbers roughly a year ahead.

In [1]:
import itertools
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.multioutput import MultiOutputRegressor
%matplotlib inline

  from pandas.core import datetools


In [2]:
df = pd.read_csv("./Data/registration_totals_and_changes.csv")

In [3]:
df.columns

Index(['Date', 'County', 'Active Dem', 'Active Rep', 'Active Uaf',
       'Total Active', 'Inactive Dem', 'Inactive Rep', 'Inactive Uaf',
       'Total Inactive', 'Grand Total', 'Year', 'Month', 'Label',
       'Active Dem_change', 'Active Rep_change', 'Active Uaf_change',
       'Total Active_change', 'Inactive Dem_change', 'Inactive Rep_change',
       'Inactive Uaf_change', 'Total Inactive_change', 'Grand Total_change'],
      dtype='object')

In [4]:
# Keep only the identifying columns and the three active-registration change columns.
# .copy() makes this an independent frame, so the feature columns assigned in later
# cells are not written into a slice of the raw data (SettingWithCopyWarning).
df = df[['Date', 'County', 'Year', 'Month', 'Label','Active Dem_change', 'Active Rep_change', 'Active Uaf_change']].copy()

In [5]:
df.head()

Unnamed: 0,Date,County,Year,Month,Label,Active Dem_change,Active Rep_change,Active Uaf_change
0,2012-11-28,Yuma,2012,nov,REP,,,
1,2012-11-28,La Plata,2012,nov,DEM,,,
2,2012-11-28,Lake,2012,nov,DEM,,,
3,2012-11-28,Larimer,2012,nov,DEM,,,
4,2012-11-28,Las Animas,2012,nov,Swing,,,


In [8]:
# Calendar flags derived from year parity + month label.
# The key looks like "0aug" (even year, August) or "1jan" (odd year, January).
year_parity = df["Year"] % 2
cycle_key = year_parity.apply(str) + df["Month"]

# Keep the parity numeric: it is used directly as a model feature later on, and the
# forecasting function in this notebook also stores Odd_year as an integer.
df["Odd_year"] = year_parity.astype(int)

# December of even years and January/February of odd years.
df["Drop_month"] = cycle_key.isin(["0dec", "1jan", "1feb"]).astype(int)
# August through October of even years.
df["Election_buzz"] = cycle_key.isin(["0aug", "0sept", "0oct"]).astype(int)
# May and June of even years.
df["Before_prime"] = cycle_key.isin(["0may", "0june"]).astype(int)
# July of even years.
df["After_prime"] = (cycle_key == "0july").astype(int)

In [14]:
df.tail()

Unnamed: 0,Date,County,Year,Month,Label,Active Dem_change,Active Rep_change,Active Uaf_change,Odd_year,Drop_month,Election_buzz,Before_prime,After_prime
3835,2017-10-28,Huerfano,2017,oct,DEM,-0.005023,0.003949,0.013879,1,0,0,0,0
3836,2017-10-28,Jackson,2017,oct,REP,-0.009091,-0.001429,0.024631,1,0,0,0,0
3837,2017-10-28,Jefferson,2017,oct,DEM,0.002604,0.000444,0.009299,1,0,0,0,0
3838,2017-10-28,Dolores,2017,oct,REP,-0.002994,0.0,0.021127,1,0,0,0,0
3839,2017-10-28,Alamosa,2017,oct,DEM,0.0,0.001943,0.012032,1,0,0,0,0


Create the data frame of smoothed changes and lagged features that the model is trained and evaluated on.

In [17]:
def get_time_series(df):
    """Build a per-county table of smoothed registration changes and their lags.

    For every county the three ``Active *_change`` columns are smoothed with a
    3-row rolling mean, and copies of the smoothed values shifted by 1, 2 and 3
    rows are added as ``"<column>- <lag>"`` feature columns.

    Rows are processed in the order they appear in ``df``; each county's rows
    are expected to already be in date order, since no sorting is done here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``County``, ``Year``, ``Month``, ``Label``,
        ``Odd_year``, ``Drop_month``, ``Election_buzz`` and the three
        ``Active *_change`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date``, counties stacked in order of first appearance.
        Columns are the descriptive columns, then the lag-3, lag-2 and lag-1
        features, then the smoothed targets. The first rows of every county
        contain NaN (from the rolling window and the shifts); they are not
        dropped here. An empty frame is returned when ``df`` has no rows.
    """
    change_cols = ['Active Dem_change', 'Active Rep_change', 'Active Uaf_change']
    constant_cols = ["County", "Year", "Month", "Label", "Odd_year", "Drop_month", "Election_buzz"]

    # Collect one frame per county and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all previous rows each time.
    county_frames = []
    for name in df["County"].unique():
        county = df[df["County"] == name].set_index("Date")
        smoothed = county[change_cols].rolling(3).mean()

        # Oldest lag first so the column order is lag 3, lag 2, lag 1, current.
        lagged = []
        for lag in (3, 2, 1):
            shifted = smoothed.shift(lag)
            shifted.columns = [col + "- " + str(lag) for col in shifted.columns]
            lagged.append(shifted)

        county_frames.append(pd.concat([county[constant_cols]] + lagged + [smoothed], axis=1))

    if not county_frames:
        return pd.DataFrame()
    return pd.concat(county_frames)

In [19]:
reg_type_ts = get_time_series(df)

In [20]:
# Discard the leading rows of each county that have no complete rolling/lag history.
reg_type_ts = reg_type_ts.dropna()

In [22]:
reg_type_ts.head()

Unnamed: 0_level_0,County,Year,Month,Label,Odd_year,Drop_month,Election_buzz,Active Dem_change- 3,Active Rep_change- 3,Active Uaf_change- 3,Active Dem_change- 2,Active Rep_change- 2,Active Uaf_change- 2,Active Dem_change- 1,Active Rep_change- 1,Active Uaf_change- 1,Active Dem_change,Active Rep_change,Active Uaf_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2013-05-28,Yuma,2013,may,REP,1,0,0,0.004977,0.003519,0.014481,0.002477,0.002875,0.007006,0.002057,0.002994,0.008257,0.002869,0.002489,0.008473
2013-06-28,Yuma,2013,june,REP,1,0,0,0.002477,0.002875,0.007006,0.002057,0.002994,0.008257,0.002869,0.002489,0.008473,0.003683,0.001984,0.007108
2013-07-28,Yuma,2013,july,REP,1,0,0,0.002057,0.002994,0.008257,0.002869,0.002489,0.008473,0.003683,0.001984,0.007108,0.03859,0.029151,0.079626
2013-08-28,Yuma,2013,aug,REP,1,0,0,0.002869,0.002489,0.008473,0.003683,0.001984,0.007108,0.03859,0.029151,0.079626,0.032876,0.026033,0.07683
2013-09-28,Yuma,2013,sept,REP,1,0,0,0.003683,0.001984,0.007108,0.03859,0.029151,0.079626,0.032876,0.026033,0.07683,0.033545,0.025051,0.076922


In [23]:
reg_type_ts.columns

Index(['County', 'Year', 'Month', 'Label', 'Odd_year', 'Drop_month',
       'Election_buzz', 'Active Dem_change- 3', 'Active Rep_change- 3',
       'Active Uaf_change- 3', 'Active Dem_change- 2', 'Active Rep_change- 2',
       'Active Uaf_change- 2', 'Active Dem_change- 1', 'Active Rep_change- 1',
       'Active Uaf_change- 1', 'Active Dem_change', 'Active Rep_change',
       'Active Uaf_change'],
      dtype='object')

In [24]:
# Targets: the smoothed change for each registration type.
change_cols = ['Active Dem_change', 'Active Rep_change', 'Active Uaf_change']
# Predictors: the calendar flags followed by the lag-3, lag-2 and lag-1 columns.
cycle_flags = ['Odd_year', 'Drop_month', 'Election_buzz']
lag_cols = [col + "- " + str(lag) for lag in (3, 2, 1) for col in change_cols]

X = reg_type_ts[cycle_flags + lag_cols]

y = reg_type_ts[change_cols]

In [44]:
# Wrap the gradient-boosting regressor so it can predict the three target columns at once.
# random_state is fixed so that re-running the notebook reproduces the same scores.
mor = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=200, random_state=42), n_jobs=1)

In [45]:
# cv is pinned to 3 folds: the library default changed from 3 to 5 in scikit-learn 0.22,
# and the scores recorded below come from a 3-fold run.
cross_val_score(mor, X, y, cv=3)

array([0.67710136, 0.67542279, 0.68632218])

In [47]:
# Out-of-fold predictions for every row, using the same 3-fold split as the scoring
# cell (the library default changed from 3 to 5 folds in scikit-learn 0.22).
pred = cross_val_predict(mor, X, y, cv=3)

In [50]:
# Attach the out-of-fold predictions to the feature frame, one "_pred" column per target.
pred_columns = ['Active Dem_change_pred', 'Active Rep_change_pred','Active Uaf_change_pred']
pred_frame = pd.DataFrame(pred, columns=pred_columns, index=reg_type_ts.index)
reg_type_ts_pred = pd.concat([reg_type_ts, pred_frame], axis=1)

In [52]:
#reg_type_ts_pred.to_csv("reg_type_ts_pred.csv")

# Forecasting

I will be forecasting out 17 months. Most of my testing and experimenting was done in the Grand Total Time Series notebook. 

In [4]:
# Suppress every warning from here on. NOTE(review): this also hides any warnings
# raised while the forecasting models below are being fitted — consider narrowing
# the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
df_f = pd.read_csv("./Data/registration_totals_and_changes.csv")

In [4]:
df_f.columns

Index(['Date', 'County', 'Active Dem', 'Active Rep', 'Active Uaf',
       'Total Active', 'Inactive Dem', 'Inactive Rep', 'Inactive Uaf',
       'Total Inactive', 'Grand Total', 'Year', 'Month', 'Label',
       'Active Dem_change', 'Active Rep_change', 'Active Uaf_change',
       'Total Active_change', 'Inactive Dem_change', 'Inactive Rep_change',
       'Inactive Uaf_change', 'Total Inactive_change', 'Grand Total_change'],
      dtype='object')

In [2]:
def get_reg_type_forecast(file_path, steps=17, start_date="2017-11-28",
                          exog_window=("2013-11-28", "2015-03-28")):
    """Fit one SARIMAX model per county and registration type and forecast ahead.

    Parameters
    ----------
    file_path : str
        CSV to read, e.g. ``"./Data/registration_totals_and_changes.csv"``.
        It must contain ``Date``, ``County``, ``Year``, ``Month``, ``Label``,
        ``Active Dem``, ``Active Rep`` and ``Active Uaf`` columns.
    steps : int, optional
        Number of monthly periods to forecast (default 17).
    start_date : str, optional
        First forecast date, formatted ``YYYY-MM-DD`` (default ``"2017-11-28"``).
    exog_window : tuple of (str, str), optional
        Inclusive ``Date`` labels delimiting the historical rows whose calendar
        flags (``Odd_year``, ``Drop_month``, ``Election_buzz``) are reused as
        the exogenous values for the forecast horizon. The window must contain
        exactly ``steps`` rows for every county.

    Returns
    -------
    pandas.DataFrame
        Counties stacked on top of each other. For each registration type there
        is a column with the observed values and a ``"<type> Forecast"`` column
        with the predictions, plus ``County`` and ``Label``. Observed rows keep
        the ``Date`` strings from the file as index labels, while forecast rows
        are labelled with ``datetime`` objects. An empty frame is returned when
        the file has no rows.

    Raises
    ------
    ValueError
        If a county's ``exog_window`` slice does not contain ``steps`` rows.
    """
    reg_types = ['Active Dem', 'Active Rep', 'Active Uaf']

    df_f = pd.read_csv(file_path)
    # set_index returns a new frame, so the column assignments below do not
    # write into a slice of the frame that was read from disk.
    df_f = df_f[['Date', 'County', 'Year', 'Month', 'Label'] + reg_types].set_index("Date")

    # Calendar flags keyed on year parity + month label, e.g. "0aug" or "1jan".
    year_parity = df_f["Year"] % 2
    cycle_key = year_parity.apply(str) + df_f["Month"]
    df_f["Odd_year"] = year_parity.astype(int)
    df_f["Drop_month"] = cycle_key.isin(["0dec", "1jan", "1feb"]).astype(int)
    df_f["Election_buzz"] = cycle_key.isin(["0aug", "0sept", "0oct"]).astype(int)

    # The forecast dates are the same for every county and type, so build them once.
    first_forecast = datetime.strptime(start_date, "%Y-%m-%d")
    date_list = [first_forecast + relativedelta(months=x) for x in range(steps)]
    window_start, window_end = exog_window

    county_frames = []
    for count in df_f["County"].unique():
        county = df_f[df_f["County"] == count]
        exog = county[["Odd_year", "Drop_month", "Election_buzz"]]
        # Historical flags reused as the exogenous values for the forecast horizon.
        exog_p = exog[window_start:window_end]
        if len(exog_p) != steps:
            raise ValueError(
                "exog_window selects %d rows for county %r but steps=%d"
                % (len(exog_p), count, steps)
            )

        type_frames = []
        for reg_type in reg_types:
            county_f = county[reg_type]
            mod = sm.tsa.statespace.SARIMAX(county_f.values, exog=exog.values, order=(3, 1, 0), seasonal_order=(1, 1, 0, 12),
                                            trend="t", time_varying_regression=True, mle_regression=False, enforce_stationarity=False)
            res = mod.fit()

            future = pd.DataFrame(index=date_list)
            future[reg_type + " Forecast"] = res.forecast(steps, exog=exog_p)
            # Stack history on top of the forecast rows; if the history Series
            # comes through as column 0, give it the registration-type name.
            county_fv = pd.concat([county_f, future])
            county_fv = county_fv.rename(columns={0: reg_type})
            type_frames.append(county_fv)

        county_fv_total = pd.concat(type_frames, axis=1)
        county_fv_total["County"] = count
        county_fv_total["Label"] = county["Label"].iloc[0]
        county_frames.append(county_fv_total)

    if not county_frames:
        return pd.DataFrame()
    return pd.concat(county_frames)


In [5]:
county_reg_type_forecast = get_reg_type_forecast("./Data/registration_totals_and_changes.csv")

In [36]:
#county_reg_type_forecast.to_csv("county_reg_type_forecast.csv")