# Federal Polling Average

This is the polling averaging model behind the [cdnpo.li](https://cdnpo.li) website.

### Data Processing
The data for this is open source, on GitHub. However, some pre-processing is required. First, we need to convert the ratio of new polling data. This is done through `apply_fraction`. Note that this should probably be replaced at some point by a method to only record the most recent of previous polls and otherwise dropping their weights to `0.0`

In [2]:
import pandas as pd
import locale
import re
import numpy as np

# "1,000 (1/4)" = 250
def apply_fraction(s):
    if isinstance(s, int):
        return s
    if isinstance(s, float):
        if np.isnan(s):
            return 100
        return int(s)
    split = s.split()
    sample = int(split[0].replace(',', ''))
    regex = r"\/\d"
    if len(split) > 1:
        sample /= int(re.findall(regex, split[1])[0].replace("/", ""))
    return int(sample)

def parse_margin(s):
    if isinstance(s, float):
        return s
    trimmed = s.replace('±', '')
    split = trimmed.split()
    try:
        return float(split[0])
    except ValueError:
        return np.nan

def sort_margin_fraction(df):
    df = df.replace('-', np.nan)
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    df['Margin'] = df['Margin'].apply(lambda x: parse_margin(x))
    df['Sample'] = df['Sample'].apply(lambda s: apply_fraction(s))
    df.sort_values('Date', inplace=True)
    df.index = df['Date']
    return df

#df2008 = sort_margin_fraction(pd.read_csv("https://raw.githubusercontent.com/ErikPartridge/canadian-election-data/master/federal/2008%20Federal%20Polling.csv"))
#df2011 = sort_margin_fraction(pd.read_csv("https://raw.githubusercontent.com/ErikPartridge/canadian-election-data/master/federal/2011%20Federal%20Polling.csv"))
df2015 = sort_margin_fraction(pd.read_csv("https://raw.githubusercontent.com/ErikPartridge/canadian-election-data/master/federal/2015%20Federal%20Polling.csv"))
df2019 = sort_margin_fraction(pd.read_csv("https://raw.githubusercontent.com/ErikPartridge/canadian-election-data/master/federal/2019%20Federal%20Polling.csv"))
df_results = pd.read_csv("https://raw.githubusercontent.com/ErikPartridge/canadian-election-data/master/federal/2008-2015%20Federal%20Results.csv")
df_combined = sort_margin_fraction(pd.read_csv("https://raw.githubusercontent.com/ErikPartridge/canadian-election-data/master/federal/2008-2019%20Federal%20Polling.csv"))
df2019['Date'] = df2019["Date"].apply(lambda x: x.strftime('%Y-%m-%d'))


### Approach
We use the expontential weighted moving average to compute the 

In [3]:
from math import log

df2019['alphas'] = (np.log(df2019['Sample'].astype(float)) / np.log(6)) / 30
df2015['alphas'] = (np.log(df2015['Sample'].astype(float)) / np.log(6)) / 30
print(df2019['alphas'].median())
def ewm(arr, alphas):
    """
    Calculate the EMA of an array arr
    :param arr: numpy array of floats
    :param alpha: float between 0 and 1
    :return: numpy array of floats
    """
    # initialise ewm_arr
    ewm_arr = np.zeros_like(arr)
    ewm_arr[0] = arr[0]
    for t in range(1,arr.shape[0]):
        ewm_arr[t] = alphas[t]*arr[t] + (1 - alphas[t-1])*ewm_arr[t-1]
    return ewm_arr


0.1285283152857514


In [4]:
def apply_ewm(df):
    df['EWM_Liberal'] = ewm(df['Liberal'], df['alphas'])
    df['EWM_Conservative'] = ewm(df['Conservative'], df['alphas'])
    df['EWM_NDP'] = ewm(df['NDP'], df['alphas'])
    df['EWM_Green'] = ewm(df['Green'], df['alphas'])
    df['EWM_BQ'] = ewm(df['Bloc QC'], df['alphas'])
    return df
df2019 = apply_ewm(df2019)
df2015 = apply_ewm(df2015)
print(df2019[['EWM_Liberal', 'EWM_Conservative', 'EWM_NDP', 'EWM_Green', 'EWM_BQ']].tail())
print("Compared to 2015")
print(df2015[['EWM_Liberal', 'EWM_Conservative', 'EWM_NDP', 'EWM_Green', 'EWM_BQ']].tail())

            EWM_Liberal  EWM_Conservative    EWM_NDP  EWM_Green    EWM_BQ
Date                                                                     
2019-03-25    32.957569         37.878450  16.143103        NaN  3.952589
2019-03-27    31.266392         36.644434  16.125300        NaN  3.930004
2019-03-27    31.103601         37.075782  16.751927        NaN  4.067549
2019-03-29    30.700514         35.904989  16.344751        NaN  3.955556
2019-04-05    30.936716         35.812029  16.412058        NaN  3.960121
Compared to 2015
            EWM_Liberal  EWM_Conservative    EWM_NDP  EWM_Green    EWM_BQ
Date                                                                     
2015-10-17    36.475697         31.428351  22.634243   4.565095  4.573819
2015-10-17    35.881739         31.335434  22.226604   4.642995  4.650449
2015-10-18    36.049868         31.572887  22.077663   4.802637  4.709312
2015-10-18    36.288814         31.105761  21.619596   4.521451  4.844647
2015-10-18    36.2737

| Alpha Equation | Mean Alpha | Error Lib | Error Con | Error NDP | Err Green | Err BQ |
|--------------|----------|---------|---------|---------|---------|------|

In [5]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import BayesianRidge
clf = BayesianRidge()
clf.fit(df_results['Conservative'].values.reshape(-1, 1) / 100, df_results['Liberal - Seats'].values.reshape(-1, 1) / 335)

  y = column_or_1d(y, warn=True)


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)

In [6]:
clf.predict([[0.30676]])

array([0.62194802])

In [7]:
from fbprophet import Prophet

df_combined['Date', 'Liberal']

ModuleNotFoundError: No module named 'fbprophet'