In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from functools import partial

from polling_simulator import Demographic, Variable, generate_electorate, run_elections, run_polls
from polling_simulator.aggregation import naive_aggregation, stratified_aggregation, weight_by_self_reported_turnout
from polling_simulator.core import Segmentation
from polling_simulator.distributions import truncated_gaussian_distribution
from polling_simulator.sampling import predefined_sample

# Keep only the first and fourth colors of seaborn's current palette and make that
# two-color palette the active one for subsequent plots.
original_palette = sns.color_palette()
modified_palette = [original_palette[index] for index in (0, 3)]
sns.set_palette(modified_palette)

Now, what happens when we don't have perfect insights into all the different ways people's voting preferences are determined? Polling becomes a lot harder. To demonstrate, we'll keep the original age distribution but add another variable, educational attainment, that we don't control for in our poll.

In [5]:
# Voter age, generated by the project's truncated-Gaussian helper.
# NOTE(review): the meaning/order of the four positional arguments is not visible
# here — confirm against truncated_gaussian_distribution before changing them.
age = Variable(
    "age",
    truncated_gaussian_distribution(18, 35, 18, 110),
)

# Educational attainment: one of three labels, sampled with replacement using the
# weights 0.4 / 0.4 / 0.2. The label array stays positional so that whatever the
# caller passes next lands in np.random.choice's second positional slot.
education = Variable(
    "education",
    partial(
        np.random.choice,
        np.array(["High School", "College", "Post-College"]),
        replace=True,
        p=np.array([0.4, 0.4, 0.2]),
    ),
)

In [7]:
from dataclasses import dataclass
from itertools import product

@dataclass
class Parameterization:
    """Multipliers applied to the baseline likelihoods for one population segment.

    Each modifier scales the matching ``default_*_likelihood`` value when the
    per-demographic likelihoods are combined later in this cell.
    """
    # The slice of the population these modifiers apply to.
    segment: Segmentation
    # Multiplier on the baseline likelihood of preferring the Republican candidate.
    republican_modifier: float
    # Multiplier on the baseline turnout likelihood (1 leaves it unchanged).
    turnout_modifier: float = 1
    # Multiplier on the baseline poll-response likelihood (1 leaves it unchanged).
    response_modifier: float = 1

# One Parameterization per age bracket. Only the Republican-preference modifier is
# set; turnout and response modifiers keep their defaults.
age_parameterization = [
    Parameterization(segment=bracket, republican_modifier=modifier)
    for bracket, modifier in [
        (age < 30, 0.5),
        ((age >= 30) & (age < 50), 1.0),
        ((age >= 50) & (age < 65), 1.5),
        (age >= 65, 2),
    ]
]
# Same construction, keyed on education level instead of age.
education_parameterization = [
    Parameterization(segment=(education == level), republican_modifier=modifier)
    for level, modifier in [
        ("High School", 2),
        ("College", 1),
        ("Post-College", 0.5),
    ]
]

# Baseline likelihoods that the age and education modifiers scale.
default_turnout_likelihood = 0.5
default_response_likelihood = 0.5
default_republican_likelihood = 0.5


def _capped_likelihood(baseline, age_modifier, education_modifier):
    """Scale ``baseline`` by both modifiers and cap the result at 1."""
    scaled = baseline * age_modifier * education_modifier
    return min(scaled, 1)


# Build one Demographic for every (age bracket, education level) combination.
actual_demographics = []
for age_parameters, education_parameters in product(age_parameterization, education_parameterization):
    turnout_likelihood = _capped_likelihood(
        default_turnout_likelihood,
        age_parameters.turnout_modifier,
        education_parameters.turnout_modifier,
    )
    response_likelihood = _capped_likelihood(
        default_response_likelihood,
        age_parameters.response_modifier,
        education_parameters.response_modifier,
    )
    republican_likelihood = _capped_likelihood(
        default_republican_likelihood,
        age_parameters.republican_modifier,
        education_parameters.republican_modifier,
    )
    # Two-candidate race: whatever is not Republican preference goes to the Democrat.
    democrat_likelihood = 1 - republican_likelihood
    combined_segment = age_parameters.segment & education_parameters.segment
    actual_demographics.append(
        Demographic(
            turnout_likelihood=turnout_likelihood,
            response_likelihood=response_likelihood,
            candidate_preference={"Rep": republican_likelihood, "Dem": democrat_likelihood},
            population_segmentation=combined_segment,
        )
    )

In [8]:
# NOTE(review): scratch cell — it only shows how a built-in operator function renders
# as a string. Nothing later in the visible notebook reads the result, so this cell
# is a candidate for removal.
import operator
str(operator.gt)

'<built-in function gt>'

In [None]:
# Age-only demographics: four brackets that share a turnout likelihood of 0.7 and
# differ in response likelihood and candidate preference.

# Under 30.
age_lt_30 = Demographic(
    population_segmentation=(age < 30),
    candidate_preference={"Rep": 0.2, "Dem": 0.8},
    turnout_likelihood=0.7,
    response_likelihood=0.3,
)

# 30 up to (but not including) 50.
age_between_30_50 = Demographic(
    population_segmentation=(age >= 30) & (age < 50),
    candidate_preference={"Rep": 0.4, "Dem": 0.6},
    turnout_likelihood=0.7,
    response_likelihood=0.5,
)

# 50 up to (but not including) 65.
age_between_50_65 = Demographic(
    population_segmentation=(age >= 50) & (age < 65),
    candidate_preference={"Rep": 0.6, "Dem": 0.4},
    turnout_likelihood=0.7,
    response_likelihood=0.7,
)

# 65 and over.
age_ge_65 = Demographic(
    population_segmentation=(age >= 65),
    candidate_preference={"Rep": 0.75, "Dem": 0.25},
    turnout_likelihood=0.7,
    response_likelihood=0.9,
)

demographics = [
    age_lt_30,
    age_between_30_50,
    age_between_50_65,
    age_ge_65,
]

In [None]:
# Seed NumPy's global RNG so draws made through np.random (for example the education
# sampler) are repeatable from run to run.
np.random.seed(123)
# Generate the electorate from the age x education demographics so that education
# actually shapes voter behavior. The age-only `demographics` list is what the polls
# are told to assume (it is passed as `assumed_demographics` to run_polls), which is
# the blind spot this notebook sets out to demonstrate. Previously the electorate was
# built from `demographics`, leaving `actual_demographics` unused.
electorate = generate_electorate(1000000, actual_demographics)

In [None]:
# For each assumed demographic: the sum of its segment over the electorate divided by
# the electorate's length.
electorate_size = len(electorate)
demographic_size = [
    in_segment.sum() / electorate_size
    for in_segment in (
        demographic.population_segmentation.segment(electorate) for demographic in demographics
    )
]
demographic_size

In [None]:
# Run 500 elections, then append the per-row vote total and each party's share of it.
simulated_elections = run_elections(500, electorate).assign(
    total_votes=lambda results: results.sum(axis=1),
    dem_voteshare=lambda results: results["Dem"] / results["total_votes"],
    rep_voteshare=lambda results: results["Rep"] / results["total_votes"],
)

In [None]:
# Histogram of each party's vote share across the simulated elections.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    simulated_elections[["dem_voteshare", "rep_voteshare"]],
    element="step",
    bins=100,
    alpha=0.75,
    ax=ax,
)
# A title lets the figure stand on its own when the notebook is skimmed.
ax.set_title("Vote Share Across Simulated Elections")
ax.set_xlabel("Vote Share")
ax.set_ylabel("Number of Simulations")
# The trailing semicolon keeps the notebook from echoing set_xlim's return value.
ax.set_xlim([0.4, 0.6]);

In [None]:
# Slow cell: expect roughly ten minutes of runtime.
# Polls that assume the age-only demographics and use naive_aggregation().
naive_poll_results = run_polls(
    electorate=electorate,
    assumed_demographics=demographics,
    num_polls=500,
    num_to_poll=1000,
    sampling_strategy=predefined_sample(screen_likely_voters=False, max_num_attempts=1),
    aggregation_strategy=naive_aggregation(),
)

In [None]:
# Histogram of the Dem / Rep columns of the naive poll results.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    naive_poll_results[["Dem", "Rep"]],
    element="step",
    bins=50,
    alpha=0.75,
    ax=ax,
)
# A title lets the figure stand on its own when the notebook is skimmed.
ax.set_title("Naive Poll Results")
ax.set_xlabel("Vote Share")
ax.set_ylabel("Number of Simulations")
# The trailing semicolon keeps the notebook from echoing set_xlim's return value.
ax.set_xlim([0.40, 0.60]);

In [None]:
# Slow cell: expect roughly ten minutes of runtime.
# Same polling setup as the naive run, but aggregated with stratified_aggregation
# using the age-only demographics and their measured sizes.
stratified_poll_results = run_polls(
    electorate=electorate,
    assumed_demographics=demographics,
    num_polls=500,
    num_to_poll=1000,
    sampling_strategy=predefined_sample(screen_likely_voters=False, max_num_attempts=1),
    aggregation_strategy=stratified_aggregation(demographics, demographic_size),
)

In [None]:
# Histogram of the Dem / Rep columns of the stratified poll results.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    stratified_poll_results[["Dem", "Rep"]],
    element="step",
    bins=50,
    alpha=0.75,
    ax=ax,
)
# A title lets the figure stand on its own when the notebook is skimmed.
ax.set_title("Stratified Poll Results")
ax.set_xlabel("Vote Share")
ax.set_ylabel("Number of Simulations")
# The trailing semicolon keeps the notebook from echoing set_xlim's return value.
ax.set_xlim([0.40, 0.60]);