In [None]:
import os
import pandas
import random
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

def get_party_history(data, party):
    """Return election results per year for a given party, with state-by-state results as columns"""
    party_data = data[data['party_simplified']==party]
    party_history = None
    for state in states:
        history = party_data[party_data['state_po']==state][['year', 'percentvotes']]
        history = history.rename(columns={'percentvotes': state})
        if party_history is None:
            party_history = history
        else:
            party_history = party_history.join(history.set_index('year'), on='year')
    return party_history

# Load data file
fname = os.path.join('..', 'resource', 'asnlib', 'publicdata', '1976-2020-president.csv')
data = pandas.read_csv(fname)
# Add a column for the fraction of the vote that each candidate one
data['percentvotes'] = data['candidatevotes']/data['totalvotes']
# Record state codes that appear in the data
states = set(data['state_po'])


In [2]:
from itertools import combinations
import numpy as np


state = 'CA'


history = get_party_history(data,'DEMOCRAT')



best_states3 = sorted(states - {state}, key=lambda s: LinearRegression().fit(history[sorted(list({state, s}))], history[state]).score(history[sorted(list({state, s}))], history[state]), reverse=True)[:3]

best_states5 = sorted(states - {state}, key=lambda s: LinearRegression().fit(history[sorted(list({state, s}))], history[state]).score(history[sorted(list({state, s}))], history[state]), reverse=True)[:5]




In [3]:
state = 'CA'
party = 'DEMOCRAT'
history = get_party_history(data, party)
assert state not in best_states3, 'You haven\'t included California in your states'
assert len(set(best_states3) & states) == 3, 'You have not provided 3 states'
X = history[best_states3]
y = history[state]

reg = LinearRegression()
reg.fit(X, y)
score = reg.score(X, y)
print(f'Your chosen states ({", ".join(sorted(best_states3))}) lead to a model with a score of {score:.2f}')

Your chosen states (NJ, OR, RI) lead to a model with a score of 0.99


In [4]:
assert score > 0.9, 'Your score is not greater than 0.9'
print('Your model is sufficiently accurate')

Your model is sufficiently accurate


In [5]:
state = 'CA'
party = 'DEMOCRAT'
history = get_party_history(data, party)
assert state not in best_states5, 'You haven\'t included California in your states'
assert len(set(best_states5) & states) == 5, 'You have not provided 5 states'
X = history[best_states5]
y = history[state]

reg = LinearRegression()
reg.fit(X, y)
score = reg.score(X, y)
print(f'Your chosen states ({", ".join(sorted(best_states5[:5]))}) lead to a model with a score of {score:.2f}')


Your chosen states (IL, KS, NJ, OR, RI) lead to a model with a score of 0.99


In [6]:
assert score > 0.95, 'Your score is not greater than 0.95'
print('Your model is sufficiently accurate')

Your model is sufficiently accurate
