In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json
from config import api_key

from census import Census
from us import states

In [None]:
# what is going on?

## Defining three functions:
* census_data(year): return census data of that year.
* vote_data(year): return vote data of that year.
* get_dataset(year): merge census and vote data, clean and format them.
* Example usage: data_2016 = get_dataset(2016)

In [None]:
# census_data() function, returning the census data of a given year.
def census_data(YEAR):
    """Fetch state-level ACS 5-year census data for ``YEAR`` and derive rates.

    Parameters
    ----------
    YEAR : int
        Survey year handed to the ``Census`` client.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``State``. Contains the selected population, income, age,
        race and housing columns, the derived ``Poverty_rate`` and
        ``Unemployment_rate`` columns, plus every column that
        ``Resources/state_centroids.csv`` contributes through the merge on
        ``State``. The merge uses the ``pd.merge`` default (inner join), so
        states absent from the centroid file are dropped.
    """
    acs_variables = (
        'NAME', 'B19013_001E', 'B19301_001E', 'B23025_003E',
        'B23025_004E', 'B23025_005E', 'B01003_001E', "B01002_001E",
        "B17001_002E", 'B01002_002E', 'B01002_003E',
        'B02001_002E', 'B02001_003E', 'B02001_005E', 'B03001_003E',
        'B02001_004E', 'B25035_001E',
    )
    readable_names = {
        'state': 'State#',
        'NAME': 'State',
        'B01003_001E': 'Total_population',
        'B23025_003E': 'Employable_civilians',
        'B23025_004E': 'Employed_civilians',
        'B23025_005E': 'Unemployed_civilians',
        'B19013_001E': 'Income_median',
        'B19301_001E': 'Income_per_capita',
        "B01002_001E": 'median age',
        "B17001_002E": 'Poverty_count',
        'B01002_002E': 'median_male_age',
        'B01002_003E': 'median_female_age',
        'B02001_002E': 'population_white_alone',
        'B02001_003E': 'population_black_alone',
        'B02001_005E': 'population_asian_alone',
        'B03001_003E': 'population_hispanic_origin',
        'B02001_004E': 'population_american_indian_alone',
        'B25035_001E': 'median_house_construction_year',
    }
    c = Census(api_key, year=YEAR)
    raw_data = c.acs5.get(acs_variables, {'for': 'state:*'})
    census_df_raw = pd.DataFrame(raw_data).rename(columns=readable_names)
    census_df_raw = census_df_raw.sort_values('State#').reset_index(drop=True)

    # Reformat the census data. The explicit .copy() makes census_df an
    # independent frame, so the column assignments below never write into
    # (or warn about) a view of census_df_raw.
    census_df = census_df_raw.loc[:, [
        'State', 'Total_population', 'Income_median', 'Income_per_capita',
        'median_male_age', 'median_female_age',
        'population_white_alone', 'population_black_alone',
        'population_asian_alone', 'population_hispanic_origin',
        'population_american_indian_alone', 'median_house_construction_year',
    ]].copy()
    census_df['Poverty_rate'] = census_df_raw.Poverty_count / census_df_raw.Total_population
    census_df['Unemployment_rate'] = census_df_raw.Unemployed_civilians / census_df_raw.Employable_civilians

    # Attach the per-state columns stored in the centroid file.
    state_df = pd.read_csv('Resources/state_centroids.csv')
    census_df = pd.merge(census_df, state_df, on='State')
    return census_df.set_index('State')

In [None]:
def vote_data(YEAR):
    """Return per-state Democrat, Republican and total votes for ``YEAR``.

    Reads the module-level ``vote_df`` frame. Candidate votes are summed per
    state for each of the two parties; ``totalvotes`` is averaged per state.
    The result is indexed by state, with column names prefixed by the year.
    """
    in_year = vote_df.year == YEAR

    def party_total(party_name):
        # Per-state sum of candidate votes for a single party in this year.
        rows = vote_df.loc[in_year & (vote_df.party == party_name), ['state', 'candidatevotes']]
        return rows.groupby('state')['candidatevotes'].sum()

    demo = party_total('democrat')
    rep = party_total('republican')
    total = vote_df.loc[in_year, ['state', 'totalvotes']].groupby('state')['totalvotes'].mean()

    prefix = str(YEAR)
    vote_df_year = pd.concat([demo, rep, total], axis=1)
    vote_df_year.columns = [prefix + '_democrat_votes',
                            prefix + '_republican_votes',
                            prefix + '_total_votes']
    return vote_df_year

In [None]:
# Build the combined census + vote dataset for one year. Takes one argument: the year.
def get_dataset(YEAR):
    """Join ``census_data(YEAR)`` with ``vote_data(YEAR)`` on their indexes.

    Adds a ``votes difference`` column (Democrat minus Republican votes) and a
    ``winner`` column set to 'Republican' for a negative difference and
    'Democrat' for a positive one; an exact tie leaves ``winner`` unset.
    """
    dem_col = str(YEAR) + '_democrat_votes'
    rep_col = str(YEAR) + '_republican_votes'

    dataset = census_data(YEAR).merge(vote_data(YEAR), left_index=True, right_index=True)
    margin = dataset[dem_col] - dataset[rep_col]
    dataset['votes difference'] = margin
    dataset.loc[margin < 0, 'winner'] = 'Republican'
    dataset.loc[margin > 0, 'winner'] = 'Democrat'
    return dataset

In [None]:
#TK Code

In [None]:
file_to_load = "Resources/RaceAgeVotes.csv"

# Read the race/age voting-rate file into a pandas DataFrame.
df = pd.read_csv(file_to_load, sep=',', header=(0))

# Voting rates per age band, one row per year.
age_df = df[['Year','Voted_18-24%','Voted_25-44%','Voted_45-64%','Voted_65andOver%']]

# Keep years before 2018 and drop rows with missing values. The original
# called dropna() without storing the result, so incomplete rows were never
# actually removed from the frame that gets plotted.
age_plot_df = age_df[(age_df['Year'] < 2018)].dropna()
age_plot_df

In [None]:
# Set the default figure size BEFORE the first plt.plot() call. pyplot creates
# the figure on the first plotting call, so changing rcParams afterwards (as
# the original did) only affected later figures, never this chart. The rcParams
# change is kept global, so later figures created without an explicit figsize
# still pick up 20x5 as before.
fig_size = [20, 5]
plt.rcParams["figure.figsize"] = fig_size

# (column, marker, colour, legend label) for each age band.
age_bands = [
    ('Voted_18-24%', "o", "blue", '18-24yrs'),
    ('Voted_25-44%', "x", "red", '25-44yrs'),
    ('Voted_45-64%', "^", "green", '45-64yrs'),
    ('Voted_65andOver%', "*", "orange", '65yrs & Over'),
]
for column, marker, color, label in age_bands:
    plt.plot(age_plot_df['Year'], age_plot_df[column],
             linewidth=3, marker=marker, color=color, label=label)

plt.title("Voting Rates Over Time for the Voting-Age Population")
plt.xlabel("Year")
plt.ylabel("Percentage Voted")
plt.legend()
plt.savefig('TK-results/VotingAgeChart.png')

In [None]:
race_df = df[['Year','Total_Registered_population%','Voted_White_population%','Voted_Black_ population%','Voted_Asian_population%','Voted_Hispanic_population%']]

# Filter on race_df's own Year column (the original masked it with age_df,
# which only worked because both frames happened to share an index) and keep
# the result of dropna(), which the original discarded.
race_plot_df = race_df[(race_df['Year'] < 2018) & (race_df['Year'] >= 1978)].dropna()
year = race_plot_df['Year']
White = race_plot_df['Voted_White_population%']
Black = race_plot_df['Voted_Black_ population%']
Asian = race_plot_df['Voted_Asian_population%']
Hispanic = race_plot_df['Voted_Hispanic_population%']
Registered = race_plot_df['Total_Registered_population%']

x = np.arange(len(year))  # the label locations
width = .20

fig, ax = plt.subplots()
ax2 = ax.twinx()
rects1 = ax.bar(x + .30, White, width, label = 'White',color = 'lightblue')
rects2 = ax.bar(x + .10, Black, width, label = 'Black',color = 'pink')
rects3 = ax.bar(x - .10, Asian, width, label = 'Asian',color = 'lightgreen')
rects4 = ax.bar(x - .30, Hispanic, width, label = 'Hispanic',color = 'orange')
# Plot against the bar positions. Passing the Series alone would place the
# points at the DataFrame index values, which no longer start at 0 after the
# year filter, so the line would not line up with the bars.
ax2.plot(x, Registered, marker='o', markerfacecolor='blue', markersize=12, color='black', linewidth=4, label ='Registered Vote')

ax.set_ylabel('Percentage Voted')
ax2.set_ylabel('Percentage Registered')
ax.set_title('Voting Rates Over Time for the Race Population')
ax.set_xticks(x)
ax.set_xticklabels(year)

# One legend covering both axes, so the 'Registered Vote' line label is shown.
bar_handles, bar_labels = ax.get_legend_handles_labels()
line_handles, line_labels = ax2.get_legend_handles_labels()
ax.legend(bar_handles + line_handles, bar_labels + line_labels, loc="best")

def autolabel(rects):
    """Write each bar's height just above it on the module-level ``ax``.

    ``rects`` is an iterable of bar patches. The label is centred on the bar
    and lifted 3 points above its top edge.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height}',
            xy=(bar_centre, bar_height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


# Label every bar group in the order they were drawn.
for bar_group in (rects1, rects2, rects3, rects4):
    autolabel(bar_group)

fig.tight_layout()

# Show the chart, then write the same Figure object to disk.
plt.show()
fig.savefig('TK-results/VotingRaceChart.png')

In [None]:
#TK code ends

In [None]:
#Adam code

In [None]:
#Questions
#Are there any trends in how states voted (republican or democrat) over time? 
#How are those trends shifting over time? Which states experienced larger shifts in voting preference?
#Is there correlation between unemployment rates by state and voting preferences?

#Datasets: Harvard Election Results Data by state, Iowa State University Unemployment rate history by state, State_centroids.csv



In [None]:
# Load the tab-separated presidential results file into `df`
# (the first row holds the column names).
file = "Resources/1976-2016-president.tab"
df = pd.read_csv(file, sep='\t', header=0)

In [None]:
# Total candidate votes per party across the whole file, largest first.
party = (
    df.groupby("party")["candidatevotes"]
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)

In [None]:
# Group the presidential results by party and state. This is a lazy GroupBy
# object; nothing is aggregated until a reduction such as .sum() is called.
party_state = df.groupby(["party", "state"])
#party_state.sum()

In [None]:
# All party labels that occur in the data.
parties = list(df["party"].unique())

# Consolidate minor labels into the two major parties: "conservative" counts
# as republican; "liberal party" and "democratic-farmer-labor" count as
# democrat.
combined_dem_rep = df.replace("conservative","republican")
combined_dem_rep = combined_dem_rep.replace("liberal party", "democrat")
combined_dem_rep = combined_dem_rep.replace("democratic-farmer-labor","democrat")
dems = combined_dem_rep.loc[combined_dem_rep["party"] == "democrat"]
repubs = combined_dem_rep.loc[combined_dem_rep["party"] == "republican"]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames in the same order with the same index.
dems_repubs = pd.concat([dems, repubs])

# Candidate and total votes summed per (party, state).
state_count = dems_repubs.groupby(["party", "state"])
state_count = pd.DataFrame(state_count[["candidatevotes", "totalvotes"]].sum())
#state_count

In [None]:
# Per-state vote sums for each party over every year in the file.
vote_columns = ["state", "candidatevotes", "totalvotes"]
dem_votes_by_state = dems[vote_columns].groupby("state").sum()
repubs_votes_by_state = repubs[vote_columns].groupby("state").sum()

# Line the two parties up side by side; the merge suffixes (_x = democrat,
# _y = republican) are renamed to readable column names.
dems_merged_repubs = pd.merge(
    dem_votes_by_state, repubs_votes_by_state, on="state", how="outer"
).rename(columns={
    "candidatevotes_x": "democrat votes",
    "candidatevotes_y": "republican votes",
})
dems_merged_repubs = dems_merged_repubs[['democrat votes', 'republican votes']]

# Margin as a share of the two-party vote: (D - R) / (D + R).
dem_total = dems_merged_repubs["democrat votes"]
rep_total = dems_merged_repubs["republican votes"]
dems_merged_repubs["Difference%ofTotal"] = (dem_total - rep_total)/(dem_total + rep_total)
dems_merged_repubs = dems_merged_repubs.sort_values("Difference%ofTotal", ascending=False)



In [None]:
plots = list(dems_merged_repubs["Difference%ofTotal"])
x_axis = list(dems_merged_repubs.index)

# Colour each bar by the sign of its value instead of a hand-counted list
# (previously 21 "blue" followed by 30 "red"): a positive margin means more
# democrat than republican votes -> blue, anything else -> red. This stays
# correct if the data or the number of states changes.
colors = ["blue" if margin > 0 else "red" for margin in plots]

plt.figure(figsize=(40,20))
plt.bar(x_axis, plots, color=colors, alpha=0.5, align="center")
plt.xlabel("States", fontsize=33)
plt.ylabel("democrat - republican votes / total votes", fontsize=29)
plt.title("Difference from total democrat votes as % of total votes 1976 - 2016", fontsize=33)
plt.xticks(rotation=90, fontsize=33)
plt.yticks(fontsize=33)
plt.savefig("Adam_Output/OverallVotePreference.png")
plt.show()


In [None]:
# Split the two-party rows into an early (1980-1996) and a late (2000 onward)
# period; rows before 1980 are excluded from both.
dems_repubs_1980_forward = dems_repubs.loc[lambda rows: rows["year"] >= 1980]
dems_repubs_1980_1996 = dems_repubs_1980_forward.loc[lambda rows: rows["year"] <= 1996]
dems_repubs_2000_2016 = dems_repubs_1980_forward.loc[lambda rows: rows["year"] >= 2000]

In [None]:
# Compare each state's two-party margin in 1980-1996 with 2000-2016.

def _state_votes(frame, party_name):
    """Sum candidatevotes per state for one party, as a one-column DataFrame."""
    party_rows = frame.loc[frame["party"] == party_name]
    return party_rows.groupby("state")["candidatevotes"].sum().to_frame()


# --- early period: 1980-1996 ---
early_dems = _state_votes(dems_repubs_1980_1996, "democrat")
early_repubs = _state_votes(dems_repubs_1980_1996, "republican")
early_merge = pd.merge(early_dems, early_repubs, how="outer", on="state").rename(
    columns={"candidatevotes_x": "democrat votes 1980-1996",
             "candidatevotes_y": "republican votes 1980-1996"})
early_d = early_merge["democrat votes 1980-1996"]
early_r = early_merge["republican votes 1980-1996"]
early_merge["Difference%ofTotal"] = (early_d - early_r)/(early_d + early_r)

# --- late period: 2000-2016 ---
late_dems = _state_votes(dems_repubs_2000_2016, "democrat")
late_repubs = _state_votes(dems_repubs_2000_2016, "republican")
late_merge = pd.merge(late_dems, late_repubs, how="outer", on="state").rename(
    columns={"candidatevotes_x": "democrat votes 2000-2016",
             "candidatevotes_y": "republican votes 2000-2016"})
late_d = late_merge["democrat votes 2000-2016"]
late_r = late_merge["republican votes 2000-2016"]
late_merge["Difference%ofTotal_late"] = (late_d - late_r)/(late_d + late_r)

# --- shift between the two periods ---
# Negative change = move towards republican, positive = move towards democrat.
early_late_merge = pd.merge(early_merge, late_merge, how="outer", on="state")
early_late_merge["change"] = early_late_merge["Difference%ofTotal_late"] - early_late_merge["Difference%ofTotal"]
early_late_merge = early_late_merge.sort_values("change", ascending=False)


In [None]:
plots = list(early_late_merge["change"])
x_axis = list(early_late_merge.index)

# Colour each bar by the direction of the shift instead of a hand-counted
# list (previously 30 "blue" followed by 21 "red"): a positive change is a
# move towards the democrats -> blue, anything else -> red.
colors = ["blue" if shift > 0 else "red" for shift in plots]

plt.figure(figsize=(40,20))
plt.bar(x_axis, plots, color=colors, alpha=0.5, align="center")
plt.xlabel("States", fontsize=33)
plt.ylabel("shift in favored party - positive is shift towards dem, negative shift towards repubs", fontsize=29)
plt.title("Shift in party preference by states", fontsize=33)
plt.xticks(rotation=90)
plt.tick_params(labelsize=33)
plt.savefig("Adam_Output/VotePreferenceShift.png")
plt.show()


In [None]:
# Label the direction of each state's shift and record its magnitude.
# A change of exactly zero (or a missing change) gets no "shift" label.
change = early_late_merge["change"]
early_late_merge.loc[change > 0, "shift"] = "Democrat"
early_late_merge.loc[change < 0, "shift"] = "Republican"
early_late_merge["absolute value"] = change.abs()


In [None]:
# Load the centroid file, drop Puerto Rico and rename the key column so it
# matches the "state" key used by the voting frames.
state_df = (
    pd.read_csv("Resources/state_centroids.csv")
    .loc[lambda rows: rows.State != "Puerto Rico"]
    .rename(columns={"State": "state"})
)

# Outer merge keeps states that appear in only one of the two frames.
early_late_merge_latlong = pd.merge(early_late_merge, state_df, on="state", how="outer")

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
fig = plt.figure(figsize=(18, 10))
ax1 = fig.add_subplot(121, projection="3d")
x = list(early_late_merge_latlong["Longitude"])
y = list(early_late_merge_latlong["Latitude"])
top = list(early_late_merge_latlong["absolute value"])

# Bar height is the size of the shift; the colour carries its direction.
# Derive the colour from the signed "change" of the same row rather than from
# a hand-counted list (previously 30 "blue" followed by 21 "red"), so colours
# stay attached to the right state whatever the row order or row count is.
colors = ["blue" if shift > 0 else "red"
          for shift in early_late_merge_latlong["change"]]

bottom = 0
width = depth = 1
ax1.bar3d(x, y, bottom, width, depth, top, shade=True, color=colors)
ax1.set_title('Shift in Political Party Preference by State')
ax1.set_xlabel("Longitude")
ax1.set_ylabel("Latitude")
ax1.set_zlabel("Absolute Value Change in Democrat - Republican / total votes")
plt.savefig("Adam_Output/VotePreferenceShift3D.png")

In [None]:
# Democrat and republican candidate votes per (state, year), side by side.
state_year = ["state", "year"]

dems_repubs_dem = (
    dems_repubs.loc[dems_repubs["party"] == "democrat"]
    .groupby(state_year)["candidatevotes"]
    .sum()
    .to_frame()
)
dems_repubs_repubs = (
    dems_repubs.loc[dems_repubs["party"] == "republican"]
    .groupby(state_year)["candidatevotes"]
    .sum()
    .to_frame()
)

# Merge suffixes: _x = democrat, _y = republican.
dems_repubs_merge = pd.merge(
    dems_repubs_dem, dems_repubs_repubs, on=state_year, how="outer"
).rename(columns={"candidatevotes_x": "democrat votes",
                  "candidatevotes_y": "republican votes"})

# Democrat share of the two-party vote.
two_party_votes = dems_repubs_merge["democrat votes"] + dems_repubs_merge["republican votes"]
dems_repubs_merge["democrat votes % of total"] = dems_repubs_merge["democrat votes"] / two_party_votes




In [None]:
#early_late_merge_latlong.head()


In [None]:
# Midpoint for the plotly colour scale: 1 - max/(max - min) of "change",
# i.e. the fraction of the [min, max] range at which a change of zero falls.
change_values = early_late_merge_latlong['change']
max_change = change_values.max()
range_change = change_values.max() - change_values.min()
max_div_range = max_change / range_change
midpoint = 1 - max_div_range

In [None]:
import plotly.express as px

# State codes and their vote-shift values, in matching row order.
locations = list(early_late_merge_latlong["Code"])
colors = list(early_late_merge_latlong["change"])

# Diverging scale: red at the low end, white at `midpoint`, blue at the top.
shift_colorscale = [(0, "red"), (midpoint, "white"), (1,"blue")]

fig = px.choropleth(
    locations=locations,
    locationmode="USA-states",
    color=colors,
    color_continuous_scale=shift_colorscale,
    scope="usa",
)
fig.update_layout(
    title="United States - change in voter preference over time",
    font={"family": "Courier New, monospace", "size": 18, "color": "#7f7f7f"},
)
fig.show()

In [None]:
# Reshape the wide unemployment table (one column per date) into long form:
# one row per (Area, Date), sorted, with a fresh 0..n-1 index and integer dates.
unemployment_df = (
    pd.read_csv("Resources/unemployment_data.csv")
    .melt(id_vars="Area", var_name="Date", value_name="Unemployment Rate")
    .sort_values(["Area", "Date"])
    .reset_index(drop=True)
)
unemployment_df["Date"] = unemployment_df["Date"].astype(int)


In [None]:
# Average unemployment rate per area for the early and late periods.
#
# The early window is bounded to 1980-1996. The original had no lower bound
# (`Date <= 1996`), so any pre-1980 years in the file were averaged in, even
# though the voting comparison starts at 1980 and the chart that uses this
# data is titled "1980-1996 vs 1997-2016".
early_years = unemployment_df["Date"].between(1980, 1996)
early_unemployment_avg = unemployment_df.loc[early_years]
early_unemployment_avg = early_unemployment_avg.groupby("Area")
early_unemployment_avg = pd.DataFrame(early_unemployment_avg["Unemployment Rate"].mean())
early_unemployment_avg = early_unemployment_avg.rename(columns={"Unemployment Rate": "Early Unemployment Rate Avg"})
#early_unemployment_avg

late_years = unemployment_df["Date"].between(1997, 2016)
late_unemployment_avg = unemployment_df.loc[late_years]
late_unemployment_avg = late_unemployment_avg.groupby("Area")
late_unemployment_avg = pd.DataFrame(late_unemployment_avg["Unemployment Rate"].mean())
late_unemployment_avg = late_unemployment_avg.rename(columns={"Unemployment Rate": "Late Unemployment Rate Avg"})
#late_unemployment_avg

# Outer merge keeps areas that have data in only one of the two periods.
early_late_unemploy_merge = pd.merge(early_unemployment_avg, late_unemployment_avg, on="Area", how="outer")
#early_late_unemploy_merge

In [None]:
# Turn the "Area" index into a "state" column so it can be joined to the
# voting data.
early_late_unemploy_merge = (
    early_late_unemploy_merge
    .reset_index()
    .rename(columns={"Area": "state"})
)

# Join voting shifts with unemployment averages, compute the change in average
# unemployment (late minus early), and keep only fully populated rows.
unemployment_voting_merge = pd.merge(
    early_late_merge_latlong, early_late_unemploy_merge, on="state", how="outer"
)
unemployment_voting_merge["Unemployment Avg Change"] = (
    unemployment_voting_merge["Late Unemployment Rate Avg"]
    - unemployment_voting_merge["Early Unemployment Rate Avg"]
)
unemployment_voting_merge = unemployment_voting_merge.dropna()


In [None]:
# Side-by-side line charts per state: vote shift (left) and change in average
# unemployment rate (right).
voteplots = list(unemployment_voting_merge["change"])
unemploymentplots = list(unemployment_voting_merge["Unemployment Avg Change"])
labels = list(unemployment_voting_merge["state"])

plt.figure(figsize=(20,5))

# (subplot position, values, extra line options, title, y-axis label)
panels = [
    (121, voteplots, {},
     "Shift in Voter Preference",
     "Positive: democrat shift, Negative: republican shift"),
    (122, unemploymentplots, {"color": "red"},
     "Shift in Avg Unemployment Rate (1980-1996 vs 1997-2016)",
     "Change in avg unemployment rate (%)"),
]
for position, values, line_options, panel_title, y_label in panels:
    plt.subplot(position)
    plt.plot(labels, values, **line_options)
    plt.title(panel_title)
    plt.xlabel("State")
    plt.ylabel(y_label)
    plt.xticks(rotation=90)

plt.savefig("Adam_Output/voteshift_unemployment_linecharts.png")
plt.show()


In [None]:
#Scatter Plot
from scipy.stats import linregress

def lin_reg(x,y):
    """Scatter ``y`` against ``x`` with a fitted least-squares line.

    ``x`` and ``y`` are named pandas Series; their names are used for the
    axis labels and the title. The figure is saved to
    ``Adam_Output/scatter_regression``, shown, and the r value is printed.
    Returns nothing.
    """
    fit = linregress(x, y)
    fitted_line = x * fit.slope + fit.intercept
    equation_text = "y = " + str(round(fit.slope,2)) + "x + " + str(round(fit.intercept,2))

    plt.scatter(x, y)
    plt.plot(x, fitted_line, "r-")
    plt.xlabel(x.name)
    plt.ylabel(y.name)
    plt.title(f'{x.name} vs {y.name}')
    # Anchor the equation at the lower-left corner of the data.
    lower_left = (min(x), min(y))
    plt.annotate(equation_text, lower_left, fontsize=15, color="red")
    plt.savefig("Adam_Output/scatter_regression")
    plt.show()
    print(f'The r value is: {fit.rvalue}')
    
# Pair each state's vote shift with its change in average unemployment and
# draw the regression scatter plot.
random_df = pd.DataFrame({
    "Shift in voter preference": voteplots,
    "Change in Avg Unemployment Rate": unemploymentplots,
})
x = random_df["Shift in voter preference"]
y = random_df["Change in Avg Unemployment Rate"]

lin_reg(x, y)


In [None]:
# Test for a linear relationship between vote shift and unemployment change.
# Null hypothesis: there is no linear relationship (correlation = 0) between
# the shift in voting preference and the change in average unemployment rate.
#
# An independent-samples t-test (used previously) compares the MEANS of the
# two columns, which are measured in different units; it says nothing about
# whether they move together. The Pearson correlation and its p-value test
# the hypothesis stated above.
import scipy.stats as stats
stats.pearsonr(random_df["Shift in voter preference"], random_df["Change in Avg Unemployment Rate"])

In [None]:
#Random Code - Adam

In [None]:
# Minnesota's candidate and total votes per party.
dems_repubs_minnesota = dems_repubs.loc[dems_repubs["state"] == "Minnesota"]
# Select the two numeric columns BEFORE summing. Summing the whole frame also
# tries to add up the text columns, which recent pandas versions either
# concatenate or reject with a TypeError.
dems_repubs_minnesota = dems_repubs_minnesota.groupby("party")[["candidatevotes","totalvotes"]].sum()
#dems_repubs_minnesota

In [None]:
#list(df["party"].unique())

In [None]:
# Side exploration: keep only the rows flagged as write-in candidates.
is_write_in = df["writein"].eq(True)
just_write_in = df.loc[is_write_in]

In [None]:
# Write-in votes per state, then the same totals ordered largest first.
write_in_by_state = (
    just_write_in.groupby("state")["candidatevotes"]
    .sum()
    .to_frame()
)
sorted_write_in_by_state = write_in_by_state["candidatevotes"].sort_values(ascending=False)

In [None]:
# Each candidate's votes as a share of the total votes cast in the contests
# they appear in, largest share first.
candidate = df.groupby("candidate")
# Sum only the two columns the ratio needs. Summing the whole frame also tries
# to add up the text columns, which recent pandas versions either concatenate
# or reject with a TypeError.
temp = candidate[["candidatevotes", "totalvotes"]].sum()
temp["% of totalvotes"] = temp["candidatevotes"]/temp["totalvotes"]
temp = pd.DataFrame(temp["% of totalvotes"].sort_values(ascending=False))
#temp.head(25)

In [None]:
# 2016 only: total votes per candidate, largest first.
year_2016 = df.loc[df["year"].eq(2016)]
by_candidate_2016 = year_2016.groupby("candidate")
total_votes = (
    by_candidate_2016["candidatevotes"]
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# 2012 only: total votes per candidate, largest first.
year_2012 = df.loc[df["year"].eq(2012)]
by_candidate_2012 = year_2012.groupby("candidate")
total_votes_2012 = (
    by_candidate_2012["candidatevotes"]
    .sum()
    .sort_values(ascending=False)
)

In [None]:
#Adam code ends

# Ryan code

In [None]:
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the presidential results that vote_data() reads from `vote_df`.
csvfile = 'Resources/1976-2016-president.tab'
vote_df = (
    pd.read_csv(csvfile, sep='\t', header=0)
    # Count 'democratic-farmer-labor' as 'democrat' (fixes the party label of
    # the 2012 Minnesota democrat votes).
    .replace('democratic-farmer-labor', 'democrat')
)

In [None]:
# Combined census + vote datasets for the two election years used for training.
data_2016 = get_dataset(2016)
data_2012 = get_dataset(2012)
# Census data only for 2018 (no votes), used later as the prediction input.
census2018 = census_data(2018)

In [None]:
# Stack the 2012 and 2016 datasets row-wise; the State index becomes a regular
# column and the rows are renumbered from 0.
yearly_frames = [data_2012.reset_index(), data_2016.reset_index()]
data_2012_2016 = pd.concat(yearly_frames, axis=0, ignore_index=True)

In [None]:
# Feature matrix (census and location columns) and target (winning party)
# for the combined 2012 + 2016 rows.
feature_columns = [
    'Total_population', 'Income_median', 'Income_per_capita', 'Unemployment_rate',
    'Poverty_rate', 'Latitude', 'Longitude', 'median_male_age', 'median_female_age',
    'population_white_alone', 'population_black_alone', 'population_asian_alone',
    'population_hispanic_origin', 'population_american_indian_alone',
    'median_house_construction_year',
]
X = data_2012_2016[feature_columns]
y = data_2012_2016['winner']
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 5)

## Model training

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples and random feature
# subsets). Fixing random_state makes the fitted model, and every prediction
# derived from it, reproducible across kernel restarts.
rf_model_master = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_master.fit(X, y)

In [None]:
# Confusion matrix of the model's predictions against the actual 2012 winners.
# NOTE(review): data_2012 is part of the data the model was fitted on, so this
# is an in-sample check rather than a held-out evaluation.
feature_columns = [
    'Total_population', 'Income_median', 'Income_per_capita', 'Unemployment_rate',
    'Poverty_rate', 'Latitude', 'Longitude', 'median_male_age', 'median_female_age',
    'population_white_alone', 'population_black_alone', 'population_asian_alone',
    'population_hispanic_origin', 'population_american_indian_alone',
    'median_house_construction_year',
]
X_test_2012 = data_2012[feature_columns]
y_test_2012 = data_2012['winner']
y_predict_2012 = rf_model_master.predict(X_test_2012)
cm_2012 = confusion_matrix(y_test_2012, y_predict_2012)
sns.heatmap(cm_2012, annot=True)

In [None]:
# Confusion matrix of the model's predictions against the actual 2016 winners.
# NOTE(review): data_2016 is part of the data the model was fitted on, so this
# is an in-sample check rather than a held-out evaluation.
feature_columns = [
    'Total_population', 'Income_median', 'Income_per_capita', 'Unemployment_rate',
    'Poverty_rate', 'Latitude', 'Longitude', 'median_male_age', 'median_female_age',
    'population_white_alone', 'population_black_alone', 'population_asian_alone',
    'population_hispanic_origin', 'population_american_indian_alone',
    'median_house_construction_year',
]
X_test_2016 = data_2016[feature_columns]
y_test_2016 = data_2016['winner']
y_predict_2016 = rf_model_master.predict(X_test_2016)
cm_2016 = confusion_matrix(y_test_2016, y_predict_2016)
sns.heatmap(cm_2016, annot=True)

### Predicting 2020 election using 2018 census data

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib is now imported as its own package. The fallback keeps the
# cell working on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump(rf_model_master, 'ryan_results/rf_model_master.sav')

# This REPLACES the model trained above with the one saved on disk.
# NOTE(review): joblib files are pickles, and loading one executes code stored
# in the file -- only load model files you created or trust.
rf_model_master = joblib.load('ryan_results/rf_model_master.sav')

In [None]:
# Predict a winner per state from the 2018 census features.
# Input is census2018, output is vote_2018 (one predicted label per row).
feature_columns = [
    'Total_population', 'Income_median', 'Income_per_capita', 'Unemployment_rate',
    'Poverty_rate', 'Latitude', 'Longitude', 'median_male_age', 'median_female_age',
    'population_white_alone', 'population_black_alone', 'population_asian_alone',
    'population_hispanic_origin', 'population_american_indian_alone',
    'median_house_construction_year',
]
vote_2018 = rf_model_master.predict(census2018[feature_columns])

In [None]:
# Attach the predicted winner to each state and tally electoral votes by party.
winner_2018 = pd.DataFrame(index=census2018.index, data = vote_2018, columns=['winner'])
electoral_votes = pd.read_csv('Resources/Electoral_votes_2020.csv')
prediction_2020 = winner_2018.merge(electoral_votes, on='State')
party_counts = prediction_2020.winner.value_counts()
# numeric_only=True restricts the sum to numeric columns. Without it, recent
# pandas versions also "sum" text columns such as State (concatenating the
# names) instead of silently dropping them as older versions did.
final_votes = prediction_2020.groupby('winner').sum(numeric_only=True)
# Number of states predicted for each party.
final_votes['party counts'] = party_counts
final_votes

#### Comparing the predicted 2020 with 2012&2016

In [None]:
# Put the predicted 2020 winner next to the actual 2016 and 2012 winners.
compare = (
    prediction_2020
    # The first merge yields winner_x (prediction) and winner_y (2016 result).
    .merge(data_2016.winner, left_on='State', right_index=True)
    .rename(columns={'winner_x': 2020, 'winner_y': 2016})
    # The 2012 result arrives as plain 'winner'; it is renamed below.
    .merge(data_2012.winner, left_on='State', right_index=True)
    .merge(data_2016.Code, left_on='State', right_index=True)
    .rename(columns={'winner': 2012})
    .set_index('State')
)
compare = compare[['Code', 2020, 2016, 2012, 'Number of Electoral Votes']]
compare.to_csv('ryan_results/compare.csv')

In [None]:
# Display the per-state comparison of the 2020 prediction with the 2016 and 2012 winners.
compare

In [None]:
# Signed electoral weight per state for the map: negative for a predicted
# Republican win, positive for a predicted Democrat win.
geomap = compare.copy()
electoral_col = 'Number of Electoral Votes'
republican_rows = geomap[2020] == 'Republican'
democrat_rows = geomap[2020] == 'Democrat'
geomap.loc[republican_rows, 'Weights'] = geomap.loc[republican_rows, electoral_col] * (-1)
geomap.loc[democrat_rows, 'Weights'] = geomap.loc[democrat_rows, electoral_col]
geomap[['Code', 2020, electoral_col]].head()

In [None]:
import plotly.express as px

# Place white where a weight of zero falls on the [min, max] colour range, so
# red always means Republican and blue always means Democrat. The original
# hard-coded 0.408, which is only right for one particular min/max pair and
# silently goes stale whenever the predictions change.
weight_min = geomap.Weights.min()
weight_max = geomap.Weights.max()
if weight_min < 0 < weight_max:
    zero_position = -weight_min / (weight_max - weight_min)
else:
    # All weights on one side of zero (or missing): keep the previous constant.
    zero_position = 0.408

px.choropleth(locations=geomap.Code, color=geomap.Weights,
              color_continuous_scale=[(0, 'red'), (zero_position, 'white'), (1, 'blue')],
              locationmode="USA-states", scope='usa')

# Ryan code ends

In [None]:
#Connor code

In [None]:
import scipy.stats as st
from scipy.stats import linregress


In [None]:

# Load the 2012 and 2016 population files.
population_files = {
    2012: 'Resources/2012_population_data.csv',
    2016: 'Resources/2016_population_data.csv',
}
census_2012_df = pd.read_csv(population_files[2012])
census_2016_df = pd.read_csv(population_files[2016])

In [None]:
# Preview the first rows of the 2012 population data.
census_2012_df.head()

In [None]:
# 2012: size and share of the 18-29 population per state, plus the two-party
# vote split.
state_population_2012 = census_2012_df["Total State Population"]
democrat_votes_2012 = census_2012_df["Democrat votes 2012"]
republican_votes_2012 = census_2012_df["Republican votes 2012"]

total_young_males = (census_2012_df["Males 18 and 19 years"]
                     + census_2012_df["Males 20 to 24"]
                     + census_2012_df["Males 25 to 29 years"])
total_young_females = (census_2012_df["Females 18 and 19 years"]
                       + census_2012_df["Females 20 to 24"]
                       + census_2012_df["Females 25 to 29 years"])
percent_males = total_young_males/state_population_2012
percent_females = total_young_females/state_population_2012
total_votes = democrat_votes_2012 + republican_votes_2012

new_2012_df = pd.DataFrame({
    "State": census_2012_df["State"],
    "Total Males 18 to 29": total_young_males,
    "Total Females 18 to 29": total_young_females,
    "Total People 18 to 29": total_young_males+total_young_females,
    "Percent Males": percent_males*100,
    "Percent Females": percent_females*100,
    "Percent 18 to 29": (percent_males + percent_females)*100,
    "Total Votes": total_votes,
    "Percent Democrat": (democrat_votes_2012/total_votes)*100,
    "Percent Republican": (republican_votes_2012/total_votes)*100,
})
new_2012_df.head()

In [None]:
# Share of 18-29 year olds per state in 2012.
plt.bar(new_2012_df["State"],new_2012_df["Percent 18 to 29"])
plt.title("2012 18 to 29 y/o by State")
plt.xlabel("State")
plt.ylabel("Percent of 18 to 29 y/o")
# Zoom the y-axis to one point either side of the data so the differences
# between states are visible.
plt.ylim(new_2012_df["Percent 18 to 29"].min()-1,new_2012_df["Percent 18 to 29"].max()+1)
plt.xticks(rotation=90)
# The original wrote `plt.tight_layout` without parentheses, which only
# referenced the function and never applied the layout.
plt.tight_layout()

In [None]:
# Least-squares fits of party vote share against the share of 18-29 year olds.
# slope/intercept/rvalue/pvalue/stderr end up holding the Republican fit.
young_share_2012 = new_2012_df["Percent 18 to 29"]

# Democrat share vs. young share
slope, intercept, rvalue, pvalue, stderr = linregress(young_share_2012, new_2012_df["Percent Democrat"])
regress_values_d_2012 = young_share_2012 * slope + intercept
line_eq_d_2012 = "y = {}x + {}".format(str(round(slope,2)), str(round(intercept,2)))

# Republican share vs. young share
slope, intercept, rvalue, pvalue, stderr = linregress(young_share_2012, new_2012_df["Percent Republican"])
regress_values_r_2012 = young_share_2012 * slope + intercept
line_eq_r_2012 = "y = {}x + {}".format(str(round(slope,2)), str(round(intercept,2)))

In [None]:
# Party vote share against the share of 18-29 year olds, with regression lines.
plt.scatter(new_2012_df["Percent 18 to 29"],new_2012_df["Percent Democrat"],marker='o', c='blue', label="Democrat")
plt.scatter(new_2012_df["Percent 18 to 29"],new_2012_df["Percent Republican"],marker='o', c='red', label="Republican")
plt.xlabel("Percentage of 18 to 29")
plt.ylabel("Percentage of Votes")
plt.title("Voter type vs 18 to 29 y/o")
correlation_2012 = st.pearsonr(new_2012_df["Percent 18 to 29"],new_2012_df["Percent Democrat"])
# Each regression line uses the same colour as its party's points. The
# original drew the Democrat line in red and the Republican line in blue
# (and passed both "r-" and c="blue" for the latter), the opposite of the
# scatter colours.
plt.plot(new_2012_df["Percent 18 to 29"],regress_values_d_2012,"-",color="blue")
plt.plot(new_2012_df["Percent 18 to 29"],regress_values_r_2012,"-",color="red")
plt.legend()
#plt.annotate(line_eq_d_2012,(14,24),fontsize=15,color="red")
# NOTE(review): this t-test compares the MEAN of the 18-29 share with the mean
# Democrat share; it does not test whether the two are related. The p-value
# of the relationship is correlation_2012[1].
ttest_2012 = st.ttest_ind(new_2012_df["Percent 18 to 29"],new_2012_df["Percent Democrat"],equal_var=False)
print(f"The correlation between both factors is {round(correlation_2012[0],2)}")
print(f'The regression line for Democrats is {line_eq_d_2012}.')
print(f'The regression line for Republicans is {line_eq_r_2012}.')
ttest_2012

In [None]:
# Preview the first rows of the 2016 census frame.
census_2016_df.head()

In [None]:
# Count of 18-29 year olds per state in 2016, split by sex.
total_young_males_2016 = census_2016_df["Males 18 and 19 years"] + census_2016_df["Males 20 to 24"] + census_2016_df["Males 25 to 29 years"]
total_young_females_2016 = census_2016_df["Females 18 and 19 years"] + census_2016_df["Females 20 to 24"] + census_2016_df["Females 25 to 29 years"]

# Fractions of the total state population (cast to float before dividing).
percent_males_2016 = total_young_males_2016/census_2016_df["Total State Population"].astype('float')
percent_females_2016 = total_young_females_2016/census_2016_df["Total State Population"].astype('float')

# NOTE(review): the vote figures below are still read from census_2012_df's
# "... votes 2012" columns, so this 2016 frame is paired with 2012 results.
# TODO: switch to the 2016 vote columns once their names are confirmed.
total_votes = census_2012_df["Democrat votes 2012"] + census_2012_df["Republican votes 2012"]

new_2016_df = pd.DataFrame({"State": census_2016_df["State"],
                            # Use the 2016 totals; the original reused the
                            # 2012 variables total_young_males/females here.
                            "Total Males 18 to 29": total_young_males_2016,
                            "Total Females 18 to 29": total_young_females_2016,
                            "Total People 18 to 29": total_young_males_2016 + total_young_females_2016,
                            "Percent Males": percent_males_2016*100,
                            "Percent Females": percent_females_2016*100,
                            "Percent 18 to 29": (percent_males_2016 + percent_females_2016)*100,
                            "Total Votes": total_votes,
                            "Percent Democrat": (census_2012_df["Democrat votes 2012"]/total_votes)*100,
                            "Percent Republican": (census_2012_df["Republican votes 2012"]/total_votes)*100
})
new_2016_df.head()

In [None]:
# Bar chart of the "Percent 18 to 29" column for every state in the 2016 frame.
plt.bar(new_2016_df["State"], new_2016_df["Percent 18 to 29"])
plt.title("2016 18 to 29 y/o by State")
plt.xlabel("State")
plt.ylabel("Percent of 18 to 29 y/o")
# Zoom the y-axis to one point either side of the observed range so the
# differences between states are visible.
plt.ylim(new_2016_df["Percent 18 to 29"].min() - 1,
         new_2016_df["Percent 18 to 29"].max() + 1)
# State names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
# tight_layout must be *called*; the bare attribute reference was a no-op
# that only echoed the function object as the cell output.
plt.tight_layout()

In [None]:
# Least-squares fit of the "Percent Democrat" column against the 18-29 share (2016 frame).
slope, intercept, rvalue, pvalue, stderr = linregress(
    new_2016_df["Percent 18 to 29"],
    new_2016_df["Percent Democrat"],
)
regress_values_d_2016 = new_2016_df["Percent 18 to 29"].mul(slope).add(intercept)
line_eq_d_2016 = "y = %sx + %s" % (round(slope, 2), round(intercept, 2))

# Same fit for the "Percent Republican" column. The slope/intercept/rvalue/
# pvalue/stderr names are reused, so after this cell they hold these results.
slope, intercept, rvalue, pvalue, stderr = linregress(
    new_2016_df["Percent 18 to 29"],
    new_2016_df["Percent Republican"],
)
regress_values_r_2016 = new_2016_df["Percent 18 to 29"].mul(slope).add(intercept)
line_eq_r_2016 = "y = %sx + %s" % (round(slope, 2), round(intercept, 2))

In [None]:
# Scatter "Percent Democrat" (blue) and "Percent Republican" (red) against
# the "Percent 18 to 29" column of the 2016 frame.
plt.scatter(new_2016_df["Percent 18 to 29"], new_2016_df["Percent Democrat"], marker='o', c='blue')
plt.scatter(new_2016_df["Percent 18 to 29"], new_2016_df["Percent Republican"], marker='o', c='red')
plt.xlabel("Percentage of 18 to 29")
plt.ylabel("Percentage of Votes")
plt.title("Voter type vs 18 to 29 y/o")

# Pearson correlation between the 18-29 share and the "Percent Democrat" column.
correlation_2016 = st.pearsonr(new_2016_df["Percent 18 to 29"], new_2016_df["Percent Democrat"])

# Draw each regression line in the same colour as its party's points.
# Previously the Democrat line was red and the Republican line blue (the
# "r-" format string was overridden by c="blue"), i.e. swapped relative to
# the scatter markers.
plt.plot(new_2016_df["Percent 18 to 29"], regress_values_d_2016, "-", c="blue")
plt.plot(new_2016_df["Percent 18 to 29"], regress_values_r_2016, "-", c="red")

# Welch's t-test (equal_var=False) between the two percentage columns.
ttest_2016 = st.ttest_ind(new_2016_df["Percent 18 to 29"], new_2016_df["Percent Democrat"], equal_var=False)
print(f"The correlation between both factors is {round(correlation_2016[0],2)}")
print(f'The regression line for Democrats is {line_eq_d_2016}.')
print(f'The regression line for Republicans is {line_eq_r_2016}.')
ttest_2016

In [None]:
#Connor code ends