In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from config import api_key
from census import Census
from us import states

## Defining three functions
* `census_data(year)`: returns the census data for that year.
* `vote_data(year)`: returns the vote data for that year.
* `get_dataset(year)`: merges the census and vote data for that year and adds the vote difference and winner columns.
* Example usage: `data_2016 = get_dataset(2016)`

In [None]:
# census_data() function, returning the census data of a given year.
def census_data(YEAR, centroids_path='Resources/state_centroids.csv'):
    """Return state-level ACS5 census figures for ``YEAR``, indexed by state name.

    Parameters
    ----------
    YEAR : int
        Year handed to ``Census(api_key, year=YEAR)``.
    centroids_path : str, optional
        CSV file merged onto the census table through its ``State`` column.
        Defaults to the path this notebook has always read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``State``. Contains the selected census columns, the derived
        ``Poverty_rate`` and ``Unemployment_rate`` columns, and every other
        column of the centroid CSV. States missing from either side are dropped
        because the merge is an inner join.
    """
    # Census variable code -> readable column name. This mapping is the single
    # source of truth: the API request below is built from its keys, so the
    # requested variables and the rename can never drift apart. The key order
    # reproduces the order of the original request.
    variable_names = {
        'NAME': 'State',
        'B19013_001E': 'Income_median',
        'B19301_001E': 'Income_per_capita',
        'B23025_003E': 'Employable_civilians',
        'B23025_004E': 'Employed_civilians',
        'B23025_005E': 'Unemployed_civilians',
        'B01003_001E': 'Total_population',
        'B01002_001E': 'median age',
        'B17001_002E': 'Poverty_count',
        'B01002_002E': 'median_male_age',
        'B01002_003E': 'median_female_age',
        'B02001_002E': 'population_white_alone',
        'B02001_003E': 'population_black_alone',
        'B02001_005E': 'population_asian_alone',
        'B03001_003E': 'population_hispanic_origin',
        'B02001_004E': 'population_american_indian_alone',
        'B25035_001E': 'median_house_construction_year',
    }
    # The API adds a 'state' field of its own for the 'state:*' geography.
    column_names = {'state': 'State#'}
    column_names.update(variable_names)

    c = Census(api_key, year=YEAR)
    raw_data = c.acs5.get(tuple(variable_names), {'for': 'state:*'})
    census_df_raw = pd.DataFrame(raw_data).rename(columns=column_names)
    census_df_raw = census_df_raw.sort_values('State#').reset_index(drop=True)

    # reformatting census data
    kept_columns = ['State', 'Total_population', 'Income_median', 'Income_per_capita',
                    'median_male_age', 'median_female_age',
                    'population_white_alone', 'population_black_alone',
                    'population_asian_alone', 'population_hispanic_origin',
                    'population_american_indian_alone', 'median_house_construction_year']
    # Explicit copy so the derived columns are added to an independent frame.
    census_df = census_df_raw.loc[:, kept_columns].copy()
    census_df['Poverty_rate'] = census_df_raw.Poverty_count/census_df_raw.Total_population
    census_df['Unemployment_rate'] = census_df_raw.Unemployed_civilians/census_df_raw.Employable_civilians

    state_df = pd.read_csv(centroids_path)
    census_df = pd.merge(census_df, state_df, on='State')
    return census_df.set_index('State')

In [None]:
def vote_data(YEAR, votes=None):
    """Return per-state Democrat, Republican and total vote counts for ``YEAR``.

    Parameters
    ----------
    YEAR : int
        Election year to select (compared against the ``year`` column).
    votes : pandas.DataFrame, optional
        Table with ``year``, ``state``, ``party``, ``candidatevotes`` and
        ``totalvotes`` columns. When omitted, the notebook-level ``vote_df``
        is used, so existing ``vote_data(YEAR)`` calls behave as before.
        Passing the table explicitly removes the dependency on a global that
        is only defined further down the notebook.

    Returns
    -------
    pandas.DataFrame
        Indexed by state, with columns ``<YEAR>_democrat_votes``,
        ``<YEAR>_republican_votes`` and ``<YEAR>_total_votes``.
    """
    if votes is None:
        votes = vote_df

    # Filter to the requested year once instead of once per aggregate.
    in_year = votes.loc[votes.year == YEAR]

    def _party_votes(party):
        # Sum every row of the party within a state.
        rows = in_year.loc[in_year.party == party, ['state', 'candidatevotes']]
        return rows.groupby('state')['candidatevotes'].sum()

    demo = _party_votes('democrat')
    rep = _party_votes('republican')
    # totalvotes is averaged (not summed) over a state's rows.
    total = in_year.loc[:, ['state', 'totalvotes']].groupby('state')['totalvotes'].mean()

    vote_df_year = pd.concat([demo, rep, total], axis=1)
    vote_df_year.columns = [str(YEAR)+'_democrat_votes', str(YEAR)+'_republican_votes', str(YEAR)+'_total_votes']
    return vote_df_year

In [None]:
# Merging census and vote data for a given year. The function takes one argument: the year.
def get_dataset(YEAR):
    """Join census_data(YEAR) with vote_data(YEAR) and label each state's winner.

    The two frames are merged on their indexes. A 'votes difference' column
    (Democrat minus Republican votes) is added, and 'winner' is set to
    'Republican' where that difference is negative and 'Democrat' where it is
    positive. Rows matching neither condition get no 'winner' value.
    """
    democrat_column = str(YEAR)+'_democrat_votes'
    republican_column = str(YEAR)+'_republican_votes'

    census_part = census_data(YEAR)
    votes_part = vote_data(YEAR)
    dataset = census_part.merge(votes_part, left_index=True, right_index=True)

    dataset['votes difference'] = dataset[democrat_column] - dataset[republican_column]
    margin = dataset['votes difference']
    dataset.loc[margin < 0, 'winner'] = 'Republican'
    dataset.loc[margin > 0, 'winner'] = 'Democrat'
    return dataset

# TK Code

In [None]:
# Placeholder output: three fixed messages, one per line.
print("I made changes to the file", "Added new print stmt", "last stmt", sep="\n")

# TK code ends

# Adam code starts

In [None]:
# Read the tab-separated presidential returns file into `df`;
# the first line of the file supplies the column names.
file = "Resources/1976-2016-president.tab"
df = pd.read_csv(file, delimiter="\t", header=0)
# Preview the last 40 rows.
df.tail(n=40)

In [None]:
# Curiosity check: keep only the rows whose `writein` flag is True.
just_write_in = df[df["writein"].eq(True)]
just_write_in

In [None]:
# Total write-in votes per state, kept as a one-column frame ...
write_in_by_state = just_write_in.groupby("state")["candidatevotes"].sum().to_frame()
# ... and the same totals as a Series ordered from largest to smallest.
sorted_write_in_by_state = write_in_by_state["candidatevotes"].sort_values(ascending=False)
sorted_write_in_by_state

In [None]:
# Share of the vote per candidate: the candidate's summed votes divided by the
# summed `totalvotes` of every row in which that candidate appears.
candidate = df.groupby("candidate")
# Sum only the two columns used below. A bare `.sum()` would also try to add
# up the text columns (recent pandas no longer drops them silently), which is
# wasted work at best and an error at worst.
temp = candidate[["candidatevotes", "totalvotes"]].sum()
# Despite the column label, this value is a fraction (0-1), not a percentage.
temp["% of totalvotes"] = temp["candidatevotes"]/temp["totalvotes"]
temp = pd.DataFrame(temp["% of totalvotes"].sort_values(ascending=False))
temp.head(25)

In [None]:
# Rows of the 2016 election only.
year_2016 = df[df["year"] == 2016]

# Votes per candidate in 2016, largest first.
by_candidate_2016 = year_2016.groupby("candidate")
total_votes = by_candidate_2016["candidatevotes"].sum()
total_votes = total_votes.sort_values(ascending=False)
total_votes

In [None]:
# Rows of the 2012 election only.
year_2012 = df.loc[df["year"].eq(2012)]

# Votes per candidate in 2012, largest first.
by_candidate_2012 = year_2012.groupby("candidate")
total_votes_2012 = (
    by_candidate_2012["candidatevotes"]
    .sum()
    .sort_values(ascending=False)
)
total_votes_2012

# Adam code ends

# Ryan code

In [None]:
# Load the vote table that vote_data() reads.
csvfile = 'Resources/1976-2016-president.tab'
vote_df = (
    pd.read_csv(csvfile, sep='\t', header=0)
    # Fix the party label for 2012 Minnesota Democrat votes. The replacement
    # is applied to every cell of the frame that equals the old label.
    .replace('democratic-farmer-labor', 'democrat')
)

In [None]:
# Build the merged census + vote table for each election year (2016 first, then 2012).
data_2016, data_2012 = [get_dataset(year) for year in (2016, 2012)]

In [None]:
# Quick look at the first three rows of the 2016 table.
data_2016.head(n=3)

### Training data: 2016

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: the predictor columns of the 2016 table.
X = data_2016.loc[:, [
    'Total_population',
    'Income_median',
    'Income_per_capita',
    'Unemployment_rate',
    'Poverty_rate',
    'Latitude',
    'Longitude',
    'median_male_age',
    'median_female_age',
    'population_white_alone',
    'population_black_alone',
    'population_asian_alone',
    'population_hispanic_origin',
    'population_american_indian_alone',
    'median_house_construction_year',
]]
# Target: the winning-party label of each row.
y = data_2016.loc[:, 'winner']
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

* training decision_tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator so the fitted tree, and therefore the importances and the
# confusion matrices computed from it, are the same on every run. The seed
# matches the one used for the train/test split.
decision_tree = DecisionTreeClassifier(random_state=5)
decision_tree.fit(X_train,y_train)
# One row per feature, most important first.
feature_importances = pd.DataFrame(decision_tree.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# y_predict_train = decision_tree.predict(X_train)
# y_predict_train
# cm = confusion_matrix(y_train, y_predict_train)
# sns.heatmap(cm, annot=True)

In [None]:
# Predict the held-out rows with the decision tree and plot the confusion
# matrix (rows: true labels, columns: predicted labels).
y_predict_test = decision_tree.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_test)
sns.heatmap(data=cm, annot=True)

In [None]:
# print(classification_report(y_test, y_predict_test))

### Decision tree applied to the 2012 data
* Variables: X_2012, y_2012, y_predict_test_2012

In [None]:
# Same predictor columns as the 2016 feature matrix, taken from the 2012 table.
X_2012 = data_2012.loc[:, [
    'Total_population',
    'Income_median',
    'Income_per_capita',
    'Unemployment_rate',
    'Poverty_rate',
    'Latitude',
    'Longitude',
    'median_male_age',
    'median_female_age',
    'population_white_alone',
    'population_black_alone',
    'population_asian_alone',
    'population_hispanic_origin',
    'population_american_indian_alone',
    'median_house_construction_year',
]]
# Winning-party label of each 2012 row.
y_2012 = data_2012.loc[:, 'winner']

In [None]:
# Predict every 2012 row with the current `decision_tree` and plot the
# confusion matrix against the actual 2012 winners.
y_predict_test_2012 = decision_tree.predict(X_2012)
cm_2012 = confusion_matrix(y_true=y_2012, y_pred=y_predict_test_2012)
sns.heatmap(data=cm_2012, annot=True)

### RandomForest classifier
* rf_model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A random forest draws random samples and feature subsets while fitting, so
# seed it to make the model and its predictions repeatable across runs. The
# seed matches the one used for the train/test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=5)
rf_model.fit(X_train, y_train)

In [None]:
# y_predict_train = rf_model.predict(X_train)
# y_predict_train
# cm = confusion_matrix(y_train, y_predict_train)
# sns.heatmap(cm, annot=True)

In [None]:
# Predict the held-out rows with the random forest and plot the confusion
# matrix. This rebinds `y_predict_test` and `cm`.
y_predict_test = rf_model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_test)
sns.heatmap(data=cm, annot=True)

In [None]:
# Predict every 2012 row with the random forest and plot the confusion matrix
# against the actual 2012 winners. This rebinds `cm`.
y_predict_rf_2012 = rf_model.predict(X_2012)
cm = confusion_matrix(y_true=y_2012, y_pred=y_predict_rf_2012)
sns.heatmap(data=cm, annot=True)

In [None]:
# data_2012['prediction'] = y_predict_rf_2012
# data_2012[['winner','prediction']]

### master_df: 2012 and 2016 data combined (stored in `data_2012_2016`)

In [None]:
# Stack the 2012 rows on top of the 2016 rows. Each frame's index is first
# turned back into a regular column, and the result gets a fresh 0..n-1 index.
data_2012_2016 = pd.concat(
    [frame.reset_index() for frame in (data_2012, data_2016)],
    axis=0,
    ignore_index=True,
)
data_2012_2016

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: the predictor columns of the combined 2012 + 2016 table.
# This rebinds X, y and the train/test variables.
X = data_2012_2016.loc[:, [
    'Total_population',
    'Income_median',
    'Income_per_capita',
    'Unemployment_rate',
    'Poverty_rate',
    'Latitude',
    'Longitude',
    'median_male_age',
    'median_female_age',
    'population_white_alone',
    'population_black_alone',
    'population_asian_alone',
    'population_hispanic_origin',
    'population_american_indian_alone',
    'median_house_construction_year',
]]
# Target: the winning-party label of each row.
y = data_2012_2016.loc[:, 'winner']
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

* training decision_tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Refit the decision tree on the combined training rows. The estimator is
# seeded (same seed as the train/test split) so the fitted tree, its
# importances and the confusion matrix below are the same on every run.
decision_tree = DecisionTreeClassifier(random_state=5)
decision_tree.fit(X_train,y_train)
# One row per feature, most important first.
feature_importances = pd.DataFrame(decision_tree.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
# NOTE: only the last expression of a cell is rendered, so this bare name
# displays nothing here; move it to its own cell to see the table.
feature_importances

# Predict the held-out rows and plot the confusion matrix.
y_predict_test = decision_tree.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
sns.heatmap(cm, annot=True)

In [None]:
# Text summary of the metrics for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_predict_test))

* Training the random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the combined training rows. The estimator is seeded
# (same seed as the train/test split) so the model and the confusion matrix
# below are repeatable across runs.
rf_model_master = RandomForestClassifier(n_estimators=100, random_state=5)
rf_model_master.fit(X_train, y_train)
# Predict the held-out rows and plot the confusion matrix.
y_predict_test_master = rf_model_master.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test_master)
sns.heatmap(cm, annot=True)

# Ryan code ends

# Connor code starts

In [None]:
# Placeholder output: prints one fixed message.
print("I made a change too!")

# Connor code ends