# Group X, Names, project

## Structure


    Imports
    Load Data
    
    Pre-process/ EDA 
        - comment why steps are happening in code
        - still needs to be refitted to new data --> District EDA (new data)
        - commented, prepped to be alley-hooped to the website writeup
        - visualiations

    Final dataframe, displayed
        -sub dataframes established for Model evaluation comparison
  
    Walkthrough of models attempted and showing of results exploration
        -score reporting  (can use acc., f1, confusion_matrix, important variables indicated if possible)
        -visualization comparing various model results
        -pick best/most mature/informed version of model 
         for analysis (best performing model)

    Final model that performed the best, thorough score analysis
        -clear metrics/explanation for why this approach is interesting/fruitful
        -visualization to include in conclusion
        -perhaps comparison to trivial model (vote with incumbent)
        -build to set up write-up conclusion

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
import statsmodels.api as sm
from statsmodels.api import OLS

import tensorflow as tf


## Load Data

In [2]:
#Load Data


#States

states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE','DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

# House election Harvard Dataverse https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2
district_results = pd.read_csv('1976-2018-house.csv', header=0,encoding = 'unicode_escape')
district_results['candidatevotes'] = district_results['candidatevotes'].str.replace(',', '')
district_results['candidatevotes'] = district_results['candidatevotes'].astype(int)

# 2016 Election Results https://transition.fec.gov/general/FederalElections2016.shtml
pres_gen_results_2016 = pd.read_csv('2016 Pres General Results-Table 1.csv', header=0,encoding = 'unicode_escape')
sen_results_2016 = pd.read_csv('2016 US Senate Results by State-Table 1.csv', header=0,encoding = 'unicode_escape')
house_results_2016 = pd.read_csv('2016 US House Results by State-Table 1.csv', header=0,encoding = 'unicode_escape')

#12/14 State Results
Election_2012 = district_results.loc[district_results['year'] == 2012]
Election_2014 = district_results.loc[district_results['year'] == 2014]

#16/18 State Results 
Election_2016 = district_results.loc[district_results['year'] == 2016]
Election_2018 = district_results.loc[district_results['year'] == 2018]

#Sports Data 
sports_results_all = pd.read_csv('sports_data.csv')

#Economic Indicators

#Real GDP per capita by state by year (2018 set)
rGDP_pc_2015 = pd.read_csv('Real_GDP_pc/2015_rGDP_edit.csv')
rGDP_pc_2016 = pd.read_csv('Real_GDP_pc/2016_rGDP_edit.csv')
rGDP_pc_2017 = pd.read_csv('Real_GDP_pc/2017_rGDP_edit.csv')

#Real GDP per capita by state by year (2014 set)
rGDP_pc_2011 = pd.read_csv('Real_GDP_pc/2011_rGDP_edit.csv')
rGDP_pc_2012 = pd.read_csv('Real_GDP_pc/2012_rGDP_edit.csv')
rGDP_pc_2013 = pd.read_csv('Real_GDP_pc/2013_rGDP_edit.csv')


#Unemployment delta by state, 2016--2018
## US national delta = -1%
unemp_delta_df = pd.read_csv('2014_2018_unemp_oneyr_delta.csv')
rGDP_pc_2015.columns = ['state','2015_GDP']
economic_indicators = rGDP_pc_2015.copy()
economic_indicators['2016_GDP'] = rGDP_pc_2016['value']
economic_indicators['2017_GDP'] = rGDP_pc_2017['value']
economic_indicators['2011_GDP'] = rGDP_pc_2011['value']
economic_indicators['2012_GDP'] = rGDP_pc_2012['value']
economic_indicators['2013_GDP'] = rGDP_pc_2013['value']

economic_indicators['state'] = states
# economic_indicators = economic_indicators.join(unemp_delta_df, on = 'state')
economic_indicators = economic_indicators.join(unemp_delta_df.set_index('state'), on='state', how='inner')


economic_indicators['GDP_growth_2017'] = (economic_indicators['2017_GDP']-economic_indicators['2016_GDP'])/ economic_indicators['2017_GDP']
economic_indicators['GDP_growth_2013'] = (economic_indicators['2013_GDP']-economic_indicators['2012_GDP'])/ economic_indicators['2013_GDP']


##### District Data 2014

# district education
district_educ_14 = pd.read_csv('District_Education.csv', header=0,encoding = 'unicode_escape')
district_educ_14 = district_educ_14.drop([district_educ_14.index[87], district_educ_14.index[436]])
bach_14 = district_educ_14["Total; Estimate; Percent bachelor's degree or higher"]
hs_14 = district_educ_14["Total; Estimate; Percent high school graduate or higher"]
dist_names_14 = district_educ_14['Geography']

# district poverty
district_pov_14 = pd.read_csv('District Poverty.csv', header=0,encoding = 'unicode_escape')
district_pov_14 = district_pov_14.drop([district_pov_14.index[87], district_pov_14.index[436]])
pov_14 = district_pov_14['Percent below poverty level; Estimate; Population for whom poverty status is determined']

# district age gropus
district_age_14 = pd.read_csv('District Age Group.csv', header=0,encoding = 'unicode_escape')
district_age_14 = district_age_14.drop([district_age_14.index[87], district_age_14.index[436]])

## district income
district_income_14 = pd.read_csv('Household Income.csv', header=0,encoding = 'unicode_escape')
district_income_14 = district_income_14.drop([district_income_14.index[87], district_income_14.index[436]])
median_income_14 = district_income_14['Households; Estimate; Median income (dollars)'] 

###### District Data 2017 for 2018 dataframe
# district education

# extra things in district data, removing them 
district_educ_18 = pd.read_csv('Education 2017.csv', header=0,encoding = 'unicode_escape')
district_educ_18 = district_educ_18.drop([district_educ_18.index[87], district_educ_18.index[436]])
bach_18 = district_educ_18["Percent; Estimate; Percent bachelor's degree or higher"]
hs_18 = district_educ_18["Percent; Estimate; Percent high school graduate or higher"]
dist_names_18 = district_educ_18['Geography']

# district poverty
district_pov_18 = pd.read_csv('Poverty 2017.csv', header=0,encoding = 'unicode_escape')
district_pov_18 = district_pov_18.drop([district_pov_18.index[87], district_pov_18.index[436]])
pov_18 = district_pov_18['Percent below poverty level; Estimate; Population for whom poverty status is determined']

# district income
district_income_18 = pd.read_csv('Income 2017.csv', header=0,encoding = 'unicode_escape')
district_income_18 = district_income_18.drop([district_income_18.index[87], district_income_18.index[436]])
median_income_18 = district_income_18['Households; Estimate; Median income (dollars)'] 


In [3]:
import glob

# Load Census District Data

filenames = glob.glob('District Data' + "/*.csv")


dfs = {}

filenames.sort()
i = 0 
for filename in filenames:
    dfs['state' + str(i)] = pd.read_csv(filename)
    i = i + 1

## Process Data & Build Data Frames

In [4]:
agg_dist_14 = pd.DataFrame(dist_names_14)
agg_dist_14.insert(1, "Percent high school", hs_14)
agg_dist_14.insert(2, "Percent bachelor degrees", bach_14)
agg_dist_14.insert(3, "Below Poverty Line", pov_14)
agg_dist_14.insert(4, "Median Income", median_income_14)

agg_dist_18 = pd.DataFrame(dist_names_18)
agg_dist_18.insert(1, "Percent high school", hs_18)
agg_dist_18.insert(2, "Percent bachelor degrees", bach_18)
agg_dist_18.insert(3, "Below Poverty Line", pov_18)
agg_dist_18.insert(4, "Median Income", median_income_18)

In [5]:
agg_dist_18.head()

Unnamed: 0,Geography,Percent high school,Percent bachelor degrees,Below Poverty Line,Median Income
0,"Congressional District 1 (115th Congress), Ala...",87.0,24.0,18.0,46449
1,"Congressional District 2 (115th Congress), Ala...",84.1,22.1,18.8,44765
2,"Congressional District 3 (115th Congress), Ala...",84.0,21.5,18.7,44725
3,"Congressional District 4 (115th Congress), Ala...",80.9,16.9,18.5,41822
4,"Congressional District 5 (115th Congress), Ala...",87.1,30.8,15.0,52874


In [6]:
agg_dist_14.head()

Unnamed: 0,Geography,Percent high school,Percent bachelor degrees,Below Poverty Line,Median Income
0,"Congressional District 1 (114th Congress), Ala...",85.4,22.4,18.8,44302
1,"Congressional District 2 (114th Congress), Ala...",83.0,21.3,19.2,43205
2,"Congressional District 3 (114th Congress), Ala...",82.0,20.1,21.0,41037
3,"Congressional District 4 (114th Congress), Ala...",79.2,15.5,19.2,39316
4,"Congressional District 5 (114th Congress), Ala...",85.7,29.2,14.9,50264


In [7]:
##### 2014 Set  (includes 2012,2014)
#Get winners for each district 
district_winners_2012 = pd.DataFrame()
for state in states : 
    temp = Election_2012.loc[Election_2012['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    counter = 0
    while counter <= districts_number : 
        district = temp.loc[temp['district'] == counter]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2012 = district_winners_2012.append(winner)
        counter = counter + 1 

#will be the 'result'
#  Dataframe to add district predictors 
district_winners_2014 = pd.DataFrame()
for state in states : 
    temp = Election_2014.loc[Election_2014['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    counter = 0
    while counter <= districts_number : 
        district = temp.loc[temp['district'] == counter]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2014 = district_winners_2014.append(winner)
        counter = counter + 1 
        

###### 2018 Set (includes 2016,2018)
district_winners_2016 = pd.DataFrame()
for state in states : 
    temp = Election_2016.loc[Election_2016['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    counter = 0
    while counter <= districts_number : 
        district = temp.loc[temp['district'] == counter]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2016 = district_winners_2016.append(winner)
        counter = counter + 1 


district_winners_2018 = pd.DataFrame()
for state in states : 
    temp = Election_2018.loc[Election_2018['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    counter = 0
    while counter <= districts_number : 
        district = temp.loc[temp['district'] == counter]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2018 = district_winners_2018.append(winner)
        counter = counter + 1 
        

In [8]:
len(district_winners_2018)

435

In [9]:
len(district_winners_2018.state_po.value_counts())

50

In [10]:
# Classify based on party 

for district in district_winners_2012.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2012.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2012.loc[district[0],'class'] = 1
    else:
        district_winners_2012.loc[district[0],'class'] = 2
        
for district in district_winners_2014.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2014.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2014.loc[district[0],'class'] = 1
    else:
        district_winners_2014.loc[district[0],'class'] = 2
    
for district in district_winners_2016.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2016.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2016.loc[district[0],'class'] = 1
    else:
        district_winners_2016.loc[district[0],'class'] = 2

for district in district_winners_2018.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2018.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2018.loc[district[0],'class'] = 1
    else:
        district_winners_2018.loc[district[0],'class'] = 2

In [11]:
# Mean statistic across all districts for each state  
# Handled differently because data format likely ----
# this Takes district data for Poverty, Bachelors, HSdegree and averages
# it to produce a single value for a State-level dataframe

percentage_below_poverty_line = []
percent_bachelor = []
percent_hs = []
for state in dfs:
    num_cols = len(dfs[state].columns)
    stats_pl = dfs[state][dfs[state]['Subject'] == 'Percentage of Families and People Whose Income in the Past 12 Months is Below the Poverty Level'].iloc[0:1,]
    stats_ba = dfs[state][dfs[state]['Title'] == "Percent bachelor's degree or higher"]
    stats_hs = dfs[state][dfs[state]['Title'] == "Percent high school graduate or higher"]
    pl_mean_sum = 0
    hs_mean_sum = 0
    ba_mean_sum = 0
    for col in range(num_cols):
        if col < 3:
            continue
        elif (col % 2) != 0:
            pl_mean_sum += float(stats_pl.iloc[:,col].values[0])
            hs_mean_sum += float(stats_hs.iloc[:,col].values[0])
            ba_mean_sum += float(stats_ba.iloc[:,col].values[0])
        else:
            continue
            
    pl_mean = pl_mean_sum / ((num_cols - 3)/2)
    percentage_below_poverty_line.append(pl_mean)
    
    hs_mean = hs_mean_sum / ((num_cols - 3)/2)
    
    ba_mean = ba_mean_sum / ((num_cols - 3)/2)
    
    percent_hs.append(hs_mean)
    percent_bachelor.append(ba_mean)

In [12]:
# State Data for 2014

#education data
district_educ_14_st = pd.read_csv('district14/Education Attainment edited.csv', header=0,encoding = 'unicode_escape')
bach_14_st = district_educ_14_st["Total; Estimate; Percent bachelor's degree or higher"]
hs_14_st = district_educ_14_st["Total; Estimate; Percent high school graduate or higher"]
state_names_14_st = district_educ_14_st['Geography']

# poverty data
pov_14_st = pd.read_csv('district14/District Poverty.csv')
poverty_14_st = pov_14_st['Percent below poverty level; Estimate; Population for whom poverty status is determined']

dist14 = pd.DataFrame(state_names_14_st)
dist14.insert(1, "hs14", hs_14_st/100)
dist14.insert(2, "bachelors14", bach_14_st/100)
dist14.insert(3, "poverty14", poverty_14_st/100)
dist14 = dist14[:51]
dist14.insert(4, 'state', states)


In [13]:
#sports results already had 14,18 full athletic records
# This block adds 2018 State Data

sports_results_all['hs18'] = pd.Series(percent_hs)/100
sports_results_all['bachelors18'] = pd.Series(percent_bachelor)/100
sports_results_all['poverty18'] = pd.Series(percentage_below_poverty_line)/100

In [14]:
sports_results_all = sports_results_all.join(dist14.set_index('state'), on='State').drop('Geography', axis=1)

In [15]:
sports_results_all.shape

(51, 15)

In [16]:
economic_indicators18 = economic_indicators[['state', '2015_GDP', '2016_GDP', '2017_GDP','2018_oneyr_unemp_delta', 'GDP_growth_2017']]
economic_indicators14 = economic_indicators[['state', '2011_GDP', '2012_GDP', '2013_GDP','2014_oneyr_unemp_delta', 'GDP_growth_2013']]
sports_results18 = sports_results_all[['State','cfb 18', 'nfl 18', 'nba 18', 'cbb 18', 'poverty18', 'bachelors18', 'hs18']]
sports_results14 = sports_results_all[['State','cfb 14', 'nfl 14', 'nba 14', 'cbb 14', 'poverty14', 'bachelors14', 'hs14']]

In [17]:
economic_indicators18.head()

Unnamed: 0,state,2015_GDP,2016_GDP,2017_GDP,2018_oneyr_unemp_delta,GDP_growth_2017
0,AL,39014,39201,39594,-0.5,0.009926
1,AK,72943,71086,71274,-0.4,0.002638
2,AZ,41008,41643,42476,-0.1,0.019611
3,AR,38229,38303,38411,0.0,0.002812
4,CA,62347,63785,66262,-0.6,0.037382


In [18]:
economic_indicators14.head()

Unnamed: 0,state,2011_GDP,2012_GDP,2013_GDP,2014_oneyr_unemp_delta,GDP_growth_2013
0,AL,38562,38687,38954,-0.4,0.006854
1,AK,75683,78957,74283,-0.1,-0.062922
2,AZ,40660,40919,40716,-0.9,-0.004986
3,AR,36830,36836,37769,-1.2,0.024703
4,CA,55565,56492,58015,-1.4,0.026252


In [19]:
sports_results18.head()

Unnamed: 0,State,cfb 18,nfl 18,nba 18,cbb 18,poverty18,bachelors18,hs18
0,AK,0.565,0.846,0.72,0.517,0.075,0.302,0.933
1,AL,0.933,0.813,0.293,0.556,0.124714,0.253,0.864857
2,AR,0.142,0.625,0.268,0.657,0.12775,0.231,0.87175
3,AZ,0.666,0.5,0.256,0.771,0.102444,0.294,0.871111
4,CA,0.636,0.813,0.707,0.636,0.093038,0.33383,0.833132


In [20]:
sports_results14.head()

Unnamed: 0,State,cfb 14,nfl 14,nba 14,cbb 14,poverty14,bachelors14,hs14
0,AK,0.542,0.75,0.622,0.552,0.101,0.28,0.929
1,AL,0.857,0.375,0.732,0.558,0.189,0.235,0.847
2,AR,0.538,0.75,0.671,0.75,0.192,0.214,0.853
3,AZ,0.714,0.688,0.476,0.894,0.182,0.276,0.861
4,CA,0.769,0.5,0.683,0.611,0.164,0.317,0.821


In [21]:
#Classify based on presidential election results

pres_by_state = pres_gen_results_2016[pres_gen_results_2016['WINNER INDICATOR']=='W']
for state in pres_by_state.iterrows():
    if state[1]['LAST NAME'] == 'Trump' : 
        pres_by_state.loc[state[0],'pres_class'] = 0.0
    else:
        pres_by_state.loc[state[0],'pres_class'] = 1.0
pres_by_state = pres_by_state[['STATE ABBREVIATION', 'pres_class']]

pres_by_state['State'] = pres_by_state['STATE ABBREVIATION']
pres_by_state = pres_by_state.drop('STATE ABBREVIATION', axis=1)
# pres_class tells us which party won that state in the 2016 presidential election

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
# Combine sports, incumbency, presidential results, economic data dataframes 
def same(row):
    if row.pres == row.incumbency:
        return 1
    else:
        return 0
    
def incwin(row):
    if row['class'] == row.incumbency:
        return 1
    else:
        return 0
# Combine remaining incumbency data ***

In [23]:
district_winners_2018['incumbency'] = list(district_winners_2016['class'])
model_frame18_st = district_winners_2018[['state_po','state_fips','district','incumbency','class']]
model_frame18_st = model_frame18_st.join(sports_results18.set_index('State'), on='state_po', how='inner')
model_frame18_st = model_frame18_st.join(pres_by_state.set_index('State'), on='state_po', how='inner')
model_frame18_st = model_frame18_st.drop('state_fips', axis=1)
model_frame18_st['state_district'] = model_frame18_st.apply(lambda row: row.state_po + str(row.district), axis=1)
#model_frame = model_frame.drop(['average','district'], axis=1)
model_frame18_st = model_frame18_st.join(economic_indicators18.set_index('state'), on='state_po', how='inner')
#model_frame18['state_code'] = pd.Categorical(model_frame18.state_po).codes
model_frame18_st['pres'] = 0
model_frame18_st['same'] = model_frame18_st.apply(lambda row: same(row), axis=1)
model_frame18_st['inc_win'] = model_frame18_st.apply(lambda row: incwin(row), axis=1)

In [24]:
model_frame18_st.shape

(435, 21)

In [26]:
district_winners_2014['incumbency'] = list(district_winners_2012['class'])
model_frame14_st = district_winners_2014[['state_po','state_fips','district','incumbency','class']]
model_frame14_st = model_frame14_st.join(sports_results14.set_index('State'), on='state_po', how='inner')
model_frame14_st = model_frame14_st.join(pres_by_state.set_index('State'), on='state_po', how='inner')
model_frame14_st = model_frame14_st.drop('state_fips', axis=1)
model_frame14_st['state_district'] = model_frame14_st.apply(lambda row: row.state_po + str(row.district), axis=1)
#model_frame = model_frame.drop(['average','district'], axis=1)
model_frame14_st = model_frame14_st.join(economic_indicators14.set_index('state'), on='state_po', how='inner')
#model_frame14['state_code'] = pd.Categorical(model_frame14.state_po).codes
model_frame14_st['pres'] = 1

model_frame14_st['same'] = model_frame14_st.apply(lambda row: same(row), axis=1)
model_frame14_st['inc_win'] = model_frame14_st.apply(lambda row: incwin(row), axis=1)

In [27]:
model_frame14_st.shape

(435, 21)

In [28]:
model_frame14_st.columns

Index(['state_po', 'district', 'incumbency', 'class', 'cfb 14', 'nfl 14',
       'nba 14', 'cbb 14', 'poverty14', 'bachelors14', 'hs14', 'pres_class',
       'state_district', '2011_GDP', '2012_GDP', '2013_GDP',
       '2014_oneyr_unemp_delta', 'GDP_growth_2013', 'pres', 'same', 'inc_win'],
      dtype='object')

In [29]:
# currently has averaged data for pov, bachelors, HS
model_frame18_st.columns

Index(['state_po', 'district', 'incumbency', 'class', 'cfb 18', 'nfl 18',
       'nba 18', 'cbb 18', 'poverty18', 'bachelors18', 'hs18', 'pres_class',
       'state_district', '2015_GDP', '2016_GDP', '2017_GDP',
       '2018_oneyr_unemp_delta', 'GDP_growth_2017', 'pres', 'same', 'inc_win'],
      dtype='object')

In [36]:
model_frame14_dist = model_frame14_st.copy()
model_frame14_dist['poverty14'] = agg_dist_14['Below Poverty Line'].values
model_frame14_dist['bachelors14'] = agg_dist_14['Percent bachelor degrees'].values
model_frame14_dist['hs14'] = agg_dist_14['Percent high school'].values
model_frame14_dist['med_inc14'] = agg_dist_14['Median Income'].values

In [37]:
model_frame18_dist = model_frame18_st.copy()
model_frame18_dist['poverty18'] = agg_dist_18['Below Poverty Line'].values
model_frame18_dist['bachelors18'] = agg_dist_18['Percent bachelor degrees'].values
model_frame18_dist['hs18'] = agg_dist_18['Percent high school'].values
model_frame18_dist['med_inc18'] = agg_dist_18['Median Income'].values

In [38]:
print(model_frame14_dist.shape)
print(model_frame18_dist.shape)

(435, 22)
(435, 22)


# Final Dataframes, difference is district level data is averaged in _st, it is left at district level granularity in _dist

In [39]:
print(model_frame14_dist.shape)
model_frame14_dist.head()

(435, 22)


Unnamed: 0,state_po,district,incumbency,class,cfb 14,nfl 14,nba 14,cbb 14,poverty14,bachelors14,...,state_district,2011_GDP,2012_GDP,2013_GDP,2014_oneyr_unemp_delta,GDP_growth_2013,pres,same,inc_win,med_inc14
25480,AL,1,0.0,0.0,0.857,0.375,0.732,0.558,18.8,22.4,...,AL1,38562,38687,38954,-0.4,0.006854,1,0,1,44302
25483,AL,2,0.0,0.0,0.857,0.375,0.732,0.558,19.2,21.3,...,AL2,38562,38687,38954,-0.4,0.006854,1,0,1,43205
25488,AL,3,0.0,0.0,0.857,0.375,0.732,0.558,21.0,20.1,...,AL3,38562,38687,38954,-0.4,0.006854,1,0,1,41037
25489,AL,4,0.0,0.0,0.857,0.375,0.732,0.558,19.2,15.5,...,AL4,38562,38687,38954,-0.4,0.006854,1,0,1,39316
25493,AL,5,0.0,0.0,0.857,0.375,0.732,0.558,14.9,29.2,...,AL5,38562,38687,38954,-0.4,0.006854,1,0,1,50264


In [40]:
print(model_frame18_dist.shape)
model_frame18_dist.head()

(435, 22)


Unnamed: 0,state_po,district,incumbency,class,cfb 18,nfl 18,nba 18,cbb 18,poverty18,bachelors18,...,state_district,2015_GDP,2016_GDP,2017_GDP,2018_oneyr_unemp_delta,GDP_growth_2017,pres,same,inc_win,med_inc18
28278,AL,1,0.0,0.0,0.933,0.813,0.293,0.556,18.0,24.0,...,AL1,39014,39201,39594,-0.5,0.009926,0,1,1,46449
28280,AL,2,0.0,0.0,0.933,0.813,0.293,0.556,18.8,22.1,...,AL2,39014,39201,39594,-0.5,0.009926,0,1,1,44765
28283,AL,3,0.0,0.0,0.933,0.813,0.293,0.556,18.7,21.5,...,AL3,39014,39201,39594,-0.5,0.009926,0,1,1,44725
28286,AL,4,0.0,0.0,0.933,0.813,0.293,0.556,18.5,16.9,...,AL4,39014,39201,39594,-0.5,0.009926,0,1,1,41822
28289,AL,5,0.0,0.0,0.933,0.813,0.293,0.556,15.0,30.8,...,AL5,39014,39201,39594,-0.5,0.009926,0,1,1,52874


In [41]:
print(model_frame14_st.shape)
model_frame14_st.head()

(435, 21)


Unnamed: 0,state_po,district,incumbency,class,cfb 14,nfl 14,nba 14,cbb 14,poverty14,bachelors14,...,pres_class,state_district,2011_GDP,2012_GDP,2013_GDP,2014_oneyr_unemp_delta,GDP_growth_2013,pres,same,inc_win
25480,AL,1,0.0,0.0,0.857,0.375,0.732,0.558,0.189,0.235,...,0.0,AL1,38562,38687,38954,-0.4,0.006854,1,0,1
25483,AL,2,0.0,0.0,0.857,0.375,0.732,0.558,0.189,0.235,...,0.0,AL2,38562,38687,38954,-0.4,0.006854,1,0,1
25488,AL,3,0.0,0.0,0.857,0.375,0.732,0.558,0.189,0.235,...,0.0,AL3,38562,38687,38954,-0.4,0.006854,1,0,1
25489,AL,4,0.0,0.0,0.857,0.375,0.732,0.558,0.189,0.235,...,0.0,AL4,38562,38687,38954,-0.4,0.006854,1,0,1
25493,AL,5,0.0,0.0,0.857,0.375,0.732,0.558,0.189,0.235,...,0.0,AL5,38562,38687,38954,-0.4,0.006854,1,0,1


In [42]:
print(model_frame18_st.shape)
model_frame18_st.head()

(435, 21)


Unnamed: 0,state_po,district,incumbency,class,cfb 18,nfl 18,nba 18,cbb 18,poverty18,bachelors18,...,pres_class,state_district,2015_GDP,2016_GDP,2017_GDP,2018_oneyr_unemp_delta,GDP_growth_2017,pres,same,inc_win
28278,AL,1,0.0,0.0,0.933,0.813,0.293,0.556,0.124714,0.253,...,0.0,AL1,39014,39201,39594,-0.5,0.009926,0,1,1
28280,AL,2,0.0,0.0,0.933,0.813,0.293,0.556,0.124714,0.253,...,0.0,AL2,39014,39201,39594,-0.5,0.009926,0,1,1
28283,AL,3,0.0,0.0,0.933,0.813,0.293,0.556,0.124714,0.253,...,0.0,AL3,39014,39201,39594,-0.5,0.009926,0,1,1
28286,AL,4,0.0,0.0,0.933,0.813,0.293,0.556,0.124714,0.253,...,0.0,AL4,39014,39201,39594,-0.5,0.009926,0,1,1
28289,AL,5,0.0,0.0,0.933,0.813,0.293,0.556,0.124714,0.253,...,0.0,AL5,39014,39201,39594,-0.5,0.009926,0,1,1


# Still need preprocessing  -- did we try regularization, standardization, normalization?....what do those words mean can we explain why we chose to do or not to do a given pre-processing step

# Also -- must annotate the actual data cleaning (e.g. deleting puerto rico data, what data was stretched from state to every district)