In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

In [2]:
# load data 

# House election Harvard Dataverse https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2
district_results = pd.read_csv('1976-2018-house.csv', header=0,encoding = 'unicode_escape')
district_results['candidatevotes'] = district_results['candidatevotes'].str.replace(',', '')
district_results['candidatevotes'] = district_results['candidatevotes'].astype(int)

# 2016 Election Results https://transition.fec.gov/general/FederalElections2016.shtml
pres_gen_results_2016 = pd.read_csv('2016 Pres General Results-Table 1.csv', header=0,encoding = 'unicode_escape')
sen_results_2016 = pd.read_csv('2016 US Senate Results by State-Table 1.csv', header=0,encoding = 'unicode_escape')
house_results_2016 = pd.read_csv('2016 US House Results by State-Table 1.csv', header=0,encoding = 'unicode_escape')

#12/14 State Results
Election_2012 = district_results.loc[district_results['year'] == 2012]
Election_2014 = district_results.loc[district_results['year'] == 2014]
# 16/18 State Results 
Election_2016 = district_results.loc[district_results['year'] == 2016]
Election_2018 = district_results.loc[district_results['year'] == 2018]

# Sports Data 
sports_results = pd.read_csv('sports_data.csv')

states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE','DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

In [3]:
import glob

# Load Census District Data

filenames = glob.glob('District Data' + "/*.csv")


dfs = {}

filenames.sort()
i = 0 
for filename in filenames:
    dfs['state' + str(i)] = pd.read_csv(filename)
    i = i + 1
    

In [4]:
##### 2014 Set  (includes 2012,2014)
#Get winners for each district 
district_winners_2012 = pd.DataFrame()
for state in states : 
    temp = Election_2012.loc[Election_2012['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    while districts_number > 0 : 
        district = temp.loc[temp['district'] == districts_number]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2012 = district_winners_2012.append(winner)
        districts_number = districts_number - 1

#will be the 'result'
district_winners_2014 = pd.DataFrame()
for state in states : 
    temp = Election_2014.loc[Election_2014['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    while districts_number > 0 : 
        district = temp.loc[temp['district'] == districts_number]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2014 = district_winners_2014.append(winner)
        districts_number = districts_number - 1

###### 2018 Set (includes 2016,2018)
district_winners_2016 = pd.DataFrame()
for state in states : 
    temp = Election_2016.loc[Election_2016['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    while districts_number > 0 : 
        district = temp.loc[temp['district'] == districts_number]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2016 = district_winners_2016.append(winner)
        districts_number = districts_number - 1

district_winners_2018 = pd.DataFrame()
for state in states : 
    temp = Election_2018.loc[Election_2018['state_po'] == state]
    #number of districts
    districts_number = temp['district'].max()
    while districts_number > 0 : 
        district = temp.loc[temp['district'] == districts_number]
        winner = pd.DataFrame(district.loc[district['candidatevotes'] == district['candidatevotes'].max()])
        district_winners_2018 = district_winners_2018.append(winner)
        districts_number = districts_number - 1

In [5]:
# Classify based on party 

for district in district_winners_2012.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2012.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2012.loc[district[0],'class'] = 1
    else:
        district_winners_2012.loc[district[0],'class'] = 2
        
for district in district_winners_2014.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2014.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2014.loc[district[0],'class'] = 1
    else:
        district_winners_2014.loc[district[0],'class'] = 2

    
for district in district_winners_2016.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2016.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2016.loc[district[0],'class'] = 1
    else:
        district_winners_2016.loc[district[0],'class'] = 2

for district in district_winners_2018.iterrows():
    # May be able to use the vote count to assign the correct value back to the original dataframe, 
    # only two winners shared an exact vote count so only 1 collision must be handled
    if district[1]['party'] == 'republican' : 
        district_winners_2018.loc[district[0],'class'] = 0
    elif district[1]['party'] == 'democrat' :
        district_winners_2018.loc[district[0],'class'] = 1
    else:
        district_winners_2018.loc[district[0],'class'] = 2


In [6]:
# Mean statistic across all districts for each state  

percentage_below_poverty_line = []
percent_bachelor = []
percent_hs = []
for state in dfs:
    num_cols = len(dfs[state].columns)
    stats_pl = dfs[state][dfs[state]['Subject'] == 'Percentage of Families and People Whose Income in the Past 12 Months is Below the Poverty Level'].iloc[0:1,]
    stats_ba = dfs[state][dfs[state]['Title'] == "Percent bachelor's degree or higher"]
    stats_hs = dfs[state][dfs[state]['Title'] == "Percent high school graduate or higher"]
    pl_mean_sum = 0
    hs_mean_sum = 0
    ba_mean_sum = 0
    for col in range(num_cols):
        if col < 3:
            continue
        elif (col % 2) != 0:
            pl_mean_sum += float(stats_pl.iloc[:,col].values[0])
            hs_mean_sum += float(stats_hs.iloc[:,col].values[0])
            ba_mean_sum += float(stats_ba.iloc[:,col].values[0])
        else:
            continue
            
    pl_mean = pl_mean_sum / ((num_cols - 3)/2)
    percentage_below_poverty_line.append(pl_mean)
    
    hs_mean = hs_mean_sum / ((num_cols - 3)/2)
    
    ba_mean = ba_mean_sum / ((num_cols - 3)/2)
    
    percent_hs.append(hs_mean)
    percent_bachelor.append(ba_mean)
    
    

In [7]:
# Add average district data to sports dataframe 
sports_results['poverty'] = pd.Series(percentage_below_poverty_line)
sports_results['bachelors'] = pd.Series(percent_bachelor)
sports_results['hs'] = pd.Series(percent_hs)

NameError: name 'unemp_delta' is not defined

In [20]:
len(unemp_delta_df)

51

In [26]:

#Real GDP per capita by state by year (2018 set)
rGDP_pc_2015 = pd.read_csv('Real_GDP_pc/2015_rGDP_edit.csv')
rGDP_pc_2016 = pd.read_csv('Real_GDP_pc/2016_rGDP_edit.csv')
rGDP_pc_2017 = pd.read_csv('Real_GDP_pc/2017_rGDP_edit.csv')

#Real GDP per capita by state by year (2014 set)
rGDP_pc_2011 = pd.read_csv('Real_GDP_pc/2011_rGDP_edit.csv')
rGDP_pc_2012 = pd.read_csv('Real_GDP_pc/2012_rGDP_edit.csv')
rGDP_pc_2013 = pd.read_csv('Real_GDP_pc/2013_rGDP_edit.csv')


#Unemployment delta by state, 2016--2018
## US national delta = -1%
unemp_delta_df = pd.read_csv('2014_2018_unemp_oneyr_delta.csv')
rGDP_pc_2015.columns = ['state','2015_GDP']
economic_indicators = rGDP_pc_2015.copy()
economic_indicators['2016_GDP'] = rGDP_pc_2016['value']
economic_indicators['2017_GDP'] = rGDP_pc_2017['value']
economic_indicators['2011_GDP'] = rGDP_pc_2011['value']
economic_indicators['2012_GDP'] = rGDP_pc_2012['value']
economic_indicators['2013_GDP'] = rGDP_pc_2013['value']

economic_indicators['state'] = states
# economic_indicators = economic_indicators.join(unemp_delta_df, on = 'state')
economic_indicators = economic_indicators.join(unemp_delta_df.set_index('state'), on='state', how='inner')


economic_indicators['GDP_growth_2017'] = (economic_indicators['2017_GDP']-economic_indicators['2016_GDP'])/ economic_indicators['2017_GDP']
economic_indicators['GDP_growth_2013'] = (economic_indicators['2013_GDP']-economic_indicators['2012_GDP'])/ economic_indicators['2013_GDP']


In [25]:
economic_indicators

Unnamed: 0,state,2015_GDP,2016_GDP,2017_GDP,2011_GDP,2012_GDP,2013_GDP,2014_oneyr_unemp_delta,2018_oneyr_unemp_delta,GDP_growth_2017,GDP_growth_2013
0,AL,39014,39201,39594,38562,38687,38954,-0.4,-0.5,0.009926,0.006854
1,AK,72943,71086,71274,75683,78957,74283,-0.1,-0.4,0.002638,-0.062922
2,AZ,41008,41643,42476,40660,40919,40716,-0.9,-0.1,0.019611,-0.004986
3,AR,38229,38303,38411,36830,36836,37769,-1.2,0.0,0.002812,0.024703
4,CA,62347,63785,66262,55565,56492,58015,-1.4,-0.6,0.037382,0.026252
5,CO,56708,57166,58686,52459,52663,53607,-1.9,0.6,0.025901,0.01761
6,CT,67710,67845,68184,67452,67828,67062,-1.2,-0.6,0.004972,-0.011422
7,DE,69976,66485,65554,68353,67719,64048,-1.0,-0.7,-0.014202,-0.057316
8,DC,173621,174150,173944,180775,177615,173236,-0.7,-0.5,-0.001184,-0.025278
9,FL,41491,42013,42719,40001,39806,40080,-0.9,-0.6,0.016527,0.006836


In [12]:
# Classify based on presidential election results 

pres_by_state = pres_gen_results_2016[pres_gen_results_2016['WINNER INDICATOR']=='W']
for state in pres_by_state.iterrows():
    if state[1]['LAST NAME'] == 'Trump' : 
        pres_by_state.loc[state[0],'pres_class'] = 0.0
    else:
        pres_by_state.loc[state[0],'pres_class'] = 1.0
pres_by_state = pres_by_state[['STATE ABBREVIATION', 'pres_class']]

pres_by_state['State'] = pres_by_state['STATE ABBREVIATION']
pres_by_state = pres_by_state.drop('STATE ABBREVIATION', axis=1)
# pres_class tells us which party won that state in the 2016 presidential election

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [13]:
# Combine sports, incumbency, presidential results, economic data dataframes 

# Combine remaining incumbency data ***
    
    
district_winners_2018['incumbency'] = list(district_winners_2016['class'])
model_frame = district_winners_2018[['state_po','state_fips','district','incumbency','class']]
model_frame = model_frame.join(sports_results.set_index('State'), on='state_po', how='inner')
model_frame = model_frame.join(pres_by_state.set_index('State'), on='state_po', how='inner')
model_frame = model_frame.drop('state_fips', axis=1)
model_frame['state_district'] = model_frame.apply(lambda row: row.state_po + str(row.district), axis=1)
model_frame = model_frame.drop(['average','district'], axis=1)
model_frame = model_frame.join(economic_indicators.set_index('state'), on='state_po', how='inner')
model_frame['state_code'] = pd.Categorical(model_frame.state_po).codes

In [None]:
incumbent_wins = model_frame[model_frame['incumbency'] == model_frame['class']]
incumbent_loss = model_frame[model_frame['incumbency'] != model_frame['class']]

In [None]:
# boxplot for each feature 

fig, ax = plt.subplots(1,9, figsize=(20, 5))
ax[0].boxplot(model_frame['cfb'])
ax[1].boxplot(model_frame['nfl'])
ax[2].boxplot(model_frame['nba'])
ax[3].boxplot(model_frame['cbb'])
ax[4].boxplot(model_frame['poverty'])
ax[5].boxplot(model_frame['bachelors'])
ax[6].boxplot(model_frame['hs'])
ax[7].boxplot(model_frame['difference'])
ax[8].boxplot(model_frame['GDP_growth_2017'])

stats = ['College Football', 'NFL', 'NBA', 'College Baseball', 'poverty', 'bachelors', 'HS', 'difference', 'GDP Growth 2017']
for i in range(9):
    ax[i].set_title(stats[i])

In [None]:
# Scatter Matrix 

to_scatter = ['cfb', 'nfl', 'nba', 'cbb', 'poverty', 'bachelors', 'hs', 'difference', 'GDP_growth_2017']

df_to_scatter = model_frame.loc[:,to_scatter]

from pandas.plotting import scatter_matrix
scatter_matrix(df_to_scatter, alpha=0.8, figsize=(10, 10), diagonal='kde');

In [None]:
plt.hist(incumbent_wins['2017_GDP'], label='Incumbent won')
plt.hist(incumbent_loss['2017_GDP'], label= 'Incumbent lost')
plt.xlabel('GDP per capita')
plt.ylabel('Number Of Candidates')
plt.title('Incument wins and losses distributed by GDP per capit')
plt.legend()


In [None]:
plt.hist(model_frame[model_frame['class'] == 0]['poverty'], label = 'Republican')
plt.hist(model_frame[model_frame['class'] == 1]['poverty'], alpha = .4, label = 'Democrat')
plt.hist(model_frame[model_frame['class'] == 2]['poverty'], alpha = 1, label = 'Other')
plt.xlabel('Percentage of State Below Poverty Line')
plt.ylabel('Number Of Candidates')
plt.title('Poverty Rates split by Party Vote')
plt.legend()

In [None]:
plt.hist(model_frame[model_frame['class'] == 0]['bachelors'], label = 'Republican')
plt.hist(model_frame[model_frame['class'] == 1]['bachelors'], alpha = .4, label = 'Democrat')
plt.hist(model_frame[model_frame['class'] == 2]['bachelors'], alpha = 1, label = 'Other')
plt.xlabel('Percentage of residents with Bachelors Degree')
plt.ylabel('Number Of Candidates')
plt.title('Bachelors Degree Rates split by Party Vote')
plt.legend()

In [None]:
plt.hist(model_frame[model_frame['class'] == 0]['hs'], label = 'Republican')
plt.hist(model_frame[model_frame['class'] == 1]['hs'], alpha = .4, label = 'Democrat')
plt.hist(model_frame[model_frame['class'] == 2]['hs'], alpha = 1, label = 'Other')
plt.xlabel('Percentage of residents with Highschool Degree')
plt.ylabel('Number Of Candidates')
plt.title('Highschool Degree Rates split by Party Vote')
plt.legend()

In [None]:
model_frame_to_try = model_frame.drop(['state_po','state_district'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(model_frame_to_try.drop(['class'], axis=1), model_frame_to_try['class'], test_size=0.2, random_state=42)

In [None]:
=================

In [None]:
len(X_train.columns)

In [None]:
X_train_full = X_train.copy()
X_test_full = X_test.copy()
y_train_full = y_train.copy()
y_test_full = y_test.copy()
print(len(X_train_full.columns))

In [None]:
random_forest_full = RandomForestClassifier(max_features = int(np.sqrt(X_train_full.shape[1])), max_depth=5, n_estimators = 55)
random_forest_full.fit(X_train_full,y_train_full)

random_forest_train_predictions_full = random_forest_full.predict(X_train_full)
random_forest_test_predictions_full = random_forest_full.predict(X_test_full)

random_forest_train_score_full = accuracy_score(y_train, random_forest_train_predictions_full)
random_forest_test_score_full = accuracy_score(y_test, random_forest_test_predictions_full)

print("Random Forest Train Score: " + str(random_forest_train_score_full))
print("Random Forest Test Score: " + str(random_forest_test_score_full))

In [None]:
X_train_full.head()

In [None]:
==================

In [None]:
X_train = X_train[['incumbency','pres_class','poverty','bachelors','hs','2017_GDP','cfb','nfl','nba','cbb']] # with most standard predictors
X_test = X_test[['incumbency','pres_class', 'poverty','bachelors','hs','2017_GDP','cfb','nfl','nba','cbb']]

In [None]:
X_train.columns

In [None]:
random_forest_model = RandomForestClassifier(max_features = int(np.sqrt(X_train.shape[1])), max_depth=5, n_estimators = 55)
random_forest_model.fit(X_train, y_train)

random_forest_train_predictions = random_forest_model.predict(X_train)
random_forest_test_predictions = random_forest_model.predict(X_test)

random_forest_train_score = accuracy_score(y_train, random_forest_train_predictions)
random_forest_test_score = accuracy_score(y_test, random_forest_test_predictions)

print("Random Forest Train Score: " + str(random_forest_train_score))
print("Random Forest Test Score: " + str(random_forest_test_score))

In [None]:
X_train = X_train[['pres_class','poverty','bachelors','hs','2017_GDP']] # without district incumbency information
X_test = X_test[['pres_class', 'poverty','bachelors','hs','2017_GDP']]

In [None]:
random_forest_model = RandomForestClassifier(max_features = int(np.sqrt(X_train.shape[1])), max_depth=5, n_estimators = 55)
random_forest_model.fit(X_train, y_train)

random_forest_train_predictions = random_forest_model.predict(X_train)
random_forest_test_predictions = random_forest_model.predict(X_test)

random_forest_train_score = accuracy_score(y_train, random_forest_train_predictions)
random_forest_test_score = accuracy_score(y_test, random_forest_test_predictions)

print("Random Forest Train Score: " + str(random_forest_train_score))
print("Random Forest Test Score: " + str(random_forest_test_score))

In [None]:
X_train = X_train[['poverty','bachelors','hs','2017_GDP']] # without 2016 presidential election information
X_test = X_test[['poverty','bachelors','hs','2017_GDP']]

In [None]:
random_forest_model = RandomForestClassifier(max_features = int(np.sqrt(X_train.shape[1])), max_depth=5, n_estimators = 55)
random_forest_model.fit(X_train, y_train)

random_forest_train_predictions = random_forest_model.predict(X_train)
random_forest_test_predictions = random_forest_model.predict(X_test)

random_forest_train_score = accuracy_score(y_train, random_forest_train_predictions)
random_forest_test_score = accuracy_score(y_test, random_forest_test_predictions)

print("Random Forest Train Score: " + str(random_forest_train_score))
print("Random Forest Test Score: " + str(random_forest_test_score))