In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
# Reading the csv
elections_df = pd.read_csv("elections_2018_all.csv")
elections_df.head()

Unnamed: 0,year,case_id,weight,weight_cumulative,state,st,cong,cong_up,state_post,st_post,...,voted_gov_chosen,rep_current,rep_icpsr,sen1_current,sen1_icpsr,sen2_current,sen2_icpsr,gov_current,candidate_id,ideoId
0,2018,410751329,0.808436,0.441945,Texas,TX,115,116,,,...,,Gene Green (D),39304.0,John Cornyn (R),40305.0,Ted Cruz (R),41304.0,Greg Abbott (R),1,4
1,2018,410766300,1.757688,0.960869,Ohio,OH,115,116,Ohio,OH,...,Mike DeWine (R),Steve Chabot (R),29550.0,Rob Portman (R),29386.0,Sherrod Brown (D),29389.0,John Kasich (R),2,1
2,2018,410770169,1.024198,0.559895,Kentucky,KY,115,116,Kentucky,KY,...,,John Yarmuth (D),20723.0,Mitch McConnell (R),14921.0,Rand Paul (R),41104.0,Matt Bevin (R),1,2
3,2018,410770285,0.461958,0.252537,Arizona,AZ,115,116,Arizona,AZ,...,Doug Ducey (R),Kyrsten Sinema (D),21300.0,Jeff Flake (R),20100.0,Jon Kyl (R),15429.0,Doug Ducey (R),2,3
4,2018,410099450,0.275367,0.150534,Pennsylvania,PA,115,116,Pennsylvania,PA,...,,Mike Kelly (R),21167.0,"Bob Casey, Jr. (D)",40703.0,Pat Toomey (R),29935.0,Tom Wolf (D),2,2


In [4]:
# Selecting the apporpriate columns
df = elections_df[['st', 'gender', 'age', 'educ', 'race', 'faminc', 'employ', 'ownhome', 'marstat', 'newsint', 'approval_pres', 'ideo5', 'voted_pres_16']]

In [5]:
df = pd.DataFrame(df)
df.head()

Unnamed: 0,st,gender,age,educ,race,faminc,employ,ownhome,marstat,newsint,approval_pres,ideo5,voted_pres_16
0,TX,2,45,3,2,10k - 20k,Unemployed,Rent,2.0,7.0,3.0,Not Sure,Hilary Clinton
1,OH,2,58,6,1,150k+,Full-Time,Own,1.0,3.0,2.0,Conservative,Donald Trump
2,KY,2,66,6,1,10k - 20k,Retired,Rent,3.0,1.0,4.0,Liberal,Hilary Clinton
3,AZ,2,88,2,1,20k - 30k,Retired,Rent,4.0,2.0,2.0,Moderate,Donald Trump
4,PA,2,59,6,1,10k - 20k,Permanently Disabled,Rent,3.0,1.0,3.0,Liberal,Donald Trump


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   st             60000 non-null  object 
 1   gender         60000 non-null  int64  
 2   age            60000 non-null  int64  
 3   educ           60000 non-null  int64  
 4   race           60000 non-null  int64  
 5   faminc         59661 non-null  object 
 6   employ         59959 non-null  object 
 7   ownhome        59794 non-null  object 
 8   marstat        59978 non-null  float64
 9   newsint        59615 non-null  float64
 10  approval_pres  59967 non-null  float64
 11  ideo5          59581 non-null  object 
 12  voted_pres_16  46477 non-null  object 
dtypes: float64(3), int64(4), object(6)
memory usage: 6.0+ MB
None


In [7]:
# See how many null values there are
df.isnull().values.sum()

14968

In [8]:
# Drop rows with null values
df = df.dropna()
df.isnull().values.sum()

0

In [9]:
# Change the dtype of of columns with float dtype to int64
df['marstat'] = df['marstat'].astype(int)
df['newsint'] = df['newsint'].astype(int)
df['approval_pres'] = df['approval_pres'].astype(int)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46043 entries, 0 to 59999
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   st             46043 non-null  object
 1   gender         46043 non-null  int64 
 2   age            46043 non-null  int64 
 3   educ           46043 non-null  int64 
 4   race           46043 non-null  int64 
 5   faminc         46043 non-null  object
 6   employ         46043 non-null  object
 7   ownhome        46043 non-null  object
 8   marstat        46043 non-null  int64 
 9   newsint        46043 non-null  int64 
 10  approval_pres  46043 non-null  int64 
 11  ideo5          46043 non-null  object
 12  voted_pres_16  46043 non-null  object
dtypes: int64(7), object(6)
memory usage: 4.9+ MB
None


In [10]:
# Binning numerical column 'age'
df['Cat_age'] = pd.qcut(df.age, q = 4, labels = False)
df.head()

Unnamed: 0,st,gender,age,educ,race,faminc,employ,ownhome,marstat,newsint,approval_pres,ideo5,voted_pres_16,Cat_age
0,TX,2,45,3,2,10k - 20k,Unemployed,Rent,2,7,3,Not Sure,Hilary Clinton,1
1,OH,2,58,6,1,150k+,Full-Time,Own,1,3,2,Conservative,Donald Trump,2
2,KY,2,66,6,1,10k - 20k,Retired,Rent,3,1,4,Liberal,Hilary Clinton,3
3,AZ,2,88,2,1,20k - 30k,Retired,Rent,4,2,2,Moderate,Donald Trump,3
4,PA,2,59,6,1,10k - 20k,Permanently Disabled,Rent,3,1,3,Liberal,Donald Trump,2


In [11]:
# Drop the original age column
df = df.drop(['age'], axis = 1)
df.head()

Unnamed: 0,st,gender,educ,race,faminc,employ,ownhome,marstat,newsint,approval_pres,ideo5,voted_pres_16,Cat_age
0,TX,2,3,2,10k - 20k,Unemployed,Rent,2,7,3,Not Sure,Hilary Clinton,1
1,OH,2,6,1,150k+,Full-Time,Own,1,3,2,Conservative,Donald Trump,2
2,KY,2,6,1,10k - 20k,Retired,Rent,3,1,4,Liberal,Hilary Clinton,3
3,AZ,2,2,1,20k - 30k,Retired,Rent,4,2,2,Moderate,Donald Trump,3
4,PA,2,6,1,10k - 20k,Permanently Disabled,Rent,3,1,3,Liberal,Donald Trump,2


In [12]:
# Count the number of unique values in each column
df.nunique()

st               51
gender            2
educ              6
race              8
faminc           13
employ            9
ownhome           3
marstat           6
newsint           5
approval_pres     5
ideo5             6
voted_pres_16     5
Cat_age           4
dtype: int64

In [13]:
# Finding the top categories of 'marsinc'
df.faminc.value_counts().sort_values(ascending = False).head(20)

30k - 40k            4587
Prefer not to say    4501
80k - 100k           4339
50k - 60k            4151
40k - 50k            4080
20k - 30k            3907
70k - 80k            3585
150k+                3377
60k - 70k            3316
100k - 120k          3178
120k - 150k          2832
10k - 20k            2809
Less than 10k        1381
Name: faminc, dtype: int64

In [14]:
# Create new DataFrame with the categorical columns
cat_df = df.select_dtypes(include = ['object']).copy()
cat_df.head()

Unnamed: 0,st,faminc,employ,ownhome,ideo5,voted_pres_16
0,TX,10k - 20k,Unemployed,Rent,Not Sure,Hilary Clinton
1,OH,150k+,Full-Time,Own,Conservative,Donald Trump
2,KY,10k - 20k,Retired,Rent,Liberal,Hilary Clinton
3,AZ,20k - 30k,Retired,Rent,Moderate,Donald Trump
4,PA,10k - 20k,Permanently Disabled,Rent,Liberal,Donald Trump


In [19]:
cat_dummy_df = pd.get_dummies(cat_df, columns = ['st', 'faminc', 'employ', 'ownhome', 'ideo5', 'voted_pres_16']).copy()

In [20]:
cat_dummy_df.head()

Unnamed: 0,st_AK,st_AL,st_AR,st_AZ,st_CA,st_CO,st_CT,st_DC,st_DE,st_FL,...,ideo5_Liberal,ideo5_Moderate,ideo5_Not Sure,ideo5_Very Conservative,ideo5_Very Liberal,voted_pres_16_Did Not Vote,voted_pres_16_Donald Trump,voted_pres_16_Hilary Clinton,voted_pres_16_Not Sure / Don't Recall,voted_pres_16_Other / Someone Else
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [21]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cat_dummy_df))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(cat_dummy_df)
encode_df.head()

ValueError: input_features should have length equal to number of features (87), got 46043