<a href="https://colab.research.google.com/github/Catherine-Nguyen88/project_voting/blob/main/merging_and_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script for merging and data cleaning

In [48]:
# clone from repo
! git clone https://github.com/Catherine-Nguyen88/project_voting

fatal: destination path 'project_voting' already exists and is not an empty directory.


In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.linear_model import LinearRegression

## Merging for estimates

In [50]:
voting_original.columns

Index(['Unnamed: 0', 'year', 'state', 'state_po', 'county_name', 'county_fips',
       'office', 'candidate', 'party', 'candidatevotes', 'totalvotes',
       'version', 'mode'],
      dtype='object')

In [168]:
voting_original = pd.read_csv('./project_voting/data/voting_VA.csv')
voting_original.head(5)

# modify voting CSV
def match_county_name(county_name):
    county_name = county_name.title()

    if county_name.endswith("City"):
        county_name = county_name[:-4] + "city"
    else:
        county_name += " County"

    return county_name

voting_df = voting_original.copy()
voting_df['county_name'] = voting_df['county_name'].apply(lambda x: match_county_name(x))
voting_df.head()

# now, for each county, get candidatevotes/totalvotes
voting_df['fractionalvotes'] = voting_df['candidatevotes']/voting_df['totalvotes']

# split voting data into separate years for merging
voting_2000 = voting_df[voting_df['year']==2000]
print(f'Years for voting_2000 {voting_2000["year"].unique()}')
voting_2004 = voting_df[voting_df['year']==2004]
print(f'Years for voting_2004 {voting_2004["year"].unique()}')
voting_2008 = voting_df[voting_df['year']==2008]
print(f'Years for voting_2008 {voting_2008["year"].unique()}')
voting_2012 = voting_df[voting_df['year']==2012]
print(f'Years for voting_2012 {voting_2012["year"].unique()}')
voting_2016 = voting_df[voting_df['year']==2016]
print(f'Years for voting_2016 {voting_2016["year"].unique()}')
voting_2020 = voting_df[voting_df['year']==2020]
print(f'Years for voting_2020 {voting_2020["year"].unique()}')

# looks correct

Years for voting_2000 [2000]
Years for voting_2004 [2004]
Years for voting_2008 [2008]
Years for voting_2012 [2012]
Years for voting_2016 [2016]
Years for voting_2020 [2020]


#### 2000 election

In [52]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds176_20105_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})

# merge the datasets
merged_2000 = voting_2000.merge(dem_VA, on='county_name', how='left')
merged_2000.head(5)

(0, 276)


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,JSDE003,JSDE004,JSDE005,JSDE006,JSDE007,JSDE008,JSDE009,JSDE010,JS5E001,JTIE001
0,11161,2000,VIRGINIA,VA,Accomack,51001,US PRESIDENT,AL GORE,DEMOCRAT,5092,...,,,,,,,,,,
1,11162,2000,VIRGINIA,VA,Accomack,51001,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,6352,...,,,,,,,,,,
2,11163,2000,VIRGINIA,VA,Accomack,51001,US PRESIDENT,RALPH NADER,GREEN,220,...,,,,,,,,,,
3,11164,2000,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,261,...,,,,,,,,,,
4,11165,2000,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,AL GORE,DEMOCRAT,16255,...,,,,,,,,,,


#### 2004 election

In [53]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds176_20105_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})

# merge the datasets
merged_2004 = voting_2004.merge(dem_VA, on='county_name', how='left')
merged_2004.head(5)

(0, 276)


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,JSDE003,JSDE004,JSDE005,JSDE006,JSDE007,JSDE008,JSDE009,JSDE010,JS5E001,JTIE001
0,20838,2004,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JOHN KERRY,DEMOCRAT,5518,...,,,,,,,,,,
1,20839,2004,VIRGINIA,VA,Accomack,51001,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,7726,...,,,,,,,,,,
2,20840,2004,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,112,...,,,,,,,,,,
3,20841,2004,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,JOHN KERRY,DEMOCRAT,22088,...,,,,,,,,,,
4,20842,2004,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,21189,...,,,,,,,,,,


#### 2008 election

In [54]:
print(dem_VA.columns)

Index(['Unnamed: 0', 'X.2', 'X.1', 'X', 'GISJOIN', 'YEAR', 'STUSAB', 'REGIONA',
       'DIVISIONA', 'STATE',
       ...
       'JSDE003', 'JSDE004', 'JSDE005', 'JSDE006', 'JSDE007', 'JSDE008',
       'JSDE009', 'JSDE010', 'JS5E001', 'JTIE001'],
      dtype='object', length=276)


In [55]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds191_20125_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
# dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})

def match_county_name(county_name):
  return county_name.replace(" County", "").replace(" city", "")

dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.head()

# merge the datasets
merged_2008 = voting_2008.merge(dem_VA, on='county_name', how='left')
merged_2008

(134, 191)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,QXSE007,QX6E001,QX7E001,QX7E002,QX7E003,QX8E001,QX8E002,QX8E003,QZTE001,QZ6E001
0,30189,2008,VIRGINIA,VA,Accomack,51001,US PRESIDENT,BARACK OBAMA,DEMOCRAT,7607,...,10973,21017,21017,14286,6731,14286,10070,4216,741,153800
1,30190,2008,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,7833,...,10973,21017,21017,14286,6731,14286,10070,4216,741,153800
2,30191,2008,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,183,...,10973,21017,21017,14286,6731,14286,10070,4216,741,153800
3,30192,2008,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,BARACK OBAMA,DEMOCRAT,29792,...,30576,42332,42332,37549,4783,37549,24648,12901,1080,332400
4,30193,2008,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,20576,...,30576,42332,42332,37549,4783,37549,24648,12901,1080,332400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,30586,2008,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,2353,...,6740,5161,5161,4281,880,4281,1859,2422,1064,326200
428,30587,2008,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,OTHER,OTHER,106,...,6740,5161,5161,4281,880,4281,1859,2422,1064,326200
429,30588,2008,VIRGINIA,VA,Winchester,51840,US PRESIDENT,BARACK OBAMA,DEMOCRAT,5268,...,7096,11866,11866,10454,1412,10454,5171,5283,880,241900
430,30589,2008,VIRGINIA,VA,Winchester,51840,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,4725,...,7096,11866,11866,10454,1412,10454,5171,5283,880,241900


#### 2012 election

In [56]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds206_20145_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
# dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})
def match_county_name(county_name):
  return county_name.replace(" County", "").replace(" city", "").title()

dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.head()

# merge the datasets
merged_2012 = voting_2012.merge(dem_VA, on='county_name', how='left')
merged_2012

(133, 192)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,ABGVE001,ABGWE001,ABGWE002,ABGWE003,ABGXE001,ABGXE002,ABGXE003,ABIHE001,ABIOE001,ABITE001
0,39540,2012,VIRGINIA,VA,Accomack,51001,US PRESIDENT,BARACK OBAMA,DEMOCRAT,7655,...,21054,21054,14289,6765,14289,10053,4236,715,26.2,152500
1,39541,2012,VIRGINIA,VA,Accomack,51001,US PRESIDENT,MITT ROMNEY,REPUBLICAN,8213,...,21054,21054,14289,6765,14289,10053,4236,715,26.2,152500
2,39542,2012,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,183,...,21054,21054,14289,6765,14289,10053,4236,715,26.2,152500
3,39543,2012,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,BARACK OBAMA,DEMOCRAT,29757,...,43128,43128,38537,4591,38537,25135,13402,1115,30.0,317300
4,39544,2012,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,MITT ROMNEY,REPUBLICAN,23297,...,43128,43128,38537,4591,38537,25135,13402,1115,30.0,317300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,39937,2012,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,MITT ROMNEY,REPUBLICAN,2682,...,4951,4951,4365,586,4365,2023,2342,1063,33.8,305000
422,39938,2012,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,OTHER,OTHER,163,...,4951,4951,4365,586,4365,2023,2342,1063,33.8,305000
423,39939,2012,VIRGINIA,VA,Winchester,51840,US PRESIDENT,BARACK OBAMA,DEMOCRAT,5094,...,11913,11913,10692,1221,10692,5095,5597,919,34.6,219700
424,39940,2012,VIRGINIA,VA,Winchester,51840,US PRESIDENT,MITT ROMNEY,REPUBLICAN,4946,...,11913,11913,10692,1221,10692,5095,5597,919,34.6,219700


#### 2016

In [57]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds225_20165_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
def match_county_name(county_name):
  return county_name.replace(" County", "").replace(" city", "").title()

dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.head()

# merge the datasets
merged_2016 = voting_2016.merge(dem_VA, on='county_name', how='left')
merged_2016

(133, 187)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,AF67E003,AF67E004,AF67E005,AF67E006,AF67E007,AF7PE001,AF7PE002,AF7PE003,AF89E001,AF9LE001
0,48891,2016,VIRGINIA,VA,Accomack,51001,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,6740,...,15206,14137,1069,149,11575,13819,9605,4214,749,151900
1,48892,2016,VIRGINIA,VA,Accomack,51001,US PRESIDENT,DONALD TRUMP,REPUBLICAN,8583,...,15206,14137,1069,149,11575,13819,9605,4214,749,151900
2,48893,2016,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,495,...,15206,14137,1069,149,11575,13819,9605,4214,749,151900
3,48894,2016,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,33345,...,51563,49890,1673,354,33063,39431,25584,13847,1156,317300
4,48895,2016,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,DONALD TRUMP,REPUBLICAN,19259,...,51563,49890,1673,354,33063,39431,25584,13847,1156,317300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,49288,2016,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,DONALD TRUMP,REPUBLICAN,1925,...,6663,6058,605,45,6819,4627,2157,2470,1093,319500
422,49289,2016,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,OTHER,OTHER,495,...,6663,6058,605,45,6819,4627,2157,2470,1093,319500
423,49290,2016,VIRGINIA,VA,Winchester,51840,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,5164,...,13506,12814,692,56,8205,10596,4822,5774,937,218600
424,49291,2016,VIRGINIA,VA,Winchester,51840,US PRESIDENT,DONALD TRUMP,REPUBLICAN,4790,...,13506,12814,692,56,8205,10596,4822,5774,937,218600


#### 2020 election

In [161]:
county_counts = dem_VA['county_name'].value_counts().reset_index()
county_counts.columns = ['county_name', 'row_count']
county_counts

Unnamed: 0,county_name,row_count
0,Franklin,2
1,Richmond,2
2,Fairfax,2
3,Roanoke,2
4,Surry,1
...,...,...
124,Greensville,1
125,Greene,1
126,Grayson,1
127,Goochland,1


In [170]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds249_20205_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
# def match_county_name(county_name):
#   return county_name.replace(" County", "").replace(" city", "").title()

# dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.rename(columns={'COUNTY': 'county_name'}, inplace=True)
dem_VA.head()

# merge the datasets
merged_2020 = voting_2020.merge(dem_VA, on='county_name', how='left')
merged_2020

# rows_with_nan = merged_2020[merged_2020['AMWSE004'].isna()]
# rows_with_nan

  dem = pd.read_csv(fname)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA.rename(columns={'COUNTY': 'county_name'}, inplace=True)


(133, 993)


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,AMWSE004,AMWSE005,AMWSE006,AMWSE007,AMWSE008,AMWSE009,AMWSE010,AMWSE011,AMWSE012,AMWSE013
0,70379,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,5495,...,9929,8310,2429,6248,1300,1444,102,10,414,3225
1,70380,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,2072,...,9929,8310,2429,6248,1300,1444,102,10,414,3225
2,70381,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,11,...,9929,8310,2429,6248,1300,1444,102,10,414,3225
3,70382,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,66,...,9929,8310,2429,6248,1300,1444,102,10,414,3225
4,70383,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,122,...,9929,8310,2429,6248,1300,1444,102,10,414,3225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,71966,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,OTHER,OTHER,24,...,8464,7150,1061,7067,1158,445,77,0,411,1786
1588,71967,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,OTHER,OTHER,0,...,8464,7150,1061,7067,1158,445,77,0,411,1786
1589,71968,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2825,...,8464,7150,1061,7067,1158,445,77,0,411,1786
1590,71969,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2370,...,8464,7150,1061,7067,1158,445,77,0,411,1786


In [171]:
voting_2020['county_name'].unique()

array(['Accomack County', 'Albemarle County', 'Alleghany County',
       'Amelia County', 'Amherst County', 'Appomattox County',
       'Arlington County', 'Augusta County', 'Bath County',
       'Bedford County', 'Bland County', 'Botetourt County',
       'Brunswick County', 'Buchanan County', 'Buckingham County',
       'Campbell County', 'Caroline County', 'Carroll County',
       'Charles city', 'Charlotte County', 'Chesterfield County',
       'Clarke County', 'Craig County', 'Culpeper County',
       'Cumberland County', 'Dickenson County', 'Dinwiddie County',
       'Essex County', 'Fairfax County', 'Fauquier County',
       'Floyd County', 'Fluvanna County', 'Franklin County',
       'Frederick County', 'Giles County', 'Gloucester County',
       'Goochland County', 'Grayson County', 'Greene County',
       'Greensville County', 'Halifax County', 'Hanover County',
       'Henrico County', 'Henry County', 'Highland County',
       'Isle Of Wight County', 'James city', 'King And 

In [172]:
merged_2020['county_name'].unique()

array(['Accomack County', 'Albemarle County', 'Alleghany County',
       'Amelia County', 'Amherst County', 'Appomattox County',
       'Arlington County', 'Augusta County', 'Bath County',
       'Bedford County', 'Bland County', 'Botetourt County',
       'Brunswick County', 'Buchanan County', 'Buckingham County',
       'Campbell County', 'Caroline County', 'Carroll County',
       'Charles city', 'Charlotte County', 'Chesterfield County',
       'Clarke County', 'Craig County', 'Culpeper County',
       'Cumberland County', 'Dickenson County', 'Dinwiddie County',
       'Essex County', 'Fairfax County', 'Fauquier County',
       'Floyd County', 'Fluvanna County', 'Franklin County',
       'Frederick County', 'Giles County', 'Gloucester County',
       'Goochland County', 'Grayson County', 'Greene County',
       'Greensville County', 'Halifax County', 'Hanover County',
       'Henrico County', 'Henry County', 'Highland County',
       'Isle Of Wight County', 'James city', 'King And 

## Fix table names for dfs

In [128]:
names2010 = pd.read_csv('./project_voting/ds17620105Ecodebook.csv')
names2010['NHGIS'] = names2010['NHGIS'].str.rstrip(':')
names2010['NHGIS'] = names2010['NHGIS'].str.lstrip()
names2010['Name'] = names2010['Name'].str.lstrip()
names2010_dict = names2010.set_index('NHGIS')['Name'].to_dict()

names2012 = pd.read_csv('./project_voting/ds19120125Ecodebook.csv')
names2012['NHGIS'] = names2012['NHGIS'].str.rstrip(':')
names2012['NHGIS'] = names2012['NHGIS'].str.lstrip()
names2012['Name'] = names2012['Name'].str.lstrip()
names2012_dict = names2012.set_index('NHGIS')['Name'].to_dict()

names2014 = pd.read_csv('./project_voting/ds20620145Ecodebook.csv')
names2014['NHGIS'] = names2014['NHGIS'].str.rstrip(':')
names2014['NHGIS'] = names2014['NHGIS'].str.lstrip()
names2014['Name'] = names2014['Name'].str.lstrip()
names2014_dict = names2014.set_index('NHGIS')['Name'].to_dict()

names2016 = pd.read_csv('./project_voting/ds22520165Ecodebook.csv')
names2016['NHGIS'] = names2016['NHGIS'].str.rstrip(':')
names2016['NHGIS'] = names2016['NHGIS'].str.lstrip()
names2016['Name'] = names2016['Name'].str.lstrip()
names2016_dict = names2016.set_index('NHGIS')['Name'].to_dict()

names2020 = pd.read_csv('./project_voting/ds24920205Ecodebook.csv')
names2020['NHGIS'] = names2020['NHGIS'].str.rstrip(':')
names2020['NHGIS'] = names2020['NHGIS'].str.lstrip()
names2020['Name'] = names2020['Name'].str.lstrip()
names2020_dict = names2020.set_index('NHGIS')['Name'].to_dict()

In [129]:
varnames = pd.read_csv('./project_voting/variablestouse.csv')
varnames['Name'] = varnames['Name'].str.lstrip()
varsuse = varnames['Name']
varsuse.shape
varsuse = [str(x) for x in varsuse]
varsuse

['Not Hispanic or Latino: White alone',
 'Not Hispanic or Latino: Black or African American alone',
 'Not Hispanic or Latino: American Indian and Alaska Native alone',
 'Not Hispanic or Latino: Asian alone',
 'Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone',
 'Not Hispanic or Latino: Two or more races',
 'Hispanic or Latino',
 'No schooling completed',
 'Regular high school diploma',
 'GED or alternative credential',
 'Some college, less than 1 year',
 'Some college, 1 or more years, no degree',
 "Associate's degree",
 "Bachelor's degree",
 "Master's degree",
 'Professional school degree',
 'Doctorate degree',
 'Under .50',
 '.50 to .99',
 '1.00 to 1.24',
 '1.25 to 1.49',
 '1.50 to 1.84',
 '1.85 to 1.99',
 '2.00 and over',
 'Male: 18 and 19 years',
 'Male: 20 years',
 'Male: 21 years',
 'Male: 22 to 24 years',
 'Male: 25 to 29 years',
 'Male: 30 to 34 years',
 'Male: 35 to 39 years',
 'Male: 40 to 44 years',
 'Male: 45 to 49 years',
 'Male: 50 to 54 years',
 '

In [131]:
votingnames = ['year','state','state_po','county_name','county_fips','office','candidate','party','candidatevotes','totalvotes','mode']

In [132]:
varsusefinal = votingnames + varsuse
varsusefinal

['year',
 'state',
 'state_po',
 'county_name',
 'county_fips',
 'office',
 'candidate',
 'party',
 'candidatevotes',
 'totalvotes',
 'mode',
 'Not Hispanic or Latino: White alone',
 'Not Hispanic or Latino: Black or African American alone',
 'Not Hispanic or Latino: American Indian and Alaska Native alone',
 'Not Hispanic or Latino: Asian alone',
 'Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone',
 'Not Hispanic or Latino: Two or more races',
 'Hispanic or Latino',
 'No schooling completed',
 'Regular high school diploma',
 'GED or alternative credential',
 'Some college, less than 1 year',
 'Some college, 1 or more years, no degree',
 "Associate's degree",
 "Bachelor's degree",
 "Master's degree",
 'Professional school degree',
 'Doctorate degree',
 'Under .50',
 '.50 to .99',
 '1.00 to 1.24',
 '1.25 to 1.49',
 '1.50 to 1.84',
 '1.85 to 1.99',
 '2.00 and over',
 'Male: 18 and 19 years',
 'Male: 20 years',
 'Male: 21 years',
 'Male: 22 to 24 years',
 'Male: 25

In [133]:
merged_2008.rename(columns=names2012_dict,inplace=True)
merged_2008
merged_2008=merged_2008[varsusefinal]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_2008.rename(columns=names2012_dict,inplace=True)


In [134]:
merged_2012.rename(columns=names2014_dict,inplace=True)
merged_2012
merged_2012=merged_2012[varsusefinal]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_2012.rename(columns=names2014_dict,inplace=True)


In [135]:
merged_2016.rename(columns=names2016_dict,inplace=True)
merged_2016
merged_2016=merged_2016[varsusefinal]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_2016.rename(columns=names2016_dict,inplace=True)


In [136]:
merged_2016.columns

Index(['year', 'state', 'state_po', 'county_name', 'county_fips', 'office',
       'candidate', 'party', 'candidatevotes', 'totalvotes', 'mode',
       'Not Hispanic or Latino: White alone',
       'Not Hispanic or Latino: Black or African American alone',
       'Not Hispanic or Latino: American Indian and Alaska Native alone',
       'Not Hispanic or Latino: Asian alone',
       'Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone',
       'Not Hispanic or Latino: Two or more races', 'Hispanic or Latino',
       'No schooling completed', 'Regular high school diploma',
       'GED or alternative credential', 'Some college, less than 1 year',
       'Some college, 1 or more years, no degree', 'Associate's degree',
       'Bachelor's degree', 'Master's degree', 'Professional school degree',
       'Doctorate degree', 'Under .50', '.50 to .99', '1.00 to 1.24',
       '1.25 to 1.49', '1.50 to 1.84', '1.85 to 1.99', '2.00 and over',
       'Male: 18 and 19 years', 'Mal

In [173]:
merged_2020.rename(columns=names2020_dict,inplace=True)
merged_2020
merged_2020=merged_2020[varsusefinal]

In [174]:
merged_2020.shape

(1592, 73)

For merged_2020, get the total votes for each party rather than having them split by mode

In [175]:
merged_2020.columns

Index(['year', 'state', 'state_po', 'county_name', 'county_fips', 'office',
       'candidate', 'party', 'candidatevotes', 'totalvotes', 'mode',
       'Not Hispanic or Latino: White alone',
       'Not Hispanic or Latino: Black or African American alone',
       'Not Hispanic or Latino: American Indian and Alaska Native alone',
       'Not Hispanic or Latino: Asian alone',
       'Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone',
       'Not Hispanic or Latino: Two or more races', 'Hispanic or Latino',
       'No schooling completed', 'Regular high school diploma',
       'GED or alternative credential', 'Some college, less than 1 year',
       'Some college, 1 or more years, no degree', 'Associate's degree',
       'Bachelor's degree', 'Master's degree', 'Professional school degree',
       'Doctorate degree', 'Under .50', '.50 to .99', '1.00 to 1.24',
       '1.25 to 1.49', '1.50 to 1.84', '1.85 to 1.99', '2.00 and over',
       'Male: 18 and 19 years', 'Mal

In [176]:
merged_2020

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,...,Female: 50 to 54 years,Female: 55 to 59 years,Female: 60 and 61 years,Female: 62 to 64 years,Female: 65 and 66 years,Female: 67 to 69 years,Female: 70 to 74 years,Female: 75 to 79 years,Female: 80 to 84 years,Female: 85 years and over
0,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,5495,16962,...,892,1743,310,798,575,780,1027,626,540,588
1,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,2072,16962,...,892,1743,310,798,575,780,1027,626,540,588
2,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,11,16962,...,892,1743,310,798,575,780,1027,626,540,588
3,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,66,16962,...,892,1743,310,798,575,780,1027,626,540,588
4,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,122,16962,...,892,1743,310,798,575,780,1027,626,540,588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,OTHER,OTHER,24,12113,...,1074,781,322,532,338,500,677,511,285,338
1588,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,OTHER,OTHER,0,12113,...,1074,781,322,532,338,500,677,511,285,338
1589,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2825,12113,...,1074,781,322,532,338,500,677,511,285,338
1590,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2370,12113,...,1074,781,322,532,338,500,677,511,285,338


In [177]:
filtered_2020 = merged_2020[merged_2020['mode'] == 'ELECTION DAY']
filtered_2020['mode'].unique()

array(['ELECTION DAY'], dtype=object)

In [184]:
filtered_2020

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,...,Female: 50 to 54 years,Female: 55 to 59 years,Female: 60 and 61 years,Female: 62 to 64 years,Female: 65 and 66 years,Female: 67 to 69 years,Female: 70 to 74 years,Female: 75 to 79 years,Female: 80 to 84 years,Female: 85 years and over
1,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,2072,16962,...,892,1743,310,798,575,780,1027,626,540,588
4,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,122,16962,...,892,1743,310,798,575,780,1027,626,540,588
7,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,OTHER,OTHER,12,16962,...,892,1743,310,798,575,780,1027,626,540,588
10,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,6078,16962,...,892,1743,310,798,575,780,1027,626,540,588
13,2020,VIRGINIA,VA,Albemarle County,51003,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,8716,64657,...,3477,4079,1594,2181,1525,2191,2495,2086,1582,1468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1578,2020,VIRGINIA,VA,Williamsburg city,51830,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,615,6890,...,291,367,163,278,64,317,312,251,297,83
1581,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,1466,12113,...,1074,781,322,532,338,500,677,511,285,338
1584,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,103,12113,...,1074,781,322,532,338,500,677,511,285,338
1587,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,OTHER,OTHER,24,12113,...,1074,781,322,532,338,500,677,511,285,338


In [178]:
count_election_day = (merged_2020['mode'] == 'ELECTION DAY').sum()
count_election_day

532

In [179]:
dem_2020 = merged_2020[merged_2020['party'] == 'DEMOCRAT']
rep_2020 = merged_2020[merged_2020['party'] == 'REPUBLICAN']
lib_2020 = merged_2020[merged_2020['party'] == 'LIBERTARIAN']
other_2020 = merged_2020[merged_2020['party'] == 'OTHER']

In [180]:
dem_test=dem_2020.groupby(['county_name'])['candidatevotes'].sum().reset_index()
rep_test=rep_2020.groupby(['county_name'])['candidatevotes'].sum().reset_index()
lib_test=lib_2020.groupby(['county_name'])['candidatevotes'].sum().reset_index()
other_test=other_2020.groupby(['county_name'])['candidatevotes'].sum().reset_index()


In [181]:
print(dem_test.shape)
print(rep_test.shape)
print(lib_test.shape)
print(other_test.shape)

(133, 2)
(133, 2)
(133, 2)
(133, 2)


In [185]:
dfcounts=merged_2020.groupby(['county_name', 'party'])['candidatevotes'].sum().reset_index()
dfcounts['candidatevotes']

0       7578
1        188
2         24
3       9172
4      42466
       ...  
527    11733
528    17683
529      680
530      187
531    20241
Name: candidatevotes, Length: 532, dtype: int64

In [186]:
merged_2020.groupby(['county_name', 'party'])['candidatevotes'].sum().reset_index()
filtered_2020['totalvotes'] = dfcounts['candidatevotes']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_2020['totalvotes'] = dfcounts['candidatevotes']


In [187]:
filtered_2020

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,...,Female: 50 to 54 years,Female: 55 to 59 years,Female: 60 and 61 years,Female: 62 to 64 years,Female: 65 and 66 years,Female: 67 to 69 years,Female: 70 to 74 years,Female: 75 to 79 years,Female: 80 to 84 years,Female: 85 years and over
1,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,2072,188.0,...,892,1743,310,798,575,780,1027,626,540,588
4,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,122,42466.0,...,892,1743,310,798,575,780,1027,626,540,588
7,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,OTHER,OTHER,12,20804.0,...,892,1743,310,798,575,780,1027,626,540,588
10,2020,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,6078,715.0,...,892,1743,310,798,575,780,1027,626,540,588
13,2020,VIRGINIA,VA,Albemarle County,51003,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,8716,89.0,...,3477,4079,1594,2181,1525,2191,2495,2086,1582,1468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1578,2020,VIRGINIA,VA,Williamsburg city,51830,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,615,,...,291,367,163,278,64,317,312,251,297,83
1581,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,1466,,...,1074,781,322,532,338,500,677,511,285,338
1584,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,103,,...,1074,781,322,532,338,500,677,511,285,338
1587,2020,VIRGINIA,VA,Winchester city,51840,US PRESIDENT,OTHER,OTHER,24,,...,1074,781,322,532,338,500,677,511,285,338


In [188]:
# Reset indices
merged_2008.reset_index(drop=True, inplace=True)
merged_2012.reset_index(drop=True, inplace=True)
merged_2016.reset_index(drop=True, inplace=True)
filtered_2020.reset_index(drop=True, inplace=True)

# Combine DataFrames
combined_df = pd.concat([merged_2008, merged_2012, merged_2016, merged_2020], ignore_index=True)

In [189]:
combined_df.columns

Index(['year', 'state', 'state_po', 'county_name', 'county_fips', 'office',
       'candidate', 'party', 'candidatevotes', 'totalvotes', 'mode',
       'Not Hispanic or Latino: White alone',
       'Not Hispanic or Latino: Black or African American alone',
       'Not Hispanic or Latino: American Indian and Alaska Native alone',
       'Not Hispanic or Latino: Asian alone',
       'Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone',
       'Not Hispanic or Latino: Two or more races', 'Hispanic or Latino',
       'No schooling completed', 'Regular high school diploma',
       'GED or alternative credential', 'Some college, less than 1 year',
       'Some college, 1 or more years, no degree', 'Associate's degree',
       'Bachelor's degree', 'Master's degree', 'Professional school degree',
       'Doctorate degree', 'Under .50', '.50 to .99', '1.00 to 1.24',
       '1.25 to 1.49', '1.50 to 1.84', '1.85 to 1.99', '2.00 and over',
       'Male: 18 and 19 years', 'Mal

In [74]:
combined_df
combined_df.to_csv('voting_est.csv', index=False)