<a href="https://colab.research.google.com/github/Catherine-Nguyen88/project_voting/blob/main/merging_and_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script for merging and data cleaning

In [1]:
# clone from repo
! git clone https://github.com/Catherine-Nguyen88/project_voting

Cloning into 'project_voting'...
remote: Enumerating objects: 190, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (82/82), done.[K
remote: Total 190 (delta 49), reused 45 (delta 18), pack-reused 90[K
Receiving objects: 100% (190/190), 65.56 MiB | 7.88 MiB/s, done.
Resolving deltas: 100% (85/85), done.
Updating files: 100% (65/65), done.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.linear_model import LinearRegression

## Merging for estimates

In [5]:
voting_original

Unnamed: 0.1,Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
0,11161,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,AL GORE,DEMOCRAT,5092,11925,20220315,TOTAL
1,11162,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,6352,11925,20220315,TOTAL
2,11163,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,RALPH NADER,GREEN,220,11925,20220315,TOTAL
3,11164,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,OTHER,OTHER,261,11925,20220315,TOTAL
4,11165,2000,VIRGINIA,VA,ALBEMARLE,51003,US PRESIDENT,AL GORE,DEMOCRAT,16255,36846,20220315,TOTAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3731,71966,2020,VIRGINIA,VA,WINCHESTER CITY,51840,US PRESIDENT,OTHER,OTHER,24,12113,20220315,ELECTION DAY
3732,71967,2020,VIRGINIA,VA,WINCHESTER CITY,51840,US PRESIDENT,OTHER,OTHER,0,12113,20220315,PROVISIONAL
3733,71968,2020,VIRGINIA,VA,WINCHESTER CITY,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2825,12113,20220315,ABSENTEE
3734,71969,2020,VIRGINIA,VA,WINCHESTER CITY,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2370,12113,20220315,ELECTION DAY


In [33]:
voting_original = pd.read_csv('./project_voting/data/voting_VA.csv')
voting_original.head(5)

# modify voting CSV
def match_county_name(county_name):
  # if county_name.endswith('CITY'):
  #   capitalized_county_name = ' '.join(word.capitalize() if word.lower() != 'city' else word.lower() for word in county_name.split())
  #   return capitalized_county_name
  # else:
  #   return county_name.capitalize() + ' County'
  return county_name.replace(" COUNTY", "").replace(" CITY", "").title()

voting_df = voting_original.copy()
voting_df['county_name'] = voting_df['county_name'].apply(lambda x: match_county_name(x))
voting_df.head()

# now, for each county, get candidatevotes/totalvotes
voting_df['fractionalvotes'] = voting_df['candidatevotes']/voting_df['totalvotes']

# split voting data into separate years for merging
voting_2000 = voting_df[voting_df['year']==2000]
print(f'Years for voting_2000 {voting_2000["year"].unique()}')
voting_2004 = voting_df[voting_df['year']==2004]
print(f'Years for voting_2004 {voting_2004["year"].unique()}')
voting_2008 = voting_df[voting_df['year']==2008]
print(f'Years for voting_2008 {voting_2008["year"].unique()}')
voting_2012 = voting_df[voting_df['year']==2012]
print(f'Years for voting_2012 {voting_2012["year"].unique()}')
voting_2016 = voting_df[voting_df['year']==2016]
print(f'Years for voting_2016 {voting_2016["year"].unique()}')
voting_2020 = voting_df[voting_df['year']==2020]
print(f'Years for voting_2020 {voting_2020["year"].unique()}')

# looks correct

Years for voting_2000 [2000]
Years for voting_2004 [2004]
Years for voting_2008 [2008]
Years for voting_2012 [2012]
Years for voting_2016 [2016]
Years for voting_2020 [2020]


In [34]:
voting_2020

Unnamed: 0.1,Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode,fractionalvotes
2144,70379,2020,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,5495,16962,20220315,ABSENTEE,0.323959
2145,70380,2020,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,2072,16962,20220315,ELECTION DAY,0.122155
2146,70381,2020,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,11,16962,20220315,PROVISIONAL,0.000649
2147,70382,2020,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,66,16962,20220315,ABSENTEE,0.003891
2148,70383,2020,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,122,16962,20220315,ELECTION DAY,0.007193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3731,71966,2020,VIRGINIA,VA,Winchester,51840,US PRESIDENT,OTHER,OTHER,24,12113,20220315,ELECTION DAY,0.001981
3732,71967,2020,VIRGINIA,VA,Winchester,51840,US PRESIDENT,OTHER,OTHER,0,12113,20220315,PROVISIONAL,0.000000
3733,71968,2020,VIRGINIA,VA,Winchester,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2825,12113,20220315,ABSENTEE,0.233221
3734,71969,2020,VIRGINIA,VA,Winchester,51840,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,2370,12113,20220315,ELECTION DAY,0.195658


#### 2000 election

In [None]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds176_20105_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})

# merge the datasets
merged_2000 = voting_2000.merge(dem_VA, on='county_name', how='left')
merged_2000.head(5)

(0, 276)


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,JSDE003,JSDE004,JSDE005,JSDE006,JSDE007,JSDE008,JSDE009,JSDE010,JS5E001,JTIE001
0,11161,2000,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,AL GORE,DEMOCRAT,5092,...,,,,,,,,,,
1,11162,2000,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,6352,...,,,,,,,,,,
2,11163,2000,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,RALPH NADER,GREEN,220,...,,,,,,,,,,
3,11164,2000,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,OTHER,OTHER,261,...,,,,,,,,,,
4,11165,2000,VIRGINIA,VA,Albemarle County,51003,US PRESIDENT,AL GORE,DEMOCRAT,16255,...,,,,,,,,,,


#### 2004 election

In [None]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds176_20105_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})

# merge the datasets
merged_2004 = voting_2004.merge(dem_VA, on='county_name', how='left')
merged_2004.head(5)

(0, 276)


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,JSDE003,JSDE004,JSDE005,JSDE006,JSDE007,JSDE008,JSDE009,JSDE010,JS5E001,JTIE001
0,20838,2004,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,JOHN KERRY,DEMOCRAT,5518,...,,,,,,,,,,
1,20839,2004,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,7726,...,,,,,,,,,,
2,20840,2004,VIRGINIA,VA,Accomack County,51001,US PRESIDENT,OTHER,OTHER,112,...,,,,,,,,,,
3,20841,2004,VIRGINIA,VA,Albemarle County,51003,US PRESIDENT,JOHN KERRY,DEMOCRAT,22088,...,,,,,,,,,,
4,20842,2004,VIRGINIA,VA,Albemarle County,51003,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,21189,...,,,,,,,,,,


#### 2008 election

In [16]:
print(dem_VA.columns)

Index(['Unnamed: 0', 'GISJOIN', 'YEAR', 'STUSAB', 'REGIONA', 'DIVISIONA',
       'STATE', 'STATEA', 'county_name', 'COUNTYA',
       ...
       'QXSE007', 'QX6E001', 'QX7E001', 'QX7E002', 'QX7E003', 'QX8E001',
       'QX8E002', 'QX8E003', 'QZTE001', 'QZ6E001'],
      dtype='object', length=191)


In [35]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds191_20125_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
# dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})

def match_county_name(county_name):
  return county_name.replace(" County", "").replace(" city", "")

dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.head()

# merge the datasets
merged_2008 = voting_2008.merge(dem_VA, on='county_name', how='left')
merged_2008

(134, 191)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,QXSE007,QX6E001,QX7E001,QX7E002,QX7E003,QX8E001,QX8E002,QX8E003,QZTE001,QZ6E001
0,30189,2008,VIRGINIA,VA,Accomack,51001,US PRESIDENT,BARACK OBAMA,DEMOCRAT,7607,...,10973,21017,21017,14286,6731,14286,10070,4216,741,153800
1,30190,2008,VIRGINIA,VA,Accomack,51001,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,7833,...,10973,21017,21017,14286,6731,14286,10070,4216,741,153800
2,30191,2008,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,183,...,10973,21017,21017,14286,6731,14286,10070,4216,741,153800
3,30192,2008,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,BARACK OBAMA,DEMOCRAT,29792,...,30576,42332,42332,37549,4783,37549,24648,12901,1080,332400
4,30193,2008,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,20576,...,30576,42332,42332,37549,4783,37549,24648,12901,1080,332400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,30586,2008,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,2353,...,6740,5161,5161,4281,880,4281,1859,2422,1064,326200
428,30587,2008,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,OTHER,OTHER,106,...,6740,5161,5161,4281,880,4281,1859,2422,1064,326200
429,30588,2008,VIRGINIA,VA,Winchester,51840,US PRESIDENT,BARACK OBAMA,DEMOCRAT,5268,...,7096,11866,11866,10454,1412,10454,5171,5283,880,241900
430,30589,2008,VIRGINIA,VA,Winchester,51840,US PRESIDENT,JOHN MCCAIN,REPUBLICAN,4725,...,7096,11866,11866,10454,1412,10454,5171,5283,880,241900


#### 2012 election

In [43]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds206_20145_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
# dem_VA = dem_VA.rename(columns={'COUNTY':'county_name'})
def match_county_name(county_name):
  return county_name.replace(" County", "").replace(" city", "").title()

dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.head()

# merge the datasets
merged_2012 = voting_2012.merge(dem_VA, on='county_name', how='left')
merged_2012

(133, 192)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,ABGVE001,ABGWE001,ABGWE002,ABGWE003,ABGXE001,ABGXE002,ABGXE003,ABIHE001,ABIOE001,ABITE001
0,39540,2012,VIRGINIA,VA,Accomack,51001,US PRESIDENT,BARACK OBAMA,DEMOCRAT,7655,...,21054,21054,14289,6765,14289,10053,4236,715,26.2,152500
1,39541,2012,VIRGINIA,VA,Accomack,51001,US PRESIDENT,MITT ROMNEY,REPUBLICAN,8213,...,21054,21054,14289,6765,14289,10053,4236,715,26.2,152500
2,39542,2012,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,183,...,21054,21054,14289,6765,14289,10053,4236,715,26.2,152500
3,39543,2012,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,BARACK OBAMA,DEMOCRAT,29757,...,43128,43128,38537,4591,38537,25135,13402,1115,30.0,317300
4,39544,2012,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,MITT ROMNEY,REPUBLICAN,23297,...,43128,43128,38537,4591,38537,25135,13402,1115,30.0,317300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,39937,2012,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,MITT ROMNEY,REPUBLICAN,2682,...,4951,4951,4365,586,4365,2023,2342,1063,33.8,305000
422,39938,2012,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,OTHER,OTHER,163,...,4951,4951,4365,586,4365,2023,2342,1063,33.8,305000
423,39939,2012,VIRGINIA,VA,Winchester,51840,US PRESIDENT,BARACK OBAMA,DEMOCRAT,5094,...,11913,11913,10692,1221,10692,5095,5597,919,34.6,219700
424,39940,2012,VIRGINIA,VA,Winchester,51840,US PRESIDENT,MITT ROMNEY,REPUBLICAN,4946,...,11913,11913,10692,1221,10692,5095,5597,919,34.6,219700


#### 2016

In [56]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds225_20165_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
def match_county_name(county_name):
  return county_name.replace(" County", "").replace(" city", "").title()

dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.head()

# merge the datasets
merged_2016 = voting_2016.merge(dem_VA, on='county_name', how='left')
merged_2016

(133, 187)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,AF67E003,AF67E004,AF67E005,AF67E006,AF67E007,AF7PE001,AF7PE002,AF7PE003,AF89E001,AF9LE001
0,48891,2016,VIRGINIA,VA,Accomack,51001,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,6740,...,15206,14137,1069,149,11575,13819,9605,4214,749,151900
1,48892,2016,VIRGINIA,VA,Accomack,51001,US PRESIDENT,DONALD TRUMP,REPUBLICAN,8583,...,15206,14137,1069,149,11575,13819,9605,4214,749,151900
2,48893,2016,VIRGINIA,VA,Accomack,51001,US PRESIDENT,OTHER,OTHER,495,...,15206,14137,1069,149,11575,13819,9605,4214,749,151900
3,48894,2016,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,33345,...,51563,49890,1673,354,33063,39431,25584,13847,1156,317300
4,48895,2016,VIRGINIA,VA,Albemarle,51003,US PRESIDENT,DONALD TRUMP,REPUBLICAN,19259,...,51563,49890,1673,354,33063,39431,25584,13847,1156,317300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,49288,2016,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,DONALD TRUMP,REPUBLICAN,1925,...,6663,6058,605,45,6819,4627,2157,2470,1093,319500
422,49289,2016,VIRGINIA,VA,Williamsburg,51830,US PRESIDENT,OTHER,OTHER,495,...,6663,6058,605,45,6819,4627,2157,2470,1093,319500
423,49290,2016,VIRGINIA,VA,Winchester,51840,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,5164,...,13506,12814,692,56,8205,10596,4822,5774,937,218600
424,49291,2016,VIRGINIA,VA,Winchester,51840,US PRESIDENT,DONALD TRUMP,REPUBLICAN,4790,...,13506,12814,692,56,8205,10596,4822,5774,937,218600


#### 2020 election

In [61]:
# extract demographic data
fname = './project_voting/data/county_data/0002_ds249_20205_county_E.csv'
dem = pd.read_csv(fname)
# need to drop the 1st row
dem = dem.drop([0])
dem_VA = dem[dem['STATE'] == 'Virginia']
print(dem_VA.shape) # the shape is correct for VA
# rename for merging
def match_county_name(county_name):
  return county_name.replace(" County", "").replace(" city", "").title()

dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))
dem_VA.head()

# merge the datasets
merged_2020 = voting_2020.merge(dem_VA, on='county_name', how='left')
merged_2020

rows_with_nan = merged_2020[merged_2020['AMWSE004'].isna()]
rows_with_nan

  dem = pd.read_csv(fname)


(133, 993)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dem_VA['county_name'] = dem_VA['COUNTY'].apply(lambda x: match_county_name(x))


Unnamed: 0,Unnamed: 0_x,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,...,AMWSE004,AMWSE005,AMWSE006,AMWSE007,AMWSE008,AMWSE009,AMWSE010,AMWSE011,AMWSE012,AMWSE013
216,70595,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,2024,...,,,,,,,,,,
217,70596,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,592,...,,,,,,,,,,
218,70597,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,8,...,,,,,,,,,,
219,70598,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,21,...,,,,,,,,,,
220,70599,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,33,...,,,,,,,,,,
221,70600,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,0,...,,,,,,,,,,
222,70601,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,OTHER,OTHER,2,...,,,,,,,,,,
223,70602,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,OTHER,OTHER,0,...,,,,,,,,,,
224,70603,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,OTHER,OTHER,0,...,,,,,,,,,,
225,70604,2020,VIRGINIA,VA,Charles,51036,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,811,...,,,,,,,,,,


## Fix table names for dfs

In [62]:
names2010 = pd.read_csv('./project_voting/ds17620105Ecodebook.csv')
names2010['NHGIS'] = names2010['NHGIS'].str.rstrip(':')
names2010['NHGIS'] = names2010['NHGIS'].str.lstrip()
names2010['Name'] = names2010['Name'].str.lstrip()
names2010_dict = names2010.set_index('NHGIS')['Name'].to_dict()

names2012 = pd.read_csv('./project_voting/ds19120125Ecodebook.csv')
names2012['NHGIS'] = names2012['NHGIS'].str.rstrip(':')
names2012['NHGIS'] = names2012['NHGIS'].str.lstrip()
names2012['Name'] = names2012['Name'].str.lstrip()
names2012_dict = names2012.set_index('NHGIS')['Name'].to_dict()

names2014 = pd.read_csv('./project_voting/ds20620145Ecodebook.csv')
names2014['NHGIS'] = names2014['NHGIS'].str.rstrip(':')
names2014['NHGIS'] = names2014['NHGIS'].str.lstrip()
names2014['Name'] = names2014['Name'].str.lstrip()
names2014_dict = names2014.set_index('NHGIS')['Name'].to_dict()

names2016 = pd.read_csv('./project_voting/ds22520165Ecodebook.csv')
names2016['NHGIS'] = names2016['NHGIS'].str.rstrip(':')
names2016['NHGIS'] = names2016['NHGIS'].str.lstrip()
names2016['Name'] = names2016['Name'].str.lstrip()
names2016_dict = names2016.set_index('NHGIS')['Name'].to_dict()

names2020 = pd.read_csv('./project_voting/ds24920205Ecodebook.csv')
names2020['NHGIS'] = names2020['NHGIS'].str.rstrip(':')
names2020['NHGIS'] = names2020['NHGIS'].str.lstrip()
names2020['Name'] = names2020['Name'].str.lstrip()
names2020_dict = names2020.set_index('NHGIS')['Name'].to_dict()

In [None]:
varnames = pd.read_csv('./project_voting/variablestouse.csv')
varnames['Name'] = varnames['Name'].str.lstrip()
varsuse = varnames['Name']
varsuse.shape
varsuse = [str(x) for x in varsuse]
varsuse

In [27]:
votingnames = ['year','state','state_po','county_name','county_fips','office','candidate','party','candidatevotes','totalvotes']

In [None]:
varsusefinal = votingnames + varsuse
varsusefinal

In [73]:
merged_2008.rename(columns=names2012_dict,inplace=True)
merged_2008
merged_2008=merged_2008[varsusefinal]

In [74]:
merged_2012.rename(columns=names2014_dict,inplace=True)
merged_2012
merged_2012=merged_2012[varsusefinal]

In [75]:
merged_2016.rename(columns=names2016_dict,inplace=True)
merged_2016
merged_2016=merged_2016[varsusefinal]

In [98]:
merged_2016.columns

Index(['year', 'state', 'state_po', 'county_name', 'county_fips', 'office',
       'candidate', 'party', 'candidatevotes', 'totalvotes',
       'Not Hispanic or Latino: White alone',
       'Not Hispanic or Latino: Black or African American alone',
       'Not Hispanic or Latino: American Indian and Alaska Native alone',
       'Not Hispanic or Latino: Asian alone',
       'Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone',
       'Not Hispanic or Latino: Two or more races', 'Hispanic or Latino',
       'No schooling completed', 'Regular high school diploma',
       'GED or alternative credential', 'Some college, less than 1 year',
       'Some college, 1 or more years, no degree', 'Associate's degree',
       'Bachelor's degree', 'Master's degree', 'Professional school degree',
       'Doctorate degree', 'Under .50', '.50 to .99', '1.00 to 1.24',
       '1.25 to 1.49', '1.50 to 1.84', '1.85 to 1.99', '2.00 and over',
       'Male: 18 and 19 years', 'Male: 20 ye

In [76]:
merged_2020.rename(columns=names2020_dict,inplace=True)
merged_2020
merged_2020=merged_2020[varsusefinal]

In [99]:
merged_2020.columns

Index(['year', 'state', 'state_po', 'county_name', 'county_fips', 'office',
       'candidate', 'party', 'candidatevotes', 'totalvotes',
       'Not Hispanic or Latino: White alone',
       'Not Hispanic or Latino: Black or African American alone',
       'Not Hispanic or Latino: American Indian and Alaska Native alone',
       'Not Hispanic or Latino: Asian alone',
       'Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone',
       'Not Hispanic or Latino: Two or more races', 'Hispanic or Latino',
       'Hispanic or Latino', 'No schooling completed',
       'Regular high school diploma', 'GED or alternative credential',
       'Some college, less than 1 year',
       'Some college, 1 or more years, no degree', 'Associate's degree',
       'Bachelor's degree', 'Master's degree', 'Professional school degree',
       'Doctorate degree', 'Under .50', '.50 to .99', '1.00 to 1.24',
       '1.25 to 1.49', '1.50 to 1.84', '1.85 to 1.99', '2.00 and over',
       'Male: 1