-
Notifications
You must be signed in to change notification settings - Fork 44
/
read_data.Rmd
241 lines (205 loc) · 10.2 KB
/
read_data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
---
title: "Add_Demos"
author: "Chris Dick"
date: "January 28, 2017"
output:
md_document:
variant: markdown_github
---
# Add Demographic Variables to Registration and Voting Data
The code below is an example of adding demographic data to our voting and registration data. At this point I have only added age by race variables from the 2015 5-year American Community Survey data. However, the code below can be relatively easily adapted to pull anything else from the ACS.
## What you will need to get started:
- A Census API Token, if you don't already have one it is super easy. Go here to request: http://api.census.gov/data/key_signup.html
- To use the below code you will need the updated uselections R package, which can be downloaded here using the devtools package and the command `devtools::install_github("Data4Democracy/election-transparency/r-packages/uselections")`
```{r, results='hide', message=FALSE}
library(dplyr, quietly = TRUE)
library(uselections, quietly = TRUE) # I am pulling the data from the R package, could use data.world
library(knitr, quietly = TRUE)
library(acs, quietly = TRUE)
library(reshape2, quietly = TRUE)
library(stringr, quietly = TRUE)
library(tidyr, quietly = TRUE)
```
## Pulling and cleaning the data
First we need to pull the data from the Census API. The below code pulls data at the County level by age, race, sex, and Hispanic origin as well as employment data.
```{r acs, results='hide', message=FALSE}
api.key.install(key = '') # You will need to enter an API key
geokey <- geo.make(state = "*",
county = "*")
# Set list of attributes you want to pull, you will want to look for the table numbers.
pull_list <- c("B01001A", # Sex by Age: White Alone
"B01001B", # Sex by Age: Black Alone
"B01001C", # Sex by Age: AIAN Alone
"B01001D", # Sex by Age: Asian Alone
"B01001E", # Sex by Age: NHPI Alone
"B01001F", # Sex by Age: Some Other Race Alone
"B01001G", # Sex by Age: Multiple Races
"B01001H", # Sex by Age: White Alone Non-Hispanic
"B01001I", # Sex by Age: Hispanic or Latino
"B23001") # Employment
# Define function to pull data and dump into a data frame
pull <- function(pull){
df <- acs.fetch(endyear = 2015, span = 5, geography = geokey, table.number = pull,
col.name = "pretty")
new_df <- data.frame(paste0(str_pad(df@geography$state, 2, "left", pad="0"),
str_pad(df@geography$county, 3, "left", pad="0")),
df@estimate,
stringsAsFactors = FALSE)
names(new_df)[1] <- 'geoid'
return(new_df)
}
#Apply this function over our list of tables, the result will be a list of DFs
dfs <- lapply(pull_list, pull)
# split out the data into groups that can be processed together:
df_nhwa <- dfs[[8]] # For the demographics, Non-Hispanic White alone must be separate
df_emply <- dfs[[10]] # Employment data
df_educ <- dfs[[11]]
dfs[8] <- dfs[10] <- dfs[11] <- NULL # Rest of the demographic data that can be processed together
demo_frame <- data.frame(NULL)
# Join together all of your tables. I am doing it this way because all of my tables have the same
# dimensions and are measured the same way. If I had tables with different demographic dimensions
# I would want to clean them separately, so group and join your tables accordingly.
for (i in 1:length(dfs)){
if (i == 1){
demo_frame <- dfs[[i]]
} else{
demo_frame <- demo_frame %>% inner_join(dfs[[i]], by = "geoid")
}
}
```
### Race and Hispanic Origin by Age
Now that we have all of our data in on big data frame with a ton of columns, I want to work on getting something more usable. For these age, sex, race, Hispanic origin data I will give us the component parts to be able to calculate other important variables. Accordingly, I leave the data separated by race / Hispanic origin, but group age into 2 categories:
1. Under 18 Years Old
2. Over 18 Years Old (Voting Age Population)
We may want to make different decisions in the future, but I think this is a good start.
```{r clean, results='hide', message=FALSE}
# Start with the data set with everything but the Non-Hispanic White Alone data
demo_frame1 <- demo_frame %>%
melt(id = "geoid") %>%
separate(variable, into = c("variable", "race", "sex", "age"), fill = "right", sep = "\\.\\.") %>%
select(-2) %>%
filter(!is.na(age)) %>%
dcast(geoid + race + sex ~ age, value.var = "value") %>%
mutate(under18 = Under.5.years + `5.to.9.years` + `10.to.14.years` + `15.to.17.years`,
votingAge = `18.and.19.years` + `20.to.24.years` + `25.to.29.years` +
`30.to.34.years` + `35.to.44.years` + `45.to.54.years` + `55.to.64.years`
+ `65.to.74.years` + `75.to.84.years` + `85.years.and.over`) %>%
select(-(4:17)) %>%
mutate(sex = ifelse(sex == ".Female", "female", "male"),
race = ifelse(race == "AMERICAN.INDIAN.AND.ALASKA.NATIVE.ALONE", "aian",
ifelse(race == "ASIAN.ALONE", "asian",
ifelse(race == "BLACK.OR.AFRICAN.AMERICAN.ALONE", "black",
ifelse(race == "HISPANIC.OR.LATINO", "hisp",
ifelse(race == "NATIVE.HAWAIIAN.AND.OTHER.PACIFIC.ISLANDER.ALONE", "nhpi",
ifelse(race == "SOME.OTHER.RACE.ALONE", "sor",
ifelse(race == "TWO.OR.MORE.RACES", "multi", "white")))))))) %>%
melt() %>%
dcast(geoid + race + variable ~ sex) %>%
mutate(pop = male + female) %>%
select(-(4:5)) %>%
dcast(geoid ~ race + variable)
# Clean the Non-Hispanic White Alone data
demo_frame2 <- df_nhwa %>%
melt(id = "geoid") %>%
separate(variable, into = c("variable", "race", "race2", "sex", "age"),
fill = "right", sep = "\\.\\.") %>%
select(-2, -4) %>%
filter(!is.na(age)) %>%
dcast(geoid + race + sex ~ age, value.var = "value") %>%
mutate(under18 = Under.5.years + `5.to.9.years` + `10.to.14.years` + `15.to.17.years`,
votingAge = `18.and.19.years` + `20.to.24.years` + `25.to.29.years` +
`30.to.34.years` + `35.to.44.years` + `45.to.54.years` + `55.to.64.years`
+ `65.to.74.years` + `75.to.84.years` + `85.years.and.over`) %>%
select(-(4:17)) %>%
mutate(sex = ifelse(sex == ".Female", "female", "male"),
race = "nhwa") %>%
melt() %>%
dcast(geoid + race + variable ~ sex) %>%
mutate(pop = male + female) %>%
select(-(4:5)) %>%
dcast(geoid ~ race + variable) %>%
inner_join(demo_frame1, by = "geoid") %>%
filter(as.numeric(geoid) < 72000)
```
The race categories are labelled as:
1. nhwa: Non-Hispanic White Alone
2. aian: American Indian or Alaska Native
3. asian: Asian
4. black: Black or African American
5. hisp: Hispanic or Latino (Any Race)
6. multi: 2 or More Races
7. nhpi: Native Hawaiian or other Pacific Islander
8. sor: Some other Race
9. white: White
To get the total population sum the following categories:
* aian
* asian
* black
* multi
* nhpi
* sor
* white
### Employment
```{r employment, results='hide', message=FALSE}
# Start by parsing the variables the same way we did with the race data above
demo_frame3 <- df_emply %>%
melt(id = "geoid") %>%
separate(variable, into = c("variable", "sex", "age", "lf", "occ1", "occ2"),
fill = "right", sep = "\\.\\.")
# Create a file with the total 16+ population for merging at the end
df_checktot <- demo_frame3 %>%
filter(sex == "Total." & is.na(age)) %>%
select(-(2:7)) %>%
dplyr::rename(total_16plus = value)
# Next get the data on number of people in and out of the labor force by age groups
demo_frame3a <- demo_frame3 %>%
filter(!(sex == "Total." & is.na(age)) & !is.na(age) & !is.na(lf) & is.na(occ1)) %>%
select(-2, -6, -7) %>%
dcast(geoid + age ~ sex + lf) %>%
mutate(inLaborForce = Female_In.labor.force. + Male_In.labor.force.,
outLaborForce = Female_Not.in.labor.force + Male_Not.in.labor.force) %>%
select(-(3:6)) %>%
melt() %>%
dcast(geoid + variable ~ age) %>%
dplyr::rename(lm = variable) %>%
mutate(`16to19` = `16.to.19.years`,
`20to24` = `20.and.21.years` + `22.to.24.years`,
`25to54` = `25.to.29.years` + `30.to.34.years` + `35.to.44.years` + `45.to.54.years`,
`55to64` = `55.to.59.years` + `60.and.61.years` + `62.to.64.years`,
`65plus` = `65.to.69.years` + `70.to.74.years` + `75.years.and.over`) %>%
select(-(3:15)) %>%
melt() %>%
dcast(geoid ~ lm + variable)
# Next get the number of people that are employed, employed in the military, and unemployed
demo_frame3b <- demo_frame3 %>%
filter(!(sex == "Total." & is.na(age)) & !is.na(age) & !is.na(lf) & !is.na(occ1)) %>%
mutate(occ2 = ifelse(occ1 == "In.Armed.Forces", "EmployedMilitary", occ2)) %>%
select(-2, -6) %>%
filter(!is.na(occ2)) %>%
dcast(geoid + age + occ2 ~ sex + lf, value.var = 'value') %>%
mutate(inLaborForce = Female_In.labor.force + Male_In.labor.force) %>%
select(-(4:5)) %>%
melt() %>%
dcast(geoid + occ2 + variable ~ age) %>%
dplyr::rename(lm = variable) %>%
mutate(`16to19` = `16.to.19.years`,
`20to24` = `20.and.21.years` + `22.to.24.years`,
`25to54` = `25.to.29.years` + `30.to.34.years` + `35.to.44.years` + `45.to.54.years`,
`55to64` = `55.to.59.years` + `60.and.61.years` + `62.to.64.years`) %>%
select(-(3:13)) %>%
melt() %>%
dcast(geoid ~ occ2 + variable)
# Merge them all together to get the final dataset, add total population by age group
employ_final <- df_checktot %>%
left_join(demo_frame3a, by = "geoid") %>%
left_join(demo_frame3b, by = "geoid") %>%
mutate(total_16to19 = inLaborForce_16to19 + outLaborForce_16to19,
total_20to24 = inLaborForce_20to24 + outLaborForce_20to24,
total_25to54 = inLaborForce_25to54 + outLaborForce_25to54,
total_55to64 = inLaborForce_55to64 + outLaborForce_55to64,
total_65plus = inLaborForce_65plus + outLaborForce_65plus) %>%
select(1:2, 25:29, 3:24)
```
I did not calculate percentages because I am not sure what we want to use. However, it is important to understand what will sum and what will not:
1. inLaborMarket and outLaborMarket will sum to total
2. EmployedMilitary, Employed, and Unemployed will sum to inLaborMarket, except for 65 plus where data are not reported for employment (but is for in and out of the labor market)