/
SenatePolls.Rmd
81 lines (69 loc) · 2.6 KB
/
SenatePolls.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
---
title: "SenatePolls"
author: "Colin Wu"
date: "10/20/2020"
output:
  html_document:
    toc: yes
    toc_float: yes
    code_folding: hide
---
```{r setup, include=FALSE}
# Echo chunk source in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
# rvest: read_html(), html_nodes() and html_table() for scraping primary dates.
library(rvest)
# dplyr: filter/mutate/select/rename and the joins used on the polls.
library(dplyr)
# tibble: as_tibble() for the polls data frame.
library(tibble)
```
```{r}
# Load the Senate polling export, print it as read, then keep only
# general-election rows for Republican and Democratic candidates.
polls <- read.csv("senate_polls - 10-27-2020.csv") %>%
  as_tibble()
print(polls)
polls <- polls %>%
  filter(stage == "general" & candidate_party %in% c("REP", "DEM"))
# Scrape the 2020 Senate primary dates from Ballotpedia into `primary_table`,
# a two-column table: `state` and `Primary.date` (Date).
primary_site <- read_html("https://ballotpedia.org/United_States_Senate_elections,_2020")
primary_tables <- html_nodes(primary_site, "table")
# The primary-date table is addressed by its position on the page, so fail
# with a clear message if the page no longer has that many tables.
if (length(primary_tables) < 20) {
  stop(
    "Expected at least 20 tables on the Ballotpedia page, found ",
    length(primary_tables),
    call. = FALSE
  )
}
primary_table <- html_table(primary_tables[[20]], fill = TRUE)
# The first scraped row holds the real column headers.
colnames(primary_table) <- make.names(primary_table[1, ])
# Drop the header row and keep only the first four columns.
primary_table <- primary_table[-1, 1:4]
# Guard against a page layout change before the columns are used below.
required_cols <- c("State", "Primary.date", "Primary.runoff.date")
missing_cols <- setdiff(required_cols, colnames(primary_table))
if (length(missing_cols) > 0) {
  stop(
    "Scraped primary table is missing expected column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
primary_table <- primary_table %>%
  mutate(
    Primary.date = as.Date(Primary.date, format = "%m/%d/%Y"),
    Primary.runoff.date = as.Date(Primary.runoff.date, format = "%m/%d/%Y"),
    # Use the runoff date when one is listed, otherwise the primary date.
    # coalesce() keeps the Date class, unlike ifelse().
    Primary.date = coalesce(Primary.runoff.date, Primary.date)
  ) %>%
  select(state = State, Primary.date)
# Remove polls that started on or before the date held in Primary.date.
# States in primary_table with no polls come through the right join with a
# missing start_date and are dropped by the filter below.
polls <- polls %>%
  right_join(primary_table, by = "state") %>%
  mutate(start_date = as.Date(start_date, format = "%m/%d/%y")) %>%
  filter(Primary.date < start_date)
# Put the DEM and REP percentages for the same poll question on one row.
# Columns shared by both parties' rows, used to pair them up.
poll_keys <- c(
  "poll_id", "question_id", "state", "sample_size", "url", "pollster",
  "methodology", "population", "population_full", "notes", "start_date",
  "created_at"
)
# Republican rows: percentage and candidate renamed, plus the pairing columns.
rep_polls <- polls %>%
  filter(candidate_party == "REP") %>%
  select(
    Rep_Pct = pct, Rep_Candidate = answer, state, poll_id, sample_size,
    question_id, url, candidate_party, pollster, methodology, population,
    population_full, notes, start_date, created_at
  )
# Democratic rows keep every column; only pct and answer are renamed.
dem_polls <- polls %>%
  filter(candidate_party == "DEM") %>%
  rename(Dem_Pct = pct, Dem_Candidate = answer)
# Every Democratic row is kept; Rep_Pct is NA when no Republican row matches.
# NOTE(review): the Dem/Rep percentages are written under the headers "Biden"
# and "Trump" although this is Senate polling - confirm what downstream
# consumers of the CSV expect before renaming.
# NOTE(review): start_date is reformatted to %m/%d/%Y, but end_date is passed
# through exactly as read from the CSV - confirm that is intended.
final_polls <- dem_polls %>%
  left_join(rep_polls, by = poll_keys) %>%
  mutate(start_date = format(start_date, "%m/%d/%Y")) %>%
  select(
    State = state, Poll = pollster, Source = url,
    "Start Date" = start_date, "End Date" = end_date,
    "Sample Size" = sample_size, "Sample Type" = population_full,
    "Biden" = Dem_Pct, "Trump" = Rep_Pct
  )
#Georgia and Arizona have a special Senate election, might remove later
```
# Datasets
```{r}
# Show each intermediate table and the final combined table in the output.
print(primary_table)
print(rep_polls)
print(dem_polls)
print(final_polls)
# Save the combined polls to disk without row names.
write.csv(final_polls, "revised_polls.csv", row.names = FALSE)
```