forked from jfbratt/WebScraping
-
Notifications
You must be signed in to change notification settings - Fork 3
/
cleantext.R
43 lines (34 loc) · 1 KB
/
cleantext.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
library("readr")
library("dplyr")
library("rvest")
library("stringr")
# Load the exported feed items. read_csv() reads empty cells as NA by default.
data <- read_csv("dhnow-unfiltered-2015-05-19.csv")

# Pick the text to analyse for each entry: the full content when there is
# any, otherwise the summary. Entries that end up with no text are dropped.
data <- data %>%
  mutate(
    content_length = str_length(entry_content),
    # A missing entry_content has NA length. Treat that as "no content" so the
    # summary is used; a bare `content_length > 0` would propagate NA through
    # ifelse() and the entry would be lost even when a summary exists.
    text_to_use = ifelse(
      !is.na(content_length) & content_length > 0,
      entry_content,
      entry_summary
    ),
    text_length = str_length(text_to_use)
  ) %>%
  # Keep only entries that actually have text (NA lengths are dropped too).
  filter(!is.na(text_length), text_length > 0)
#' Convert one scraped entry to plain text.
#'
#' Strips HTML markup when the input contains any, then replaces newlines
#' with spaces and removes the literal phrase "view in full screen".
#'
#' @param dirtytext A single character string; may contain HTML markup.
#' @return The cleaned text. If the markup cannot be parsed, the raw input is
#'   cleaned and returned instead.
cleaning <- function(dirtytext) {
  # Shared post-processing; fixed = TRUE matches both patterns literally.
  tidy <- function(text) {
    text <- gsub("\n", " ", text, fixed = TRUE)
    gsub("view in full screen", "", text, fixed = TRUE)
  }

  # read_html() also accepts a path or URL as a string, so text without any
  # angle brackets is never handed to the parser: there is nothing to strip,
  # and a text that happens to be a URL or file name must not be fetched.
  has_markup <- length(dirtytext) == 1L && !is.na(dirtytext) &&
    grepl("[<>]", dirtytext)
  if (!has_markup) {
    return(tidy(dirtytext))
  }

  # Best effort: fall back to the raw text when parsing fails.
  plain <- tryCatch(
    html_text(read_html(dirtytext)),
    error = function(e) dirtytext
  )
  tidy(plain)
}
# Make sure the output directory exists before writing into it.
if (!dir.exists("inputs")) {
  dir.create("inputs")
}

# Clean every selected text; the result is a list with one element per entry.
clean_texts <- lapply(data$text_to_use, cleaning)

# Write each cleaned text to its own file, inputs/<index>.txt.
for (i in seq_along(clean_texts)) {
  writeLines(clean_texts[[i]], paste0("inputs/", i, ".txt"))
}