forked from jfbratt/WebScraping
-
Notifications
You must be signed in to change notification settings - Fork 3
/
ECscraping.R
135 lines (115 loc) · 4.22 KB
/
ECscraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
library(rvest)
library(magrittr)
library(httr)
library(stringr)
library(XML)
library(selectr)
library(RCurl)
library(jsonlite)
# Load the spreadsheet of links. read.csv() already assumes a header row and
# a comma separator and returns a data frame, so only the string handling
# needs to be stated explicitly.
url <- read.csv("EC2014.csv", stringsAsFactors = FALSE)
# Number of rows in the spreadsheet
length <- nrow(url)
# Both the check list and the error list start out empty (NULL)
checkList <- NULL
errorList <- NULL
# Scrape the main post text for every link in column 5 of `url`.
#
# For each row i the page is requested once and the CSS selectors below are
# tried in order; the text of the first element that matches is written to
# "<i>.txt" in the working directory. Links where no selector matches are
# appended to `checkList`; links whose request or parsing fails are reported
# on the console and appended to `errorList`.

# Selectors in priority order. The names are labels for the reader only.
postSelectors <- c(
  "WordPress blogs"             = ".entry-content",
  "Blogger blogs"               = ".post-body",
  "PlayThePast blog"            = ".post",
  "Beth Nowviskie blog"         = ".content",
  "Medium blog"                 = ".graf--p",      # flagged "Not Working.." by the author
  "Roger Whitson blog"          = ".panel-body",
  "John Resig blog"             = ".postentry",
  "Blog (.post-content)"        = ".post-content",
  "Code4Library blog"           = ".entry",
  "Trevor Munoz blog"           = ".entry-wrapper",
  "app.simplenote.com"          = ".note",
  "Digital Media and Learning"  = ".primary",
  # NOTE(review): this selector was looked up but never acted on before; it is
  # now tried last, so it only affects pages that previously went to checkList.
  "NPR"                         = ".storytext"
)

# seq_len() gives an empty sequence for an empty spreadsheet, where 1:n would
# iterate over c(1, 0).
for (i in seq_len(nrow(url))) {
  ID <- paste0(i, ".txt") # Output file name for this row
  pageUrl <- url[i, 5]

  outcome <- tryCatch({
    # One request per link: its status replaces the separate success probe,
    # and the extra GET whose result was discarded is dropped.
    response <- httr::GET(pageUrl)
    # NOTE(review): HTTP 4xx/5xx responses are now reported as errors instead
    # of having the error page scraped - confirm this is the desired behaviour.
    httr::stop_for_status(response)
    page <- read_html(response) # html() is deprecated in favour of read_html()

    # Count matches with html_nodes() rather than testing html_node() for
    # NULL: html_node() does not signal "no match" by returning NULL, so the
    # NULL test could never select a fallback selector.
    postText <- NULL
    for (selector in postSelectors) {
      matches <- html_nodes(page, selector)
      if (length(matches) > 0) {
        postText <- html_text(matches[[1]])
        break
      }
    }

    if (is.null(postText)) {
      "check"
    } else {
      write(postText, file = ID, ncolumns = 1, append = FALSE)
      cat("Printing ID:", ID, "\n")
      "saved"
    }
  }, error = function(e) {
    cat("ERROR :", conditionMessage(e), "\n")
    "error"
  })

  # The lists are updated out here, not inside the handler: an assignment
  # inside the handler function would only create a local variable.
  if (identical(outcome, "check")) {
    checkList <- append(checkList, pageUrl)
    cat("CheckList - Adding ", ID, "\n")
  } else if (identical(outcome, "error")) {
    errorList <- append(errorList, pageUrl)
  }
}