-
Notifications
You must be signed in to change notification settings - Fork 0
/
converttowordcount.R
53 lines (50 loc) · 2.34 KB
/
converttowordcount.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# This is based on steps outlined in a
# [blog post by John Victor Anderson](http://johnvictoranderson.org/?p=115).
#
# Step 1: export the raw tweet text and normalise it for word counting.
# Expects a `tweets` object with a `tweettxt` column to exist already.

# Keep writing the plain-text export so the on-disk artefact is unchanged.
write.csv(tweets$tweettxt, 'tweetsastext.txt')
# Take the text straight from the column instead of re-reading the export
# with scan(sep = ","): write.csv() adds a header row and a row-name field to
# every record, and scan() returned those fields as if they were tweet text,
# so the header and "1", "2", ... ended up in the word counts.
tweettext <- as.character(tweets$tweettxt)
# Missing tweets carry no words; drop them rather than passing NA downstream.
tweettext <- tweettext[!is.na(tweettext)]
# Convert all text to lower case so that counting is not case sensitive.
tweettext <- tolower(tweettext)
# Remove double quotes so they do not stick to the neighbouring words.
tweettext <- gsub('"', '', tweettext, fixed = TRUE)
# Replace newline characters with a space so words on adjacent lines
# stay separated.
tweettext <- gsub('\n', ' ', tweettext, fixed = TRUE)
# Unescape HTML entities so that, for example, "&amp;" is counted as "&".
# NOTE(review): htmltools is no longer called by this step; the library() call
# is retained only in case other code relies on the package being attached.
library(htmltools)
# htmltools::htmlEscape() does the opposite of what is needed here (it turns
# "&" into "&amp;"), which is why the earlier version "didn't seem to work".
# Decode the common entities with base R instead. "&amp;" must come last so
# that text such as "&amp;lt;" decodes to "&lt;" rather than all the way to "<".
html_entities <- c(
  "&lt;" = "<",
  "&gt;" = ">",
  "&quot;" = "\"",
  "&#39;" = "'",
  "&amp;" = "&"
)
for (entity in names(html_entities)) {
  tweettext <- gsub(entity, html_entities[[entity]], tweettext, fixed = TRUE)
}
# Remove the helper objects that are no longer needed.
rm(html_entities, entity)
# Turn the tweet text into a frequency table of individual tokens.
# Split on runs of whitespace rather than on a single space: splitting on " "
# produced an empty-string token for every doubled or leading space, and those
# empty strings were then counted as a "word".
tweettext.split <- strsplit(tweettext, "[[:space:]]+")
# Flatten the per-tweet token lists into one character vector. as.character()
# keeps the type stable when there is no text at all (unlist() returns NULL).
tweettextvec <- as.character(unlist(tweettext.split, use.names = FALSE))
# Drop any remaining empty tokens (e.g. from text that starts with whitespace).
tweettextvec <- tweettextvec[nzchar(tweettextvec)]
# Count occurrences of each token. The variable name is kept because table()
# uses it as the dimension label, which becomes a column header on export.
tweettexttable <- table(tweettextvec)
# Remove the intermediate objects that are no longer needed.
rm(tweettext.split, tweettextvec)
# Persist the frequency table to disk as a CSV file.
write.csv(x = tweettexttable, file = "tweettexttable.csv")
# Load that CSV again so the counts are available as a data frame.
tweetdata <- read.csv(file = "tweettexttable.csv")
# Inspect the data as imported, before the columns are renamed.
summary(tweetdata)
# Rename the columns to index, word and freq.
names(tweetdata) <- c("index", "word", "freq")
# Inspect again to confirm the new column names.
summary(tweetdata)
# Load the tidyverse (used here for the %>% pipe and anti_join()).
library(tidyverse)
# tidytext provides get_stopwords(). If loading it fails because the package
# is not installed, uncomment the next line and run it once.
#install.packages("tidytext")
library(tidytext)
# Remove stop words: keep only the rows of tweetdata whose `word` has no match
# in the list returned by get_stopwords(), and store them in a new object.
# The join key is spelled out so the match cannot silently change if either
# table gains another shared column name, and so dplyr does not have to guess
# (and announce) the key.
# NOTE(review): punctuation is only stripped after this step, so a token such
# as "the," is not recognised as the stop word "the" here.
cleaned_tweetdata <- tweetdata %>%
  anti_join(get_stopwords(), by = "word")
# Add a `wordnopunc` column: a copy of `word` with commas, hyphens, exclamation
# marks, single quotes, double quotes, full stops and question marks removed.
# All seven characters are deleted in one pass with a bracket expression.
# Inside the brackets "." and "?" are literal, so they need no backslash, and
# the hyphen is listed first so that it is not read as a range.
cleaned_tweetdata$wordnopunc <- gsub(
  "[-,!'\".?]",
  "",
  cleaned_tweetdata$word
)
# Drop a `word2` column if one exists. Nothing in the visible script creates
# such a column, in which case this statement has no effect.
cleaned_tweetdata$word2 <- NULL