knitr::opts_chunk$set(echo = TRUE,
warning = FALSE,
message = FALSE)
library(tidyverse)
library(here)
# For text mining:
library(pdftools)
library(tidytext)
# Install on first use, then attach:
if (!requireNamespace("textdata", quietly = TRUE)) install.packages("textdata")
library(textdata)
if (!requireNamespace("ggwordcloud", quietly = TRUE)) install.packages("ggwordcloud")
library(ggwordcloud)
get_sentiments(lexicon = "nrc")
ipcc_path <- here("data","ipcc_gw_15.pdf")
ipcc_text <- pdf_text(ipcc_path)
ipcc_p9 <- ipcc_text[9]
ipcc_p9
ipcc_df <- data.frame(ipcc_text) %>%
mutate(text_full = str_split(ipcc_text, pattern = '\\n')) %>%
unnest(text_full) %>%
mutate(text_full = str_trim(text_full))
# Why '\\n' instead of '\n'? In a regular expression, some characters (e.g. \, *) must be escaped with a leading \. But \ is also the escape character in R strings, so to write the two-character regex \n you need the string '\\n'.
# More information: https://cran.r-project.org/web/packages/stringr/vignettes/regular-expressions.html
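# A quick sanity check of the escaping (throwaway strings, just for illustration):
writeLines('a\\nb')   # prints a\nb  -> the two-character regex "\n"
writeLines('a\nb')    # prints a and b on separate lines -> a literal newline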
ipcc_tokens <- ipcc_df %>%
unnest_tokens(word, text_full)
# See how this differs from `ipcc_df`
# Each word has its own row!
View(ipcc_tokens)
ipcc_wc <- ipcc_tokens %>%
count(word) %>%
arrange(-n)
View(ipcc_wc)
ipcc_stop <- ipcc_tokens %>%
anti_join(stop_words) %>%
select(-ipcc_text)
View(ipcc_stop)
# This code filters out numbers by asking:
# if you convert a token with as.numeric(), is the result NA?
# NA means the token is a word, so is.na() keeps it;
# anything that successfully converts to a number is dropped.
ipcc_no_numeric <- ipcc_stop %>%
filter(is.na(as.numeric(word)))
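# Sanity check of the trick on a toy vector (made-up values, not from the report):
suppressWarnings(as.numeric(c("climate", "1.5", "2100")))        # NA 1.5 2100
suppressWarnings(is.na(as.numeric(c("climate", "1.5", "2100")))) # TRUE FALSE FALSE -> only "climate" survives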
# There are almost 2000 unique words
length(unique(ipcc_no_numeric$word))
ipcc_top100 <- ipcc_no_numeric %>%
count(word) %>%
arrange(-n) %>%
head(100)
ipcc_cloud <- ggplot(data = ipcc_top100, aes(label = word)) +
geom_text_wordcloud() +
theme_minimal()
ipcc_cloud
ggplot(data = ipcc_top100, aes(label = word, size = n)) +
geom_text_wordcloud_area(aes(color = n), shape = "diamond") +
scale_size_area(max_size = 12) +
scale_color_gradientn(colors = c("darkgreen","blue","red")) +
theme_minimal()
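# To keep the styled cloud, ggsave() works as for any ggplot (the "figures"
# folder is an assumption -- create it first if it doesn't exist):
# ggsave(here("figures", "ipcc_wordcloud.png"), width = 6, height = 4)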
library(tidyverse)
library(here)
# For text mining:
library(pdftools)
library(tidytext)
library(textdata)
library(ggwordcloud)
# Note: here("data", "data/FVH_1888-1895A-K.pdf") doubles the folder; the correct call is:
help <- here("data", "FVH_1888-1895A-K.pdf")
help_text <- pdf_text(help)
help3 <- help_text[3]
help3
knitr::opts_chunk$set(echo = TRUE,
warning = FALSE,
message = FALSE)
library(tidyverse)
library(here)
# For text mining:
library(pdftools)
library(tidytext)
library(textdata)
library(ggwordcloud)
# Note - Before lab:
# Attach tidytext and textdata packages
# Run: get_sentiments(lexicon = "nrc")
# Should be prompted to install lexicon - choose yes!
# Run: get_sentiments(lexicon = "afinn")
# Should be prompted to install lexicon - choose yes!
idig <- pdf_text(here("data", "Boyd2021_OpenAreaOpenDataARAP.pdf"))
idig_df <- data.frame(text = idig) %>%
mutate(page = 1:n())
idig_df
head(idig_df)
main_text <- idig_df %>%
filter(page %in% 1:20) %>%
mutate(text = str_split(text, '\n')) %>%
unnest(text)
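# Because `page` survives the unnest, tokens can be counted per page --
# a sketch using the columns created above:
main_text %>%
unnest_tokens(word, text) %>%
count(page, word, sort = TRUE) %>%
head()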
idig_p9 <- idig[9]
idig_p9
idig_df <- data.frame(idig) %>%
mutate(text_full = str_split(idig, pattern = '\n')) %>%
unnest(text_full) %>%
mutate(text_full = str_trim(text_full))
idig_tokens <- idig_df %>%
unnest_tokens(word, text_full)
idig_wc <- idig_tokens %>%
count(word) %>%
arrange(-n)
idig_wc
idig_stop <- idig_tokens %>%
anti_join(stop_words) %>%
select(-idig)
idig_swc <- idig_stop %>%
count(word) %>%
arrange(-n)
idig_swc
# This code filters out numbers by asking:
# if you convert a token with as.numeric(), is the result NA?
# NA means the token is a word, so is.na() keeps it;
# anything that successfully converts to a number is dropped.
idig_no_numeric <- idig_stop %>%
filter(is.na(as.numeric(word)))
# There are almost 2000 unique words
length(unique(idig_no_numeric$word))
# We probably don't want to include them all in a word cloud. Let's filter to keep only the top 100 most frequent words.
idig_top100 <- idig_no_numeric %>%
count(word) %>%
arrange(-n) %>%
head(100)
idig_top100
idig_cloud <- ggplot(data = idig_top100, aes(label = word)) +
geom_text_wordcloud() +
theme_minimal()
idig_cloud
ggplot(data = idig_top100, aes(label = word, size = n)) +
geom_text_wordcloud_area(aes(color = n), shape = "star") +
scale_size_area(max_size = 12) +
scale_color_gradientn(colors = c("darkgreen","blue","red")) +
theme_minimal()
ggplot(data = idig_top100, aes(label = word, size = n)) +
geom_text_wordcloud_area(aes(color = n), shape = "diamond") +
scale_size_area(max_size = 12) +
scale_color_gradientn(colors = c("darkgreen","blue","red")) +
theme_minimal()
get_sentiments(lexicon = "afinn")
# Note: may be prompted to download (yes)
# Let's look at the pretty positive words:
afinn_pos <- get_sentiments("afinn") %>%
filter(value %in% c(3,4,5))
# Do not look at negative words in class.
afinn_pos
get_sentiments(lexicon = "bing")
get_sentiments(lexicon = "nrc")
idig_afinn <- idig_stop %>%
inner_join(get_sentiments("afinn"))
idig_afinn_hist <- idig_afinn %>%
count(value)
# Plot them:
ggplot(data = idig_afinn_hist, aes(x = value, y = n)) +
geom_col()
# Plot them in color. Note: geom_col(color = rainbow) errors, because rainbow
# is a function, not a color -- pass a vector of colors to fill instead:
ggplot(data = idig_afinn_hist, aes(x = value, y = n)) +
geom_col(fill = rainbow(nrow(idig_afinn_hist)))
# What are these '-2' words?
idig_afinn2 <- idig_afinn %>%
filter(value == -2) %>%
head(50)
idig_afinn2
# Check the unique -2-score words:
unique(idig_afinn2$word)
# Count & plot them
idig_afinn2_n <- idig_afinn2 %>%
count(word, sort = TRUE) %>%
mutate(word = fct_reorder(factor(word), n))
ggplot(data = idig_afinn2_n, aes(x = word, y = n)) +
geom_col() +
coord_flip()
# OK most of these are pretty clear. Can you find any ambiguous terms?
idig_summary <- idig_afinn %>%
summarize(
mean_score = mean(value),
median_score = median(value)
)
idig_summary
idig_nrc <- idig_stop %>%
inner_join(get_sentiments("nrc"))
idig_exclude <- idig_stop %>%
anti_join(get_sentiments("nrc"))
# View(idig_exclude)
# Count to find the most excluded:
idig_exclude_n <- idig_exclude %>%
count(word, sort = TRUE)
head(idig_exclude_n)
idig_nrc_n <- idig_nrc %>%
count(sentiment, sort = TRUE)
# And plot them:
ggplot(data = idig_nrc_n, aes(x = sentiment, y = n)) +
geom_col()
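# Raw counts depend on document length; proportions are easier to compare
# across documents (a small sketch using idig_nrc_n from above):
idig_nrc_n %>%
mutate(prop = n / sum(n))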
idig_nrc_n5 <- idig_nrc %>%
count(word,sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(5) %>%
ungroup()
idig_nrc_gg <- ggplot(data = idig_nrc_n5, aes(x = reorder(word,n), y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 2, scales = "free") +
coord_flip() +
theme_minimal() +
labs(x = "Word", y = "count")
# Show it
idig_nrc_gg
# Save it
ggsave(plot = idig_nrc_gg,
here("figures","idig_nrc_sentiment.png"),
height = 8,
width = 5)
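# ggsave() errors if the target folder doesn't exist; creating it first
# (i.e. before the ggsave above) avoids that:
# dir.create(here("figures"), showWarnings = FALSE)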
dig <- get_sentiments(lexicon = "nrc") %>%
filter(word == "excavation")
# Yep, check it out:
dig
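# If a neutral domain word like "excavation" is skewing the counts, one option
# (a sketch, not part of the original lab) is to drop it before the join:
idig_nrc_clean <- idig_stop %>%
filter(word != "excavation") %>%
inner_join(get_sentiments("nrc"))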
get_sentiments(lexicon = "nrc")
knitr::opts_chunk$set(echo = TRUE,
warning = FALSE,
message = FALSE)
library(tidyverse)
library(here)
# For text mining:
library(pdftools)
library(tidytext)
library(textdata)
library(ggwordcloud)
# Note - Before lab:
# Attach tidytext and textdata packages
# Run: get_sentiments(lexicon = "nrc")
# Should be prompted to install lexicon - choose yes!
# Run: get_sentiments(lexicon = "afinn")
# Should be prompted to install lexicon - choose yes!
get_sentiments(lexicon = "nrc")
get_sentiments(lexicon = "afinn")
ipcc_path <- here("data","ipcc_gw_15.pdf")
ipcc_text <- pdf_text(ipcc_path)
ipcc_text
ipcc_p9 <- ipcc_text[9]
ipcc_p9
ipcc_df <- data.frame(ipcc_text) %>%
mutate(text_full = str_split(ipcc_text, pattern = '\\n')) %>%
unnest(text_full) %>%
mutate(text_full = str_trim(text_full))
# Why '\\n' instead of '\n'? In a regular expression, some characters (e.g. \, *) must be escaped with a leading \. But \ is also the escape character in R strings, so to write the two-character regex \n you need the string '\\n'.
# Although, this time round, it also works with '\n' alone: the string '\n' is a literal newline character, and a newline matches itself in a regex. Wonders never cease.
# More information: https://cran.r-project.org/web/packages/stringr/vignettes/regular-expressions.html
ipcc_tokens <- ipcc_df %>%
unnest_tokens(word, text_full)
# See how this differs from `ipcc_df`
# Each word has its own row!
ipcc_tokens
ipcc_df
ipcc_df <- data.frame(ipcc_text) %>%
mutate(text_full = str_split(ipcc_text, pattern = '\n')) %>%
unnest(text_full) %>%
mutate(text_full = str_trim(text_full))
ipcc_tokens <- ipcc_df %>%
unnest_tokens(word, text_full)
ipcc_df
ipcc_wc <- ipcc_tokens %>%
count(word) %>%
arrange(-n)
ipcc_wc
ipcc_tokens
ipcc_stop <- ipcc_tokens %>%
anti_join(stop_words) %>%
select(-ipcc_text)
ipcc_stop
ipcc_swc <- ipcc_stop %>%
count(word) %>%
arrange(-n)
ipcc_swc
# This code filters out numbers by asking:
# if you convert a token with as.numeric(), is the result NA?
# NA means the token is a word, so is.na() keeps it;
# anything that successfully converts to a number is dropped.
ipcc_no_numeric <- ipcc_stop %>%
filter(is.na(as.numeric(word)))
# There are almost 2000 unique words
length(unique(ipcc_no_numeric$word))
# We probably don't want to include them all in a word cloud. Let's filter to keep only the top 100 most frequent words.
ipcc_top100 <- ipcc_no_numeric %>%
count(word) %>%
arrange(-n) %>%
head(100)
ipcc_cloud <- ggplot(data = ipcc_top100, aes(label = word)) +
geom_text_wordcloud() +
theme_minimal()
ipcc_cloud
ggplot(data = ipcc_top100, aes(label = word, size = n)) +
geom_text_wordcloud_area(aes(color = n), shape = "star") +
scale_size_area(max_size = 12) +
scale_color_gradientn(colors = c("darkgreen","blue","red")) +
theme_minimal()
ggplot(data = ipcc_top100, aes(label = word, size = n)) +
geom_text_wordcloud_area(aes(color = n), shape = "diamond") +
scale_size_area(max_size = 12) +
scale_color_gradientn(colors = c("darkgreen","blue","red")) +
theme_minimal()
get_sentiments(lexicon = "afinn")
# Let's look at the pretty positive words:
afinn_pos <- get_sentiments("afinn") %>%
filter(value %in% c(3,4,5))
# Do not look at negative words in class.
afinn_pos
get_sentiments(lexicon = "bing")
get_sentiments(lexicon = "nrc")
ipcc_afinn <- ipcc_stop %>%
inner_join(get_sentiments("afinn"))
ipcc_afinn_hist <- ipcc_afinn %>%
count(value)
# Plot them:
ggplot(data = ipcc_afinn_hist, aes(x = value, y = n)) +
geom_col()
# What are these '2' words?
ipcc_afinn2 <- ipcc_afinn %>%
filter(value == 2)
# Check the unique 2-score words:
unique(ipcc_afinn2$word)
ipcc_afinn2
# Count & plot them
ipcc_afinn2_n <- ipcc_afinn2 %>%
count(word, sort = TRUE) %>%
mutate(word = fct_reorder(factor(word), n))
ggplot(data = ipcc_afinn2_n, aes(x = word, y = n)) +
geom_col() +
coord_flip()
ipcc_summary <- ipcc_afinn %>%
summarize(
mean_score = mean(value),
median_score = median(value)
)
ipcc_summary
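# A single mean can hide a lot; as a quick complement, the share of
# negative-scored words (using ipcc_afinn from above):
ipcc_afinn %>%
summarize(prop_negative = mean(value < 0))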
ipcc_nrc <- ipcc_stop %>%
inner_join(get_sentiments("nrc"))
ipcc_exclude <- ipcc_stop %>%
anti_join(get_sentiments("nrc"))
# View(ipcc_exclude)
# Count to find the most excluded:
ipcc_exclude_n <- ipcc_exclude %>%
count(word, sort = TRUE)
head(ipcc_exclude_n)
ipcc_nrc_n <- ipcc_nrc %>%
count(sentiment, sort = TRUE)
# And plot them:
ggplot(data = ipcc_nrc_n, aes(x = sentiment, y = n)) +
geom_col()
ipcc_nrc_n5 <- ipcc_nrc %>%
count(word,sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(5) %>%
ungroup()
ipcc_nrc_gg <- ggplot(data = ipcc_nrc_n5, aes(x = reorder(word,n), y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, ncol = 2, scales = "free") +
coord_flip() +
theme_minimal() +
labs(x = "Word", y = "count")
# Show it
ipcc_nrc_gg
# Save it
ggsave(plot = ipcc_nrc_gg,
here("figures","ipcc_nrc_sentiment.png"),
height = 8,
width = 5)
conf <- get_sentiments(lexicon = "nrc") %>%
filter(word == "confidence")
# Yep, check it out:
conf
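# How much weight does that one word carry? Count its occurrences in the
# stop-worded tokens (a quick check with objects defined above):
ipcc_stop %>%
filter(word == "confidence") %>%
count(word)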
knitr::opts_chunk$set(echo = TRUE)
if(!require("devtools")) install.packages("devtools")
devtools::install_github("Guscode/Sentida")
install.packages("devtools")
install.packages("devtools")
install.packages("devtools")
install.packages("devtools")
knitr::opts_chunk$set(echo = TRUE)
devtools::install_github("Guscode/Sentida")
devtools::install_github("Guscode/Sentida")
library(Sentida)
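# Minimal Sentida usage, per the Guscode/Sentida README (the Danish example
# sentence is illustrative only):
# sentida("Lad der blive fred", output = "mean")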
knitr::opts_chunk$set(echo = TRUE,
warning = FALSE,
message = FALSE)
library(tidyverse)
library(here)
# For text mining:
library(pdftools)
library(tidytext)
library(textdata)
library(ggwordcloud)
# Note - Before lab:
# Attach tidytext and textdata packages
# Run: get_sentiments(lexicon = "nrc")
# Should be prompted to install lexicon - choose yes!
# Run: get_sentiments(lexicon = "afinn")
# Should be prompted to install lexicon - choose yes!
text <- pdf_text("C:/Users/adela/Documents/RStudio/1_Teaching/AUDigitalArchives/output_data/1940_10minutes.txt")
text <- pdf_text("C:/Users/adela/Documents/Professional/Articles/Newest/2013_Book_AppliedSpatialDataAnalysisWith.pdf")
text <- pdf_text("C:/Users/adela/Documents/Professional/Articles/Newest/2013_Book_AppliedSpatialDataAnalysisWith.pdf")