In [None]:
# Initialise renv for this project and install the packages the notebook uses.
renv::init()
# httr is called directly further down (httr::POST), so install it explicitly
# instead of relying on it arriving as a dependency of another package.
renv::install(c("rvest", "httr"))

In [None]:
# Download and parse the front page of quotes.toscrape.com.
html <- rvest::read_html(x = "https://quotes.toscrape.com/")

In [None]:
# Text of every element matching the ".text" selector on the parsed page.
quotes <- rvest::html_text(
  rvest::html_elements(html, ".text")
)

# Show the result as the cell output.
quotes

In [None]:
# Open the quotes output file in append mode. The connection is left open on
# purpose: later cells write to it and then close() it.
# file() does not create missing parent directories, so make sure ./data
# exists before opening.
dir.create("./data", showWarnings = FALSE, recursive = TRUE)
file_conn <- file("./data/citacoes.txt", open = "a")

In [None]:
# Scrape pages 1-10 of quotes.toscrape.com and append one "quote;author" row
# per quote to the already-open `file_conn` connection.
url_pattern <- "https://quotes.toscrape.com/page/%d"
urls <- sprintf(url_pattern, 1:10)

header <- c("citacao", "autor")
writeLines(paste(header, collapse = ";"), file_conn)

for (url in urls) {
  html <- rvest::read_html(url)

  quotes <- html |>
    rvest::html_nodes(".text") |>
    rvest::html_text()

  authors <- html |>
    rvest::html_nodes(".author") |>
    rvest::html_text()

  # Write each row on its own line, newline-terminated. Collapsing with "\n"
  # and writing with sep = "" left no trailing newline, so the last row of one
  # page ran straight into the first row of the next.
  # NOTE(review): quote text that itself contains ";" will still break the
  # column layout; consider quoting fields if that matters downstream.
  if (length(quotes) > 0) {
    writeLines(paste0(quotes, ";", authors), file_conn)
  }

  # Pause between requests.
  Sys.sleep(1)
}

In [None]:
# Close the output connection `file_conn` that the earlier cells opened and
# wrote to.
close(file_conn)

In [None]:
# Load the search form, collect its hidden fields, and submit one filter
# request for a fixed author/tag pair.
html <- rvest::read_html("https://quotes.toscrape.com/search.aspx")

# Named character vector of the hidden form inputs: names come from each
# input's `name` attribute, values from its `value` attribute.
hidden_inputs <- html |>
  rvest::html_nodes("input[type='hidden']") |>
  (\(x) setNames(
    rvest::html_attr(x, "value"),
    rvest::html_attr(x, "name")
  ))()

response <- httr::POST(
  url = "https://quotes.toscrape.com/filter.aspx",
  httr::add_headers(
    `Content-Type` = "application/x-www-form-urlencoded"
  ),
  body = list(
    author = "Haruki Murakami",
    tag = "thought",
    # `[[` extracts the bare string and errors if the field is missing,
    # instead of silently sending a named NA.
    `__VIEWSTATE` = hidden_inputs[["__VIEWSTATE"]]
  ),
  encode = "form"
)

# Fail here on an HTTP error rather than parsing an error page later.
httr::stop_for_status(response)

In [None]:
# Parse the body of `response` as HTML and keep the text of every element
# matching ".quote .content".
quotes <- rvest::html_text(
  rvest::html_elements(rvest::read_html(response), ".quote .content")
)

# Show the result as the cell output.
quotes

In [None]:
# For every author offered by the search form, look up that author's tags and
# print the quotes returned for each author/tag combination.
html <- rvest::read_html("https://quotes.toscrape.com/search.aspx")

authors <- html |>
  rvest::html_nodes("#author option") |>
  rvest::html_attr("value")

# Options without a `value` attribute come back as NA; drop them.
authors <- authors[!is.na(authors)]

# Named character vector of the hidden form inputs (name -> value).
hidden_inputs <- html |>
  rvest::html_nodes("input[type='hidden']") |>
  (\(x) setNames(
    rvest::html_attr(x, "value"),
    rvest::html_attr(x, "name")
  ))()

for (author in authors) {
  # First request: author only, to obtain the tag options for that author.
  response <- httr::POST(
    url = "https://quotes.toscrape.com/filter.aspx",
    httr::add_headers(
      `Content-Type` = "application/x-www-form-urlencoded"
    ),
    body = list(
      author = author,
      `__VIEWSTATE` = hidden_inputs[["__VIEWSTATE"]]
    ),
    encode = "form"
  )

  tags <- response |>
    rvest::read_html() |>
    rvest::html_nodes("#tag option") |>
    rvest::html_attr("value")

  # Same NA filtering as for authors, so no request is sent with tag = NA.
  tags <- tags[!is.na(tags)]

  for (tag in tags) {
    # Second request: author + tag, returning the matching quotes.
    response <- httr::POST(
      url = "https://quotes.toscrape.com/filter.aspx",
      httr::add_headers(
        `Content-Type` = "application/x-www-form-urlencoded"
      ),
      body = list(
        author = author,
        tag = tag,
        `__VIEWSTATE` = hidden_inputs[["__VIEWSTATE"]]
      ),
      encode = "form"
    )

    quotes <- response |>
      rvest::read_html() |>
      rvest::html_nodes(".quote .content") |>
      rvest::html_text()

    # sprintf() is vectorised over `quotes`: one line per quote, and nothing
    # at all when no quote matched. sep = "" stops cat() from inserting a
    # space in front of every line after the first.
    cat(
      sprintf("Author: %s; Quotes %s; Tag: %s;\n", author, quotes, tag),
      sep = ""
    )
  }
}

In [None]:
# Start an rvest session on the login page.
session <- rvest::session("https://quotes.toscrape.com/login")

# Named character vector of the page's hidden inputs: names come from the
# `name` attribute, values from the `value` attribute.
hidden_inputs <- local({
  hidden_nodes <- rvest::html_elements(session, "input[type='hidden']")
  setNames(
    rvest::html_attr(hidden_nodes, "value"),
    rvest::html_attr(hidden_nodes, "name")
  )
})

# Show the result as the cell output.
hidden_inputs


In [None]:
# Take the first form on the login page and fill in the credentials.
form <- rvest::html_form_set(
  rvest::html_form(session)[[1]],
  username = "Username",
  password = "Password"
)

# Submit the form, then navigate to the site root within the resulting session
# and print the page text.
session_2 <- rvest::session_submit(session, form)

session_2 |>
  rvest::session_jump_to("https://quotes.toscrape.com") |>
  rvest::read_html() |>
  rvest::html_text() |>
  cat()

In [None]:
# Fill in the first form found on the session's current page with the
# credentials.
login <- rvest::html_form_set(
  rvest::html_form(session)[[1]],
  username = "Username",
  password = "Password"
)

# Submit it; `logged_in` is the session returned by the submission.
logged_in <- rvest::session_submit(session, login)

In [1]:
# Base URL of the site and the relative path of the Top 250 chart.
imdb_url <- "https://www.imdb.com/"
top_250_url <- "chart/top/"

# Download and parse the chart page.
top_250_html <- paste0(imdb_url, top_250_url) |>
  rvest::read_html()

In [2]:
# `href` of every link matching ".cli-children a" on the chart page.
movie_urls <- rvest::html_attr(
  rvest::html_elements(top_250_html, ".cli-children a"),
  "href"
)

# Show the result as the cell output.
movie_urls

In [3]:
# Open the output file in append mode for the movie rows written below.
# file() does not create missing parent directories, so make sure ./data
# exists before opening.
# NOTE(review): this is the same path the quotes cells append to, so movie
# rows end up in the quotes file; confirm whether a separate file was meant.
dir.create("./data", showWarnings = FALSE, recursive = TRUE)
file_conn <- file("./data/citacoes.txt", open = "a")

In [4]:
# Visit every movie link collected from the chart page and append one
# semicolon-separated row per movie to the open `file_conn` connection.
header <- c("title", "rating", "score", "director")
writeLines(paste(header, collapse = ";"), file_conn)

for (movie_url in movie_urls) {
  # Links without an href come back as NA; there is nothing to fetch for them.
  if (is.na(movie_url)) {
    next
  }

  # `imdb_url` already ends in "/", so strip any leading slash from the href
  # to avoid building "https://www.imdb.com//...".
  movie_html <- rvest::read_html(paste0(imdb_url, sub("^/+", "", movie_url)))

  title <- movie_html |>
    rvest::html_nodes("h1") |>
    rvest::html_text()

  # NOTE(review): rating, score and director below all reuse the "h1"
  # selector, so every column currently holds the same text as `title`.
  # These look like placeholders; replace them with the real selectors.
  rating <- movie_html |>
    rvest::html_nodes("h1") |>
    rvest::html_text()

  score <- movie_html |>
    rvest::html_nodes("h1") |>
    rvest::html_text()

  director <- movie_html |>
    rvest::html_nodes("h1") |>
    rvest::html_text()

  line <- c(title, rating, score, director)
  writeLines(paste(line, collapse = ";"), file_conn)
  # Sys.sleep(1)
}

In [None]:
# Close the output connection `file_conn` once all movie rows are written.
close(file_conn)