# Exemplos `rvest`

In [None]:
# Install packages through renv. No package names are passed, so renv itself
# decides what to install (presumably the project's declared dependencies --
# confirm against the renv::install() documentation).
renv::install()

## Exemplos de extração de citações (https://quotes.toscrape.com/)

Utilizar o `rvest` para realizar a requisição e capturar o `html` da página

In [None]:
# Download the site's front page and keep the parsed document in `html`.
html <- rvest::read_html("https://quotes.toscrape.com/")

In [None]:
# Text of every element on the page that carries the "text" CSS class.
quotes <- rvest::html_text(rvest::html_nodes(html, ".text"))

# Display the first two extracted quotes.
quotes[1:2]

Abrindo uma conexão de arquivo para salvar os resultados

In [None]:
# Open the output file that receives the scraped quotes.
# Mode "w" starts from an empty file: the following cell always writes a
# header row, so append mode ("a") would add a second header and duplicated
# rows to the file every time the notebook is re-run.
output_path <- "../../data/rvest/citacoes_10_paginas.txt"

# Make sure the target directory exists; file() fails when it is missing.
dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)

file_conn <- file(output_path, open = "w")

In [None]:
# URLs of the first ten listing pages.
url_pattern <- "https://quotes.toscrape.com/page/%d"
urls <- sprintf(url_pattern, 1:10)

# Header row of the semicolon-separated output file.
header <- c("author", "quote")
writeLines(paste(header, collapse = ";"), file_conn)

for (url in urls) {
  # Download and parse one listing page.
  html <- rvest::read_html(url)

  authors <- html |>
    rvest::html_nodes(".author") |>
    rvest::html_text()

  quotes <- html |>
    rvest::html_nodes(".text") |>
    rvest::html_text()

  # Authors and quotes are paired by position, so a count mismatch would
  # silently recycle the shorter vector and attribute quotes to the wrong
  # author.
  stopifnot(length(authors) == length(quotes))

  # Write one "author;quote" row per quote. Each element is written with the
  # default newline terminator; collapsing the rows with "\n" and writing
  # with sep = "" left the last row of a page unterminated, so it was fused
  # with the first row of the following page.
  # Empty pages are skipped because paste0() on zero-length vectors would
  # still produce a stray ";" row.
  if (length(quotes) > 0) {
    writeLines(paste0(authors, ";", quotes), file_conn)
  }

  # Pause between requests to avoid hammering the server.
  Sys.sleep(1)
}

Fechar arquivo

In [None]:
# Close the output file connection now that all pages have been written.
close(file_conn)

## Exemplos de extração de citações com preenchimento de formulário (https://quotes.toscrape.com/search.aspx)

##### Exemplo único

In [None]:
# Load the search page; it carries the hidden form fields needed for the POST.
html <- rvest::read_html("https://quotes.toscrape.com/search.aspx")

# Named character vector mapping each hidden <input>'s name to its value.
hidden_inputs <- local({
  nodes <- rvest::html_nodes(html, "input[type='hidden']")
  setNames(
    rvest::html_attr(nodes, "value"),
    rvest::html_attr(nodes, "name")
  )
})

# Submit the filter form for one fixed author/tag pair, echoing back the
# page's __VIEWSTATE value as a form field.
response <- httr::POST(
  url = "https://quotes.toscrape.com/filter.aspx",
  httr::add_headers(`Content-Type` = "application/x-www-form-urlencoded"),
  body = list(
    author = "Haruki Murakami",
    tag = "thought",
    `__VIEWSTATE` = hidden_inputs["__VIEWSTATE"]
  ),
  encode = "form"
)

In [None]:
# Parse the POST response and pull the text of each quote's content element.
quotes <- rvest::html_text(
  rvest::html_nodes(rvest::read_html(response), ".quote .content")
)

# Display the quotes returned for the submitted author/tag pair.
quotes

##### Exemplo com vários autores e tags

In [None]:
# Load the search page once; it provides both the author list and the hidden
# form fields.
html <- rvest::read_html("https://quotes.toscrape.com/search.aspx")

# "value" attribute of every option in the author drop-down.
authors <- rvest::html_attr(
  rvest::html_nodes(html, "#author option"),
  "value"
)

# Keep only options that actually have a value attribute.
authors <- authors[!is.na(authors)]

# Named character vector mapping each hidden <input>'s name to its value.
hidden_inputs <- local({
  nodes <- rvest::html_nodes(html, "input[type='hidden']")
  setNames(
    rvest::html_attr(nodes, "value"),
    rvest::html_attr(nodes, "name")
  )
})

In [None]:
# Open the output file that receives the filtered quotes.
# Mode "w" starts from an empty file: the following cell always writes a
# header row, so append mode ("a") would add a second header and duplicated
# rows to the file every time the notebook is re-run.
output_path <- "../../data/rvest/citacoes_filtro.txt"

# Make sure the target directory exists; file() fails when it is missing.
dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)

file_conn <- file(output_path, open = "w")

In [None]:
# Header row of the semicolon-separated output file.
header <- c("author", "quotes", "tag")
writeLines(paste(header, collapse = ";"), file_conn)

# The form token is the same for every request, so extract it once. `[[`
# yields the bare string and fails loudly if the field is missing.
viewstate <- hidden_inputs[["__VIEWSTATE"]]

# Submit the filter form for `author`, optionally narrowed to `tag`, and
# return the httr response. The `tag` field is left out when `tag` is NULL.
post_filter <- function(author, tag = NULL) {
  fields <- list(author = author)
  if (!is.null(tag)) {
    fields$tag <- tag
  }
  fields[["__VIEWSTATE"]] <- viewstate

  httr::POST(
    url = "https://quotes.toscrape.com/filter.aspx",
    httr::add_headers(`Content-Type` = "application/x-www-form-urlencoded"),
    body = fields,
    encode = "form"
  )
}

for (author in authors) {
  # Posting only the author returns a page whose tag drop-down lists the
  # tags available for that author.
  tags <- post_filter(author) |>
    rvest::read_html() |>
    rvest::html_nodes("#tag option") |>
    rvest::html_attr("value")

  # html_attr() yields NA for options without a value attribute. Drop them,
  # as is done for the author list, so no request is sent with an NA tag.
  tags <- tags[!is.na(tags)]

  for (tag in tags) {
    quotes <- post_filter(author, tag) |>
      rvest::read_html() |>
      rvest::html_nodes(".quote .content") |>
      rvest::html_text()

    # One "author;quote;tag" row per quote. The guard matters because
    # paste() on a zero-length `quotes` would still emit an "author;;tag" row.
    if (length(quotes) > 0) {
      writeLines(paste(author, quotes, tag, sep = ";"), file_conn)
    }
  }
}

In [None]:
# Close the output file connection now that every author/tag pair is written.
close(file_conn)

## Exemplos de extração de citações com login (https://quotes.toscrape.com/login)

In [None]:
# Print the text of the home page as fetched without a logged-in session.
cat(rvest::html_text(rvest::read_html("https://quotes.toscrape.com")))

In [None]:
# Start a browsing session on the login page.
session <- rvest::session("https://quotes.toscrape.com/login")

# Named character vector mapping each hidden <input>'s name to its value.
hidden_inputs <- local({
  nodes <- rvest::html_nodes(session, "input[type='hidden']")
  setNames(
    rvest::html_attr(nodes, "value"),
    rvest::html_attr(nodes, "name")
  )
})

# Display the hidden fields found on the login page.
hidden_inputs


In [None]:
# Take the first form on the login page and fill in the credential fields.
form <- rvest::html_form_set(
  rvest::html_form(session)[[1]],
  username = "Username",
  password = "Password"
)

# Submit the form; the returned session replaces the previous one.
session <- rvest::session_submit(session, form)

# Revisit the home page within the session and print its text.
cat(
  rvest::html_text(
    rvest::read_html(
      rvest::session_jump_to(session, "https://quotes.toscrape.com")
    )
  )
)

## Exemplos de extração dos Top 250 filmes do IMDb (https://www.imdb.com/chart/top)

In [None]:
# Base URL of the site (note the trailing slash) and the chart's path.
imdb_url <- "https://www.imdb.com/"
top_250_url <- "chart/top"

# Download and parse the chart page.
# NOTE(review): the request is sent with rvest's defaults; whether the site
# serves the full chart to such a client is not visible here -- confirm.
top_250_html <- rvest::read_html(paste0(imdb_url, top_250_url))

In [None]:
# href of every anchor found inside an element with the "cli-children" class.
movie_urls <- rvest::html_attr(
  rvest::html_nodes(top_250_html, ".cli-children a"),
  "href"
)

# Display the first five extracted links.
movie_urls[1:5]

In [None]:
# Open the output file that receives the scraped movie rows.
# Mode "w" starts from an empty file: the following cell always writes a
# header row, so append mode ("a") would add a second header and duplicated
# rows to the file every time the notebook is re-run.
output_path <- "../../data/rvest/top_250_filmes.txt"

# Make sure the target directory exists; file() fails when it is missing.
dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)

file_conn <- file(output_path, open = "w")

In [None]:
# Header row of the semicolon-separated output file.
header <- c("title", "rating", "score", "director")
writeLines(paste(header, collapse = ";"), file_conn)

# CSS selector used to fill each output column.
# NOTE(review): all four selectors are "h1", exactly as in the original code,
# so every column of a row receives the same heading text. The rating, score
# and director selectors look like unfinished placeholders -- TODO replace
# them with selectors checked against a real movie page.
selectors <- c(title = "h1", rating = "h1", score = "h1", director = "h1")

# Return the text matched by `selector` in `page` as exactly one string.
# Several matches are joined with ", " and no match yields "", so every row
# keeps exactly one value per column.
extract_field <- function(page, selector) {
  matches <- rvest::html_text(rvest::html_nodes(page, selector))
  paste(trimws(matches), collapse = ", ")
}

for (movie_url in movie_urls) {
  # html_attr() yields NA for anchors without an href; nothing to fetch then.
  if (is.na(movie_url)) {
    next
  }

  # Join base and path with a single "/". `imdb_url` ends with "/" and the
  # scraped hrefs presumably start with one (assumes site-relative paths --
  # confirm), which would otherwise produce "//" in the URL.
  movie_html <- rvest::read_html(
    paste0(sub("/+$", "", imdb_url), "/", sub("^/+", "", movie_url))
  )

  # One value per header column, in header order.
  line <- vapply(
    selectors[header],
    \(selector) extract_field(movie_html, selector),
    character(1)
  )
  writeLines(paste(line, collapse = ";"), file_conn)

  # Pause between requests to avoid hammering the server.
  Sys.sleep(1)
}

In [None]:
# Close the output file connection now that every movie row is written.
close(file_conn)