# Web Scraping - 1

In [1]:
# Importação das Bibliotecas
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import requests
from requests_html import HTMLSession
import time

# Dataset display settings
import warnings

# Silence every warning for the rest of the session
warnings.filterwarnings(action='ignore')

# pandas display options: no limit on columns, rows or total width,
# and cell text cut off after 90 characters
opcoes_exibicao = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': None,
    'display.max_colwidth': 90,
}
for nome_opcao, valor_opcao in opcoes_exibicao.items():
    pd.set_option(nome_opcao, valor_opcao)

In [2]:
# Request the HTML content of the page
session = HTMLSession()
# The timeout (in seconds) keeps the cell from hanging indefinitely if the server never answers
response = session.get('https://www.scrapethissite.com/pages/simple/', timeout=30)
# Raise an HTTPError on a 4xx/5xx answer instead of parsing an error page further down
response.raise_for_status()

In [3]:
# HTTP status code of the response (200 means the request succeeded)
response.status_code

200

In [4]:
# Parse the raw response bytes into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [5]:
# Show the parsed page as indented HTML text
html_formatado = soup.prettify()
print(html_formatado)

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="A single page that lists information about all the countries in the world. Good for those just get started with web scraping." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
  <meta content="noindex

In [6]:
# The page's <title> tag (the tag itself, markup included)
soup.title

<title>Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping</title>

In [7]:
# Only the text inside the page's <title> tag
soup.title.string

'Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping'

In [8]:
# The page's <head> tag with everything nested inside it
soup.head

<head>
<meta charset="utf-8"/>
<title>Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="A single page that lists information about all the countries in the world. Good for those just get started with web scraping." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robots"/>
<link href="https://lipis.github.io/flag-

In [9]:
# First <a> (link) tag found in the document
soup.a

<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>

In [10]:
# Every <a> (link) tag in the page, returned as a list
soup.find_all('a')

[<a class="nav-link hidden-sm hidden-xs" href="/">
 <img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                 Scrape This Site
                             </a>,
 <a class="nav-link" href="/pages/">
 <i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                 Sandbox
                             </a>,
 <a class="nav-link" href="/lessons/">
 <i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                 Lessons
                             </a>,
 <a class="nav-link" href="/faq/">
 <i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                 FAQ
                             </a>,
 <a class="nav-link" href="/login/">
                                 Login
                             </a>,
 <a href="/lessons/">4 video lessons</a>,
 <a class="data-attribution" href="https://peric.github.io/GetCountries/" target="_blank">http://peric.github.io/GetCountr

In [11]:
# Extract every URL found in the page's <a> tags, one per line
ancoras = soup.find_all('a')
for ancora in ancoras:
    # .get returns None (printed as "None") when the tag has no href attribute
    destino = ancora.get('href')
    print(destino)

/
/pages/
/lessons/
/faq/
/login/
/lessons/
https://peric.github.io/GetCountries/


In [12]:
# Web Scraping
# Extract name, capital, population and area from every country block on the page.
# The four lists stay index-aligned: position i in each list describes the same country.
# Values are kept as text, exactly as they appear in the page.

def _texto_limpo(bloco, tag, classe):
    """Return the text of the first `tag` with CSS class `classe` inside `bloco`.

    Newlines are removed and surrounding whitespace is stripped, so every
    field is normalised the same way.

    Raises:
        AttributeError: if `bloco` contains no matching tag (find returns None).
    """
    return bloco.find(tag, class_=classe).text.replace('\n', '').strip()


paises = []
capitais = []
populacao_quant = []
area_quant = []

# A CSS selector matches <div> tags carrying both classes regardless of their order or of any
# extra class; find_all(class_='col-md-4 country') only matches that exact attribute string.
elementos = soup.select('div.col-md-4.country')

# Fail loudly instead of silently building an empty DataFrame downstream
assert elementos, 'No country blocks found - the page layout may have changed'

for elemento in elementos:

    pais = _texto_limpo(elemento, 'h3', 'country-name')
    capital = _texto_limpo(elemento, 'span', 'country-capital')
    populacao = _texto_limpo(elemento, 'span', 'country-population')
    area = _texto_limpo(elemento, 'span', 'country-area')

    paises.append(pais)
    capitais.append(capital)
    populacao_quant.append(populacao)
    area_quant.append(area)

In [13]:
# Assemble the scraped lists into a DataFrame, one column per field
colunas = {
    'País': paises,
    'Capital': capitais,
    'População': populacao_quant,
    'Área': area_quant,
}
df = pd.DataFrame(data=colunas)

In [14]:
# Display the scraped DataFrame
df

Unnamed: 0,País,Capital,População,Área
0,Andorra,Andorra la Vella,84000,468.0
1,United Arab Emirates,Abu Dhabi,4975593,82880.0
2,Afghanistan,Kabul,29121286,647500.0
3,Antigua and Barbuda,St. John's,86754,443.0
4,Anguilla,The Valley,13254,102.0
5,Albania,Tirana,2986952,28748.0
6,Armenia,Yerevan,2968000,29800.0
7,Angola,Luanda,13068161,1246700.0
8,Antarctica,,0,14000000.0
9,Argentina,Buenos Aires,41343201,2766890.0


In [15]:
# Save the DataFrame in CSV format (file name kept as-is, without an extension).
# index=False leaves the positional row index out of the file; otherwise it is written as an
# unnamed first column that comes back as "Unnamed: 0" when the file is read again.
df.to_csv('webscraping_1', index=False)