<a href="https://colab.research.google.com/github/Denis718/Web-Scraping-Python-BS4/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web Scraping com Python e Beautiful Soup

<h2>Conteúdo</h2>

*   Inspecionar a estrutura HTML do site de destino com as ferramentas de desenvolvedor do navegador
*   Decifrar dados codificados em URLs
*   Usar *requests* e *Beautiful Soup* para extrair e analisar dados da Web
*   Percorrer um pipeline de web scraping do início ao fim
*   Criar um script que busque ofertas de emprego na Web e exiba informações relevantes no console

In [None]:
import requests

URL = "https://realpython.github.io/fake-jobs/"

# Fetch the demo job board. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of printing an error page as if it were the data.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Show the decoded HTML of the response.
print(page.text)

In [None]:
# Install Beautiful Soup from inside the notebook (the next cell imports it as bs4).
!pip install beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup

URL = "https://realpython.github.io/fake-jobs/"

# Download the page. Bound the wait and fail loudly on HTTP errors so that an
# error page is never parsed as if it were the job listing.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the raw response bytes with the "html.parser" backend.
soup = BeautifulSoup(page.content, "html.parser")

# Show the undecoded body (a bytes object) that was handed to the parser.
print(page.content)

b'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    <title>Fake Python</title>\n    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css">\n  </head>\n  <body>\n  <section class="section">\n    <div class="container mb-5">\n      <h1 class="title is-1">\n        Fake Python\n      </h1>\n      <p class="subtitle is-3">\n        Fake Jobs for Your Web Scraping Journey\n      </p>\n    </div>\n    <div class="container">\n    <div id="ResultsContainer" class="columns is-multiline">\n    <div class="column is-half">\n<div class="card">\n  <div class="card-content">\n    <div class="media">\n      <div class="media-left">\n        <figure class="image is-48x48">\n          <img src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1" alt="Real Python Logo">\n        </figure>\n      </div>\n      <div class="media-content

In [None]:
# Narrow the search to the element whose id attribute is "ResultsContainer".
results = soup.find(attrs={"id": "ResultsContainer"})

In [None]:
print(results.prettify()) # .prettify() - method used to format the content for display

In [None]:
# Collect every <div> whose class attribute includes "card-content" -
# that is the tag wrapping the content we want.
job_elements = results.find_all("div", attrs={"class": "card-content"})

In [None]:
# Dump the raw markup of each card, leaving one blank line between cards.
for card in job_elements:
  print(card, end="\n\n")

In [None]:
# Split each card into the pieces we care about and show the matched elements.
for card in job_elements:
  heading = card.find("h2", class_="title")
  employer = card.find("h3", class_="company")
  place = card.find("p", class_="location")
  # The trailing empty string plus sep="\n" emits the blank separator line.
  print(heading, employer, place, "", sep="\n")

In [None]:
# Show only the text held by each element.
for card in job_elements:
  # Look up all three elements first, then print them in order.
  found = [
      card.find(tag_name, class_=css_class)
      for tag_name, css_class in (("h2", "title"), ("h3", "company"), ("p", "location"))
  ]
  for element in found:
    print(element.text) # .text - yields only the text of the HTML element
  print()

In [None]:
# Same listing as before, with the surplus whitespace around each text removed.
for card in job_elements:
  heading = card.find("h2", class_="title")
  employer = card.find("h3", class_="company")
  place = card.find("p", class_="location")
  for element in (heading, employer, place):
    print(element.text.strip()) # .strip() - drops the leading/trailing whitespace
  print()

In [None]:
# Filter for the job of interest - Python developer
python_jobs = results.find_all("h2", string="Python")

In [None]:
print(python_jobs) # nothing is found by string, because the search looks for the exact text

In [None]:
# Use a function filter so the match is case-insensitive and "python" may
# appear anywhere in the heading text.
# The `text is not None` guard is defensive: Tag.string is None for a heading
# holding more than one child, and calling .lower() on None would raise
# AttributeError instead of simply skipping that heading.
python_jobs = results.find_all(
    "h2", string=lambda text: text is not None and "python" in text.lower()
)

In [None]:
print(len(python_jobs)) # total number of jobs found

10


In [None]:
print(python_jobs[0].text) # selecting just one element of the list

Senior Python Developer


In [None]:
# Headings that mention "python" in any letter case. The `text is not None`
# guard is defensive: Tag.string is None for a heading holding more than one
# child, and calling .lower() on None would raise AttributeError.
python_jobs = results.find_all(
    "h2", string=lambda text: text is not None and "python" in text.lower()
)

# Climb three levels from each matching <h2> to reach the ancestor element that
# the following cells search for the rest of the posting (company, location, links).
python_job_elements = [
    h2_element.parent.parent.parent for h2_element in python_jobs
]

In [None]:
# First attempt at capturing each posting's links - this prints the link text,
# not the URL.
for posting in python_job_elements:
  for anchor in posting.find_all("a"):
    print(anchor.text.strip())

In [None]:
# Capture only the application link: the href of the second <a> in each posting.
for posting in python_job_elements:
  link_url = posting.find_all("a")[1]["href"]
  print(f"Apply here: {link_url}\n")

In [None]:
# Write the collected results to a CSV file.
import csv

header = ['Job-Title', 'Company', 'Location', 'Link']

# One row per posting: stripped title, company and location text, followed by
# the href of the posting's second link.
dados = [
    [
        posting.find("h2", class_="title").text.strip(),
        posting.find("h3", class_="company").text.strip(),
        posting.find("p", class_="location").text.strip(),
        posting.find_all("a")[1]["href"],
    ]
    for posting in python_job_elements
]

# newline="" leaves line-ending handling to the csv writer.
with open("Vagas_Python.csv", "w", newline="") as vagas_py_csv:
  csv_out = csv.writer(vagas_py_csv, delimiter=";")
  csv_out.writerow(header)
  csv_out.writerows(dados)

In [None]:
# Read back the generated .csv file.
# The relative path is the same one the file was written to, so this cell no
# longer depends on the notebook's working directory being Colab's /content.
# newline="" is what the csv module expects of files handed to csv.reader.
with open("Vagas_Python.csv", "r", newline="") as jobs_py:
  job_reader = csv.reader(jobs_py, delimiter=";")
  for row in job_reader:
    print(row)

['Job-Title', 'Company', 'Location', 'Link']
['Senior Python Developer', 'Payne, Roberts and Davis', 'Stewartbury, AA', 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html']
['Software Engineer (Python)', 'Garcia PLC', 'Ericberg, AE', 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html']
['Python Programmer (Entry-Level)', 'Moss, Duncan and Allen', 'Port Sara, AE', 'https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-20.html']
['Python Programmer (Entry-Level)', 'Cooper and Sons', 'West Victor, AE', 'https://realpython.github.io/fake-jobs/jobs/python-programmer-entry-level-30.html']
['Software Developer (Python)', 'Adams-Brewer', 'Brockburgh, AE', 'https://realpython.github.io/fake-jobs/jobs/software-developer-python-40.html']
['Python Developer', 'Rivera and Sons', 'East Michaelfort, AA', 'https://realpython.github.io/fake-jobs/jobs/python-developer-50.html']
['Back-End Web Developer (Python, Django)', 'Stewart-Al