In [6]:
from time import sleep  # Allows the program to pause for a specified amount of time
import pandas as pd  # Provides data manipulation and analysis tools
from selenium import webdriver  # Allows automated web browsing
from bs4 import BeautifulSoup  # Parses HTML and XML documents
from selenium.webdriver.common.by import By  # Provides a way to locate elements on a webpage
from selenium.webdriver.support.ui import WebDriverWait  # Allows the program to wait for an element to load before continuing
from selenium.webdriver.support import expected_conditions as EC  # Allows the program to specify expected conditions for an element to load
from datetime import date  # Provides tools for working with dates
from unidecode import unidecode  # Allows the program to remove accents and other diacritical marks from characters
import random  # Provides tools for generating random numbers and sequences
import datetime  # Provides tools for working with dates and times
import requests


In [7]:
def scrape_decolar_data(from_location, to_location, departure_date, arrival_date, adult_qty):
    # Initialize search day, search ID, and company name
    search_day = date.today()
    search_id = str(search_day) + from_location + to_location + departure_date + arrival_date
    company = 'Decolar'

    # Set up headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Set up the search URL
    search_url = f'https://www.decolar.com/shop/flights/results/roundtrip/{from_location}/{to_location}/{departure_date}/{arrival_date}/1/0/0/NA/NA/NA/NA/NA?from=SB&di={adult_qty}-0&reSearch=true'

    # Fetch the HTML content of the page
    response = requests.get(search_url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract flight containers
        flight_containers = soup.find_all('div', class_='cluster-container COMMON')

        flight_list = []

        for elementSoup in flight_containers:
            # Dictionary to store the flight card data
            flight_data = {}

            # SearchID, SearchDay, and searchUrl
            flight_data['searchID'] = search_id
            flight_data['searchDay'] = search_day
            flight_data['searchUrl'] = search_url
            flight_data['departureDate'] = departure_date
            flight_data['arrivalDate'] = arrival_date
            flight_data['adultQty'] = adult_qty
            flight_data['company'] = company

            # Origin Airport and Destiny Airport
            flight_data['originAirport'] = elementSoup.find('span', class_='route-location route-departure-location').text.strip().split(' ', 1)[0]
            flight_data['destinyAirport'] = elementSoup.find('span', class_='route-location route-arrival-location').text.strip().split(' ', 1)[0]

            # Value Tarif and Taxes
            flight_data['tarif'] = 0
            flight_data['taxes'] = 0

            # Currency and Price
            flight_data['currency'] = elementSoup.find('span', class_='currency price-mask -eva-3-mr-xsm').text
            flight_data['value'] = elementSoup.find('span', class_='amount price-amount').text.replace('.', '').strip()

            # CIA and CIA_abv
            airline_img_container = elementSoup.find('span', class_='container-img-airlines')
            list_cia_flight_container = []  # List to store the companies name, because some of them have more than 1
            list_cia_abv_flight_container = []  # List to store the abbreviated companies name, because some of them have more than 1
            for img in airline_img_container.find_all('img'):
                list_cia_flight_container.append(unidecode(img['alt'].strip()))
                list_cia_abv_flight_container.append(unidecode(img['alt'].strip()[:4]))
            flight_data['cia'] = list_cia_flight_container
            flight_data['cia_abv'] = list_cia_abv_flight_container

            flight_list.append(flight_data)

        dataTypeDict = {
            "searchID": 'object', 'searchDay': 'datetime64[ns]', "originAirport": 'object',
            "destinyAirport": 'object',
            "searchUrl": 'object', "departureDate": 'datetime64[ns]', "arrivalDate": 'datetime64[ns]',
            "adultQty": 'int64',
            "company": 'object', "cia": 'object', 'cia_abv': 'object', "currency": 'object', "tarif": 'float32',
            "taxes": 'float32',
            "value": 'float32'}

        # Convert the list of dictionaries to a DataFrame
        df = pd.DataFrame(flight_list)

        return df

    else:
        print(f"Failed to fetch data from {search_url}. Status code: {response.status_code}")
        return None

# Example usage
from_location = 'GRU'
to_location = 'CDG'
departure_date = '2022-08-01'
arrival_date = '2022-08-15'
adult_qty = 1

result_df = scrape_decolar_data(from_location, to_location, departure_date, arrival_date, adult_qty)
print(result_df)


Failed to fetch data from https://www.decolar.com/shop/flights/results/roundtrip/GRU/CDG/2022-08-01/2022-08-15/1/0/0/NA/NA/NA/NA/NA?from=SB&di=1-0&reSearch=true. Status code: 403
None


In [8]:
from_location = 'GRU'
to_location = 'JFK'
departure_date = '2023-08-01'
arrival_date = '2023-08-10'
adult_qty = '1'

decolar_df = scrape_decolar_data(from_location, to_location, departure_date, arrival_date, adult_qty)
decolar_df.head()

Failed to fetch data from https://www.decolar.com/shop/flights/results/roundtrip/GRU/JFK/2023-08-01/2023-08-10/1/0/0/NA/NA/NA/NA/NA?from=SB&di=1-0&reSearch=true. Status code: 403


AttributeError: 'NoneType' object has no attribute 'head'