[Reference](https://medium.com/@datajournal/scrape-google-flights-4b0cde1621ee)

# Create and Activate a Virtual Environment
```
# Create a virtual environment
python -m venv flights-scraper-env
# Activate the virtual environment
# On Windows:
.\flights-scraper-env\Scripts\activate
# On macOS/Linux:
source flights-scraper-env/bin/activate
```

# Install Necessary Packages

In [1]:
# Install required packages
!pip install playwright tenacity asyncio
# Install Playwright's browser dependencies
!playwright install chromium

Collecting playwright
  Downloading playwright-1.48.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting asyncio
  Downloading asyncio-3.4.3-py3-none-any.whl.metadata (1.7 kB)
Collecting pyee==12.0.0 (from playwright)
  Downloading pyee-12.0.0-py3-none-any.whl.metadata (2.8 kB)
Downloading playwright-1.48.0-py3-none-manylinux1_x86_64.whl (38.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.2/38.2 MB 16.4 MB/s eta 0:00:00
Downloading pyee-12.0.0-py3-none-any.whl (14 kB)
Downloading asyncio-3.4.3-py3-none-any.whl (101 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 101.8/101.8 kB 6.0 MB/s eta 0:00:00
Installing collected packages: asyncio, pyee, playwright
Successfully installed asyncio-3.4.3 playwright-1.48.0 pyee-12.0.0


Downloading Chromium 130.0.6723.31 (playwright build v1140) from https://playwright.azureedge.net/builds/chromium/1140/chromium-linux.zip
164.5 MiB [] 10% 6.2s (download progress output truncated)

# Define Data Structures

In [1]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class SearchParameters:
    """User-supplied criteria for a single flight search."""
    # Text typed into the "Where from?" input of the search form.
    departure: str
    # Text typed into the "Where to?" input of the search form.
    destination: str
    # Text typed into the "Departure date" input; passed through unmodified.
    departure_date: str
    # Optional return date, None by default.
    # NOTE(review): the form-filling code visible in this file never reads it.
    return_date: Optional[str] = None
    # Label of the ticket-type option to select; matched against list-item text.
    ticket_type: str = "One way"

@dataclass
class FlightData:
    """One scraped flight result; every field is kept as a string."""
    # The field names match the keys of FlightScraper.SELECTORS, because
    # instances are built with FlightData(**flight_info) from that mapping.
    airline: str
    departure_time: str
    arrival_time: str
    duration: str
    stops: str
    price: str
    co2_emissions: str
    emissions_variation: str

# Crafting the Flight Scraper Class

## Define CSS Selectors

In [2]:
class FlightScraper:
    """Scraper for flight results; holds the CSS selectors used for extraction."""
    # Maps each FlightData field name to the CSS selector that locates the
    # matching element inside a single flight result item.
    # NOTE(review): the class-name selectors (e.g. "sSHqwe") look auto-generated
    # and may change when the site is redeployed — verify before relying on them.
    SELECTORS = {
        "airline": "div.sSHqwe.tPgKwe.ogfYpf",
        "departure_time": 'span[aria-label^="Departure time"]',
        "arrival_time": 'span[aria-label^="Arrival time"]',
        "duration": 'div[aria-label^="Total duration"]',
        "stops": "div.hF6lYb span.rGRiKd",
        "price": "div.FpEdX span",
        "co2_emissions": "div.O7CXue",
        "emissions_variation": "div.N6PNV",
    }

## Simulate Filling Out the Search Form

In [3]:
async def _fill_search_form(self, page, params: SearchParameters) -> None:
    ticket_type_div = page.locator("div.VfPpkd-TkwUic[jsname='oYxtQd']").first
    await ticket_type_div.click()
    await page.locator("li").filter(has_text=params.ticket_type).nth(0).click()
    from_input = page.locator("input[aria-label='Where from?']")
    await from_input.fill(params.departure)
    to_input = page.locator("input[aria-label='Where to?']")
    await to_input.fill(params.destination)
    date_input = page.locator("input[aria-label='Departure date']")
    await date_input.fill(params.departure_date)

## Load All Available Flights

In [4]:
async def _load_all_flights(self, page) -> None:
    while True:
        try:
            more_button = await page.wait_for_selector(
            'button[aria-label*="more flights"]', timeout=5000
            )
            if more_button:
                await more_button.click()
                await page.wait_for_timeout(2000)
            else:
                break
        except:
            break

## Extract Flight Data

In [5]:
async def _extract_flight_data(self, page) -> list[FlightData]:
    await page.wait_for_selector("li.pIav2d", timeout=30000)
    flights = await page.query_selector_all("li.pIav2d")
    flights_data = []
    for flight in flights:
        flight_info = {}
    for key, selector in self.SELECTORS.items():
        element = await flight.query_selector(selector)
        flight_info[key] = await self._extract_text(element)
        flights_data.append(FlightData(**flight_info))
    return flights_data

# Implement Retry Logic for Reliability

In [6]:
from tenacity import retry, stop_after_attempt, wait_fixed
@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
async def search_flights(self, params: SearchParameters) -> list[FlightData]:
    """Run one complete flight search in a headless Chromium browser.

    The decorator retries a failing call, stopping after 3 attempts with a
    fixed wait of 5 between them. Every attempt launches a fresh browser.

    Args:
        params: Search criteria entered into the search form.

    Returns:
        The flights extracted from the results page.
    """
    # Function-scope import: the name is needed here and no import of it is
    # visible elsewhere in this file.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto("https://www.google.com/flights")
            await self._fill_search_form(page, params)
            # Expand the collapsed part of the result list before scraping;
            # otherwise only the initially rendered flights are extracted.
            await self._load_all_flights(page)
            return await self._extract_flight_data(page)
        finally:
            # Close the browser even when a step raises, so a retried
            # attempt does not leave the previous browser open.
            await browser.close()

# Save Results to a JSON File

In [7]:
import json
from datetime import datetime

def save_results(self, flights: list[FlightData], params: SearchParameters) -> str:
    """Write the search criteria and scraped flights to a timestamped JSON file.

    The file is created in the current working directory and named after the
    departure, the destination and the current local time.

    Args:
        flights: Scraped flight records; each is serialised via ``vars()``.
        params: Search criteria; serialised via ``vars()`` and used in the name.

    Returns:
        The name of the file that was written.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"flight_results_{params.departure}_{params.destination}_{timestamp}.json"

    # Build the whole payload before the file is opened.
    payload = {}
    payload["search_parameters"] = vars(params)
    payload["flights"] = list(map(vars, flights))

    with open(filename, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        json.dump(payload, handle, indent=2, ensure_ascii=False)
    return filename

# Running the Scraper

In [9]:
import asyncio
async def main():
    """Run a sample one-way LAX to JFK search and save the results as JSON.

    Any exception from the search or the save is caught and reported on
    standard output instead of being raised.
    """
    scraper = FlightScraper()
    params = SearchParameters(
        departure="LAX",
        destination="JFK",
        # ASCII hyphens: the previous literal used en dashes (U+2013), which
        # would have been typed verbatim into the date field.
        departure_date="2024-12-01",
        ticket_type="One way",
    )

    try:
        flights = await scraper.search_flights(params)
        scraper.save_results(flights, params)
        print("Flights scraped successfully.")
    except Exception as e:
        # Top-level boundary: report the failure rather than crash the script.
        print(f"Error during flight search: {str(e)}")

# Start the scraper only when this file is executed directly, not on import.
# NOTE(review): asyncio.run() cannot be called while another event loop is
# already running in the same thread (e.g. inside a notebook cell) — confirm
# how this file is meant to be executed.
if __name__ == "__main__":
    asyncio.run(main())