In [1]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
from datetime import datetime, timedelta
import csv
import re


In [2]:
def sum_time_strings(time1, time2):
    """Add two "hh:mm" strings and return the sum as an "hh:mm" string.

    Both arguments are split on ":" and each part is parsed with ``int``.
    Minutes overflowing past 59 are carried into the hours; hours are not
    wrapped at 24, so the result may exceed two digits.
    """
    combined_minutes = 0
    for stamp in (time1, time2):
        hours_part, minutes_part = map(int, stamp.split(":"))
        combined_minutes += hours_part * 60 + minutes_part

    # divmod performs the minute -> hour carry in a single step
    whole_hours, leftover_minutes = divmod(combined_minutes, 60)
    return f"{whole_hours:02}:{leftover_minutes:02}"

In [3]:
def extract_layover_time(title_text):
    """Return the layover length, in minutes, found in a layover title string.

    Recognised shapes are "<H>h", "<H>h <M>m" and - for layovers shorter than
    an hour - "<M>m" / "<M> min".

    Args:
        title_text: Text to search (e.g. "1h 25m layover"). Anything that is
            not a string (such as ``None``) is treated as "no layover".

    Returns:
        Total minutes as an ``int``; 0 when no duration can be found.
    """
    if not isinstance(title_text, str):
        return 0  # nothing to parse -> same default as "no match"

    # Hours, optionally followed by minutes: "2h", "1h 25m", "1h25m"
    match = re.search(r"(\d+)h\s*(\d+)?m?", title_text)
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2)) if match.group(2) else 0
        return hours * 60 + minutes

    # Minutes only: "45m", "45 min". The trailing \b keeps words that merely
    # start with "m" (e.g. "5 main") from being read as a duration.
    minutes_only = re.search(r"\b(\d+)\s*m(?:in)?\b", title_text)
    if minutes_only:
        return int(minutes_only.group(1))

    return 0  # Default to 0 minutes if no duration is present

In [4]:
def text_to_time_converter(time_str):
    """Convert a duration such as "2h 35m" into a total number of minutes.

    Accepted shapes are "<H>h", "<H>h <M>m" and the minutes-only form
    "<M>m" / "<M> min" used for durations shorter than one hour.

    Args:
        time_str: Duration text to parse.

    Returns:
        Total minutes as an ``int``, or the string "Invalid time format" when
        no duration can be found in ``time_str``.
    """
    # Hours, optionally followed by minutes
    match = re.search(r'(\d+)h\s*(\d+)?m?', time_str)
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2)) if match.group(2) else 0  # Default minutes to 0 if missing
        return hours * 60 + minutes

    # Minutes only (e.g. "55m"): a valid duration that has no hours part
    minutes_only = re.search(r'\b(\d+)\s*m(?:in)?\b', time_str)
    if minutes_only:
        return int(minutes_only.group(1))

    return "Invalid time format"  # Handle cases where time is missing

In [5]:
def convert_if_number(value):
    """Return ``value`` as an ``int`` when it represents a whole number.

    The value is stringified and surrounding whitespace is ignored, so padded
    text such as " 2 " is converted as well. A single leading "-" is allowed.

    Args:
        value: Any object; typically text scraped from the page or an int.

    Returns:
        The parsed ``int``, or the original ``value`` unchanged when it is not
        a whole number (e.g. "abc", "--5", "1.5", "").
    """
    str_value = str(value).strip()  # Ensure the input is treated as a string
    # Drop at most ONE leading minus sign; "--5" must not count as a number.
    unsigned = str_value[1:] if str_value.startswith('-') else str_value
    if unsigned.isdigit():
        try:
            return int(str_value)
        except ValueError:
            # isdigit() accepts characters int() rejects (e.g. superscript "²")
            return value
    return value  # Return the original value if it's not a number

In [6]:
def convert_to_24hr_format(time_str):
    """Normalise a clock time to zero-padded 24-hour "HH:MM".

    Any "+<digits>" marker in the text (e.g. the "+1" in "11:45 pm+1") is
    removed before parsing. Accepted inputs, case-insensitively:

    * 12-hour times with an am/pm suffix, with or without whitespace before
      the suffix ("9:05 am", "9:05pm").
    * Times without a suffix, which are read as already being 24-hour
      ("14:30" -> "14:30", "12:30" -> "12:30").

    Args:
        time_str: The time text to convert.

    Returns:
        The time formatted as "HH:MM".

    Raises:
        ValueError: If the text is not a recognisable clock time or its hour
            or minute is out of range.
    """
    # Remove any "+N" marker and convert to lowercase
    cleaned = re.sub(r'\+\d+', '', time_str).strip().lower()

    # \s also matches the non-breaking spaces some pages put before am/pm
    match = re.fullmatch(r'(\d{1,2}):(\d{1,2})\s*(am|pm)?', cleaned)
    if not match:
        raise ValueError(f"Unrecognised time format: {time_str!r}")

    hour = int(match.group(1))
    minute = int(match.group(2))
    meridiem = match.group(3)

    if minute > 59:
        raise ValueError(f"Minute out of range in {time_str!r}")

    if meridiem:
        if not 1 <= hour <= 12:
            raise ValueError(f"Hour out of range for a 12-hour time in {time_str!r}")
        hour %= 12  # 12 am -> 0, 12 pm -> 0 (+12 below)
        if meridiem == 'pm':
            hour += 12
    elif hour > 23:
        raise ValueError(f"Hour out of range in {time_str!r}")

    return f"{hour:02d}:{minute:02d}"  # 24-hour format (HH:MM)

In [7]:
async def handle_captcha(page):
    """Clear cookies and pause for a manual solve when a captcha is showing.

    Looks on ``page`` for a ``div`` whose class contains "WZTU-wrap". When such
    an element exists and is visible, the context's cookies are cleared, the
    page is reloaded and execution blocks on ``input()`` until the user
    confirms the captcha has been completed.

    Returns:
        True when a captcha was detected and handled; False when none was
        visible or when any step raised (the error is printed, not re-raised).
    """
    captcha_xpath = "//div[contains(@class, 'WZTU-wrap')]"
    try:
        wrapper = await page.query_selector(captcha_xpath)
        if not wrapper or not await wrapper.is_visible():
            return False  # No captcha detected

        print("Captcha detected")

        # Reset the session, then reload so the captcha is served afresh
        await page.context.clear_cookies()
        await page.reload()

        print("Please solve the CAPTCHA")
        input("Press Enter AFTER completing captcha and clicking continue...")
        return True
    except Exception as err:
        print(f"Error: {err}")
        return False

In [None]:
# Patch asyncio so the event loop can be re-entered; presumably needed because
# the notebook kernel already runs its own loop.
nest_asyncio.apply()

# Fixed USD -> NIS conversion factor applied to the scraped dollar price.
_USD_TO_NIS_RATE = 3.7
# Maximum number of result cards scraped from a single search page.
_CARDS_PER_SEARCH = 100
# Consecutive "show more" clicks that may add no new cards before the result
# list is considered exhausted.
_MAX_STALLED_CLICKS = 3
# City codes used in the search URL -> city names written to the CSV.
_CITY_NAMES = {"LON": "London", "PAR": "Paris", "ROM": "Rome"}


async def _text_or_unknown(locator):
    """Return the locator's text content, or "Unknown" when it matches nothing."""
    if await locator.count() > 0:
        return await locator.text_content()
    return "Unknown"


async def _clock_time(time_spans, index):
    """Return the 24-hour time held by ``time_spans[index]``, or "Unknown" if that span is absent."""
    if await time_spans.count() > index:
        return convert_to_24hr_format(await time_spans.nth(index).text_content())
    return "Unknown"


async def _bag_count(locator):
    """Return the whole number shown by ``locator``, or 0 when it is absent or not a number."""
    raw = await locator.text_content() if await locator.count() > 0 else 0
    parsed = convert_if_number(raw)
    return parsed if isinstance(parsed, int) else 0


async def _leg_connections(stops_label, leg_area):
    """Return ``(stop_count, total_layover_minutes, connection_airports)`` for one leg.

    ``stops_label`` is the locator of the leg's stops text and ``leg_area`` the
    block holding its layover details. Labels of the form "<N> stop(s)" give N
    stops; every other label ("nonstop", "Unknown", and the "1 change" text of
    the advertisement special case) counts as 0 stops, for which the airport
    value is None. Airports of several stops are concatenated without a
    separator.
    """
    label = await _text_or_unknown(stops_label)
    match = re.fullmatch(r"(\d+) stops?", (label or "").strip())
    stop_count = int(match.group(1)) if match else 0
    if stop_count == 0:
        return 0, 0, None

    layover_spans = leg_area.locator('span[title]')
    airport_spans = leg_area.locator(':scope > span')
    total_layover = 0
    airports = ""
    for k in range(stop_count):
        layover = layover_spans.nth(k)
        title = await layover.get_attribute("title") if await layover.count() > 0 else "Unknown"
        total_layover += extract_layover_time(title)
        airports += await _text_or_unknown(airport_spans.nth(k))
    return stop_count, total_layover, airports


async def _load_flight_cards(page, wanted):
    """Click "show more" until ``wanted`` cards are listed or no more can be loaded.

    Returns ``(cards_locator, usable_count)`` where ``usable_count`` is at most
    ``wanted`` and never more than the number of cards actually on the page.
    """
    cards = page.locator("div.nrc6-wrapper")
    show_more = page.locator('div.ULvh-button.show-more-button')
    count = await cards.count()
    print(f"Total flight cards found: {count}")

    stalled_clicks = 0
    while count < wanted and stalled_clicks < _MAX_STALLED_CLICKS:
        try:
            # click() waits for the button itself, which also covers the
            # period before the first results have rendered
            await show_more.click()
        except Exception as e:
            print(f"'Show more' button unavailable, continuing with {count} cards: {e}")
            break
        await page.wait_for_timeout(2000)
        previous = count
        count = await cards.count()
        print(f"Total flight cards found: {count}")
        stalled_clicks = stalled_clicks + 1 if count == previous else 0

    await page.wait_for_timeout(5000)
    return cards, min(await cards.count(), wanted)


async def _scrape_flight_card(card, card_number, origin, destination, ttt, los,
                              departure_date, return_date):
    """Extract one result card into a CSV row dict.

    Every value is derived from this card alone, so nothing can leak in from a
    previously scraped card. Raises when a value cannot be parsed (for example
    a missing price); the caller is expected to skip such cards.
    """
    # Departure and arrival times. Index 0 is read as the departure and index 2
    # as the arrival of each leg (index 1 is presumably a separator).
    all_time_spans = card.locator('div.vmXl.vmXl-mod-variant-large span')
    return_time_spans = card.locator('div.vmXl.vmXl-mod-variant-large').nth(1).locator('span')
    departure_clean = await _clock_time(all_time_spans, 0)
    arrival_clean = await _clock_time(all_time_spans, 2)
    return_departure_clean = await _clock_time(return_time_spans, 0)
    return_arrival_clean = await _clock_time(return_time_spans, 2)

    # Airports for the outbound and return legs
    airport_elements = card.locator('span.jLhY-airport-info')
    if await airport_elements.count() >= 4:
        airports = [
            await airport_elements.nth(k).locator('span').text_content()
            for k in range(4)
        ]
    else:
        airports = ["Unknown"] * 4
    (onward_departure_airport, onward_arrival_airport,
     return_departure_airport, return_arrival_airport) = airports

    # Airlines for the outbound and return legs. The same blocks also hold the
    # layover details: offsets 0/1 are used for the outbound leg and 5/6 for
    # the return leg.
    info_blocks = card.locator('div.c_cgF.c_cgF-mod-variant-default')
    if await info_blocks.count() >= 10:
        onward_airline = await info_blocks.nth(0).text_content()
        return_airline = await info_blocks.nth(5).text_content()
    else:
        onward_airline = "Unknown"
        return_airline = "Unknown"

    # Connections (outbound and return)
    stops = card.locator('span.JWEO-stops-text')
    onward_stops_int, onward_stop_duration, onward_flight_con_airport = \
        await _leg_connections(stops.nth(0), info_blocks.nth(1))
    return_stops_int, return_stop_duration, return_flight_con_airport = \
        await _leg_connections(stops.nth(1), info_blocks.nth(6))

    # Baggage data; defaults apply when the breakdown cannot be read
    num_of_carrys = 0
    num_of_checkedbags = 0
    try:
        baggage_breakdown = card.locator('div.ac27')
        num_of_carrys = await _bag_count(baggage_breakdown.locator('div > div').nth(1))
        num_of_checkedbags = await _bag_count(
            baggage_breakdown.locator('div').nth(1).locator('div').nth(1)
        )
    except Exception as e:
        print(f"Error collecting baggage information for flight {card_number}: {e}")

    # Price (scraped in USD, stored in NIS)
    price_text = await _text_or_unknown(card.locator('div.f8F1-price-text'))
    price_usd = float(price_text.replace("$", "").replace(",", "").strip())
    price_nis = round(price_usd * _USD_TO_NIS_RATE)
    print(f"Flight number {card_number} price: " + str(price_nis))

    # Snapshot field: the date on which this card was scraped
    snapshot_time = datetime.now().strftime("%d/%m/%Y")

    # Total flight time - outbound, then return
    durations = card.locator('div.xdW8')
    total_onward_time_clean = text_to_time_converter(await _text_or_unknown(durations.nth(0)))
    total_return_time_clean = text_to_time_converter(await _text_or_unknown(durations.nth(1)))

    # Key order defines the CSV column order
    return {
        'Origin': _CITY_NAMES.get(origin),
        'Destination': _CITY_NAMES.get(destination),
        'TTT': ttt,
        'LOS': los,
        'Departure Time': departure_clean,
        'Departure Date': departure_date.strftime("%d/%m/%Y"),
        'Arrival Time': arrival_clean,
        'Return Departure Time': return_departure_clean,
        'Return Date': return_date.strftime("%d/%m/%Y"),
        'Return Arrival Time': return_arrival_clean,
        'Onward Departure Airport': onward_departure_airport,
        'Onward Arrival Airport': onward_arrival_airport,
        'Return Departure Airport': return_departure_airport,
        'Return Arrival Airport': return_arrival_airport,
        'Onward Airlines': onward_airline,
        'Return Airlines': return_airline,
        'Onward Flight Connections': onward_stops_int,
        'Onward Connection Duration(min)': onward_stop_duration,
        'Onward Flight Connection Airport': onward_flight_con_airport,
        'Return Flight Connections': return_stops_int,
        'Return Connection Duration(min)': return_stop_duration,
        'Return Flight Connection Airport': return_flight_con_airport,
        'Carry-ons': num_of_carrys,
        'Checked Bags': num_of_checkedbags,
        'Price(NIS)': price_nis,
        'Snapshot Time': snapshot_time,
        'Site': "Momondo",
        'Total Onward Flight Duration(min)': total_onward_time_clean,
        'Total Return Flight Duration(min)': total_return_time_clean
    }


def _write_flights_csv(rows, csv_file):
    """Write ``rows`` (a list of dicts sharing the same keys) to ``csv_file``; no-op when empty."""
    if not rows:
        return
    with open(csv_file, mode="w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)
    print(f"Flight data saved to {csv_file}")


async def scrape_multiple_routes():
    """Scrape round-trip Momondo results for every route/date combination into a CSV.

    For each route, each departure offset TTT (1-30 days from today) and each
    stay length LOS (1-5 days), the search page is opened in a visible Chromium
    window and up to 100 result cards are converted to rows. A combination or
    card that fails is reported and skipped. All rows collected so far are
    written to "flights_data.csv" when the function ends, including when it is
    interrupted part-way through.
    """
    routes = [
        ("LON", "PAR"),
        ("PAR", "LON"),
        ("LON", "ROM"),
        ("ROM", "LON"),
        ("PAR", "ROM"),
        ("ROM", "PAR"),
    ]

    all_flights_data = []

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                context = await browser.new_context()
                page = await context.new_page()

                today = datetime.now()

                for origin, destination in routes:
                    for ttt in range(1, 31):
                        for los in range(1, 6):
                            try:
                                departure_date = today + timedelta(days=ttt)
                                return_date = departure_date + timedelta(days=los)
                                results_url = (
                                    f"https://www.momondo.com/flight-search/{origin}-{destination}/"
                                    f"{departure_date.strftime('%Y-%m-%d')}/{return_date.strftime('%Y-%m-%d')}?return=true"
                                )
                                await page.context.clear_cookies()
                                print(f"Scraping route: {origin} -> {destination} (TTT={ttt}, LOS={los})")

                                # A captcha that rendered late on the previous
                                # page is dealt with before moving on
                                if await handle_captcha(page):
                                    print("Resuming")

                                await page.goto(results_url, timeout=60000)
                                await page.wait_for_load_state("load", timeout=60000)

                                # Check the page that was just loaded, so this
                                # combination is not lost to a captcha
                                if await handle_captcha(page):
                                    print("Resuming")

                                flight_cards, available = await _load_flight_cards(page, _CARDS_PER_SEARCH)

                                for i in range(available):
                                    try:
                                        flight_data = await _scrape_flight_card(
                                            flight_cards.nth(i), i + 1, origin, destination,
                                            ttt, los, departure_date, return_date,
                                        )
                                        all_flights_data.append(flight_data)
                                    except Exception as e:
                                        print(f"Error collecting flight card {i + 1}: {e}")

                            except Exception as e:
                                print(f"Error occurred for route {origin} -> {destination} (TTT={ttt}, LOS={los}): {e}")
                                continue
                            print(f"\nTotal flights across all routes: {len(all_flights_data)}")
            finally:
                await browser.close()
    finally:
        # Runs on normal completion and on interruption alike, so a long run
        # never loses the rows it has already collected
        print(f"\nTotal flights across all routes: {len(all_flights_data)}")
        _write_flights_csv(all_flights_data, "flights_data.csv")

# Run the full scrape. Top-level `await` is only valid in an environment that
# supports it, such as a notebook cell; it is a syntax error in a plain script.
await scrape_multiple_routes()

Scraping route: PAR -> ROM (TTT=14, LOS=3)
Total flight cards found: 0
Total flight cards found: 16
Total flight cards found: 31
Total flight cards found: 46
Total flight cards found: 61
Total flight cards found: 76
Total flight cards found: 91
Total flight cards found: 106
Flight number 1 price: 625
Flight number 2 price: 618
Flight number 3 price: 625
Flight number 4 price: 625
Flight number 5 price: 625
Flight number 6 price: 625
Flight number 7 price: 625
Flight number 8 price: 536
Flight number 9 price: 644
Flight number 10 price: 648
Flight number 11 price: 648
Flight number 12 price: 648
Flight number 13 price: 559
Flight number 14 price: 662
Flight number 15 price: 662
Flight number 16 price: 574
Flight number 17 price: 662
Flight number 18 price: 574
Flight number 19 price: 688
Flight number 20 price: 684
Flight number 21 price: 596
Flight number 22 price: 688
Flight number 23 price: 688
Flight number 24 price: 692
Flight number 25 price: 692
Flight number 26 price: 696
Flight