In [2]:
from bs4 import BeautifulSoup
import csv
import re
import os
import traceback
from datetime import datetime

# Name of the CSV file that receives the extracted flight rows
# (opened relative to the current working directory).
csv_file = "flight_data_final.csv"

# Directory scanned for saved HTML result pages: the current working directory.
folder_path = os.getcwd()

def extract_duration(duration_text):
    """Convert a duration such as '2h 30m', '2h' or '45m' to total minutes.

    Args:
        duration_text: Text containing the duration. May be None or empty.

    Returns:
        The duration in whole minutes as an int, or None when the input is
        empty or contains neither an hours nor a minutes component.
    """
    if not duration_text:
        return None

    # Primary form: hours, optionally followed by minutes ("2h", "2h 30m", "2h30").
    match = re.search(r'(\d+)h\s*(\d*)m?', duration_text)
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2)) if match.group(2) else 0
        return hours * 60 + minutes

    # Fallback: durations shorter than one hour carry no hours part ("45m").
    # Without this branch such values were silently reported as None.
    minutes_only = re.search(r'(\d+)\s*m\b', duration_text)
    if minutes_only:
        return int(minutes_only.group(1))

    return None
# ---------------------------------------------------------------------------
# Main extraction script.
#
# For every ``*.html`` file in ``folder_path``:
#   1. trip metadata (origin, destination, TTT, LOS, dates) is parsed from the
#      file name;
#   2. the page is parsed with BeautifulSoup and each flight-result container
#      is converted into one CSV row (at most 100 rows per file);
#   3. rows are appended to ``csv_file``.
# A failure while processing one container is logged with its traceback and
# the loop moves on to the next container.
# ---------------------------------------------------------------------------
print("Starting the extraction process...")

# Open CSV file for writing
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    # Header row; the column order must match the data row written below.
    writer.writerow([
        "Origin", "Destination", "TTT", "LOS", "Departure Time", "Departure Date", "Arrival Time",
        "Return Departure Time", "Return Date", "Return Arrival Time", "Onward Departure Airport",
        "Onward Arrival Airport", "Return Departure Airport", "Return Arrival Airport",
        "Onward Airlines", "Return Airlines", "Onward Flight Connections", "Onward Connection Duration(min)",
        "Onward Flight Connection Airport", "Return Flight Connections", "Return Connection Duration(min)",
        "Return Flight Connection Airport", "Carry-ons", "Checked Bags", "Price(NIS)", "Snapshot Time", "Site","Total Onward Flight Duration(min)","Total Return Flight Duration(min)"
    ])

    # Loop through all HTML files in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith(".html"):  # Ensure it's an HTML file
            continue

        print(f"Processing file: {filename}")  # Log file being processed

        file_path = os.path.join(folder_path, filename)
        # The file's ctime is used as the snapshot date.
        # NOTE(review): per the Python docs, os.path.getctime is the creation
        # time on Windows but the last metadata-change time on Unix — confirm
        # this matches the intended "snapshot" meaning on the target platform.
        creation_time = os.path.getctime(file_path)
        creation_date = datetime.fromtimestamp(creation_time).strftime("%d/%m/%Y")
        snapshot_time = creation_date

        # Extract information from the filename, e.g.
        # "Paris_to_Rome_TTT_10_LOS_1_20.03.25-21.03.25.html"
        filename_pattern = re.match(
            r"([A-Za-z]+)_to_([A-Za-z]+)_TTT_(\d+)_LOS_(\d+)_(\d{2}\.\d{2}\.\d{2})-(\d{2}\.\d{2}\.\d{2})",
            filename
        )

        if filename_pattern:
            origin = filename_pattern.group(1)
            destination = filename_pattern.group(2)
            ttt = filename_pattern.group(3)
            los = filename_pattern.group(4)
            departure_date = filename_pattern.group(5)  # Outbound date
            departure_date = datetime.strptime(departure_date, "%d.%m.%y").strftime("%d/%m/%Y")
            return_date = filename_pattern.group(6)  # Return date
            return_date = datetime.strptime(return_date, "%d.%m.%y").strftime("%d/%m/%Y")
        else:
            # File name does not follow the expected pattern: rows are still
            # written, with the metadata columns left empty.
            origin = destination = ttt = los = departure_date = return_date = None

        # Load and parse HTML file
        with open(file_path, "r", encoding="utf-8") as html_file:
            soup = BeautifulSoup(html_file, "html.parser")

        # Find all flight result containers
        containers = soup.find_all("div", class_="Fxw9-result-item-container")
        print(f"Found {len(containers)} flight containers in {filename}")  # Log flight count
        flight_count = 0  # number of rows written for this file
        for container in containers:
            # Cap the output at 100 rows per HTML file.
            if flight_count >= 100:
                print(f"Reached 100 flights for {filename},skipping remaining")
                break
            try:
                flights = container.find_all("li", class_="hJSA-item")
                if len(flights) < 2:
                    print(f"Skipping {filename}: Not enough flights found.")
                    continue  # Skip if there aren't both outbound and return flights

                # Baggage allowance: each fee box is identified by its
                # aria-label; the value is read from the second inner div.
                fee_boxes = container.find_all("div", class_="ac27-fee-box")

                carry_on = None
                checked_bags = None

                for box in fee_boxes:
                    aria_label = box.get("aria-label", "").lower()
                    inner_divs = box.find_all("div", class_="ac27-inner")

                    if "carry-on" in aria_label and len(inner_divs) > 1:
                        carry_on = inner_divs[1].text.strip()

                    if "checked bag" in aria_label and len(inner_divs) > 1:
                        checked_bags = inner_divs[1].text.strip()

                # Extract outbound & return flight details
                departure_time = arrival_time = onward_departure_airport = onward_arrival_airport = onward_airline = onward_stops = onward_stop_duration = onward_flight_con_airport = None
                return_departure_time = return_arrival_time = return_departure_airport = return_arrival_airport = return_airline = return_stops = return_stop_duration = return_flight_con_airport = None

                for i, flight in enumerate(flights):
                    # Extract Departure & Arrival Times (the "–" separator span is dropped)
                    times = flight.find("div", class_="vmXl vmXl-mod-variant-large")
                    if times:
                        time_spans = [span.get_text(strip=True) for span in times.find_all("span") if span.get_text(strip=True) != "–"]
                        extracted_departure_time = time_spans[0] if len(time_spans) > 0 else None
                        extracted_arrival_time = time_spans[1] if len(time_spans) > 1 else None
                    else:
                        extracted_departure_time, extracted_arrival_time = None, None

                    # Extract Airline Name (taken from the logo image's alt text)
                    airline_img = flight.find("img", src=re.compile(r"provider-logos/airlines"))
                    extracted_airline = airline_img["alt"] if airline_img and airline_img.has_attr("alt") else None

                    # Extract Stops: first number in the stops text, 0 when
                    # the text is missing or contains no digits.
                    stops_elem = flight.find("span", class_="JWEO-stops-text")
                    stops_text = stops_elem.get_text(strip=True) if stops_elem else None
                    match = re.search(r"(\d+)", stops_text) if stops_text else None
                    extracted_stops = int(match.group(1)) if match else 0

                    # Extract Departure & Arrival Airports
                    airports = flight.find_all("span", class_="jLhY-airport-info")
                    extracted_departure_airport = airports[0].find("span").get_text(strip=True) if len(airports) > 0 else None
                    extracted_arrival_airport = airports[1].find("span").get_text(strip=True) if len(airports) > 1 else None

                    # Extract Connection Duration & Airport: the duration is the
                    # part of the title before " layover"; the element's text is
                    # stored as the connection airport.
                    connection_elem = flight.find("span", title=re.compile(r"layover"))
                    extracted_connection_duration = connection_elem["title"].split(" layover")[0] if connection_elem else None
                    extracted_connection_airport = connection_elem.get_text(strip=True) if connection_elem else None
                    extracted_connection_duration = extract_duration(extracted_connection_duration)

                    # Assign extracted values to outbound or return flight.
                    # Index 0 is the outbound leg; every later item overwrites
                    # the return-leg values.
                    if i == 0:  # Outbound
                        departure_time, arrival_time = extracted_departure_time, extracted_arrival_time
                        onward_departure_airport, onward_arrival_airport = extracted_departure_airport, extracted_arrival_airport
                        onward_airline = extracted_airline
                        onward_stops = extracted_stops
                        onward_stop_duration = extracted_connection_duration
                        onward_flight_con_airport = extracted_connection_airport

                    else:  # Return
                        return_departure_time, return_arrival_time = extracted_departure_time, extracted_arrival_time
                        return_departure_airport, return_arrival_airport = extracted_departure_airport, extracted_arrival_airport
                        return_airline = extracted_airline
                        return_stops = extracted_stops
                        return_stop_duration = extracted_connection_duration
                        return_flight_con_airport = extracted_connection_airport

                # Total flight durations are read from fixed positions (1 and 3)
                # among the matching elements.
                # NOTE(review): presumably positions 0 and 2 hold other per-leg
                # text — confirm against the page markup.
                onward_duration_min = None
                return_duration_min = None

                duration_elements = container.find_all("div", class_="vmXl vmXl-mod-variant-default")
                # Guard each index separately: previously a container with 2 or 3
                # matching elements raised IndexError on index 3, and the whole
                # row was lost to the except handler below.
                if len(duration_elements) > 1:
                    onward_duration_min = extract_duration(duration_elements[1].get_text(strip=True))
                if len(duration_elements) > 3:
                    return_duration_min = extract_duration(duration_elements[3].get_text(strip=True))

                # Extract Price (digits only; currency symbols and separators removed)
                price_elem = container.find("div", class_="f8F1-price-text")
                price = re.sub(r"[^\d]", "", price_elem.get_text(strip=True)) if price_elem else None

                print(f"Writing to CSV: Departure {departure_time}, Arrival {arrival_time}, Price {price}")

                # Append the final data row (same column order as the header)
                writer.writerow([
                    origin, destination, ttt, los,
                    departure_time, departure_date, arrival_time,
                    return_departure_time, return_date, return_arrival_time,
                    onward_departure_airport, onward_arrival_airport,
                    return_departure_airport, return_arrival_airport,
                    onward_airline, return_airline,
                    onward_stops, onward_stop_duration, onward_flight_con_airport,
                    return_stops, return_stop_duration, return_flight_con_airport,
                    carry_on, checked_bags, price, snapshot_time, "Kayak",onward_duration_min,return_duration_min
                ])
                flight_count += 1
            except Exception:
                # Best-effort extraction: log the full traceback for this
                # container and continue with the next one.
                print(f"Error in {filename}: {traceback.format_exc()}")

print(f"All flight data successfully saved to {csv_file}")

Starting the extraction process...
Processing file: Paris_to_Rome_TTT_10_LOS_1_20.03.25-21.03.25.html
Found 114 flight containers in Paris_to_Rome_TTT_10_LOS_1_20.03.25-21.03.25.html
Writing to CSV: Departure 17:10, Arrival 19:05, Price 613
Writing to CSV: Departure 06:20, Arrival 23:15, Price 482
Writing to CSV: Departure 17:10, Arrival 19:05, Price 620
Writing to CSV: Departure 17:10, Arrival 19:05, Price 613
Skipping Paris_to_Rome_TTT_10_LOS_1_20.03.25-21.03.25.html: Not enough flights found.
Skipping Paris_to_Rome_TTT_10_LOS_1_20.03.25-21.03.25.html: Not enough flights found.
Writing to CSV: Departure 17:10, Arrival 19:05, Price 638
Writing to CSV: Departure 07:40, Arrival 09:40, Price 638
Writing to CSV: Departure 07:40, Arrival 09:40, Price 638
Writing to CSV: Departure 12:45, Arrival 14:40, Price 652
Writing to CSV: Departure 16:30, Arrival 18:25, Price 660
Writing to CSV: Departure 09:45, Arrival 11:50, Price 645
Writing to CSV: Departure 12:45, Arrival 14:40, Price 652
Writing