***checkpoint***

In [1]:
import os
import csv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import re
from datetime import datetime
import time

# Define the function to click the "Load earlier flights" button until it disappears or after a certain number of attempts
def click_load_earlier_until_disappear(driver, max_attempts=5):
    """Press the "Load earlier flights" button until it is gone.

    Args:
        driver: Selenium WebDriver showing the flights page.
        max_attempts: Upper bound on the number of clicks (default 5).

    The button is clicked through JavaScript and each successful click is
    followed by a 2-second pause. The loop ends early as soon as the button
    cannot be found or cannot be interacted with.
    """
    button_xpath = '//button[contains(@class, "btn-flights-load") and contains(text(), "Load earlier flights")]'

    for _attempt in range(max_attempts):
        try:
            load_button = driver.find_element(By.XPATH, button_xpath)
            driver.execute_script("arguments[0].click();", load_button)
        except (NoSuchElementException, ElementNotInteractableException):
            # Button is gone or unusable: stop clicking.
            return
        else:
            # Pause after the click before looking for the button again.
            time.sleep(2)

# Define the function to click the "Load later flights" button until it disappears or after a certain number of attempts
def click_load_later_until_disappear(driver, max_attempts=5):
    """Press the "Load later flights" button until it is gone.

    Args:
        driver: Selenium WebDriver showing the flights page.
        max_attempts: Upper bound on the number of clicks (default 5).

    The button is clicked through JavaScript and each successful click is
    followed by a 2-second pause. The loop ends early as soon as the button
    cannot be found or cannot be interacted with.
    """
    button_xpath = '//button[contains(@class, "btn-flights-load") and contains(text(), "Load later flights")]'

    for _attempt in range(max_attempts):
        try:
            load_button = driver.find_element(By.XPATH, button_xpath)
            driver.execute_script("arguments[0].click();", load_button)
        except (NoSuchElementException, ElementNotInteractableException):
            # Button is gone or unusable: stop clicking.
            return
        else:
            # Pause after the click before looking for the button again.
            time.sleep(2)

def save_data_to_csv(data, airport_name, current_date,
                     base_dir='c:/Users/httyd/Desktop/capstone/airports/Data'):
    """Write one airport's scraped rows to a dated CSV file.

    Args:
        data: Iterable of rows (each an iterable of cell values).
        airport_name: Airport name; characters that are illegal in Windows
            file names (``\\ / : " * ? < > |``) are stripped before use.
        current_date: Date string used both as the sub-folder name and as a
            suffix of the file name.
        base_dir: Root output folder. Defaults to the path that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The path of the CSV file that was written.
    """
    # Remove characters that cannot appear in a file name.
    safe_name = re.sub(r'[\\/:"*?<>|]+', '', airport_name)

    # One folder per scrape date; create it on first use.
    folder_path = f'{base_dir}/{current_date}'
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, f"Departures_{safe_name}_{current_date}.csv")
    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(data)
    return file_path

def get_data_with_date(data, time_difference):
    """Tag every flight row with the date heading that precedes it.

    Rows whose first cell starts with a weekday name followed by a comma are
    treated as date headings. Every later non-heading row gets two values
    appended in place: the heading text plus the current year, and
    ``time_difference``. Rows that appear before the first heading are left
    untouched.

    Args:
        data: List of mutable rows (lists of strings); modified in place.
        time_difference: Value appended unchanged to each tagged row.

    Returns:
        The same ``data`` list that was passed in.
    """
    weekday_prefixes = tuple(
        f"{day}," for day in
        ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    )
    year_suffix = " " + str(datetime.now().year)

    active_heading = None  # most recent date heading, with the year attached
    for record in data:
        if record[0].startswith(weekday_prefixes):
            active_heading = record[0] + year_suffix
            continue
        if active_heading is None:
            # No heading seen yet: nothing to attach to this row.
            continue
        record.extend((active_heading, time_difference))

    return data



def _read_clock_offset(driver, max_polls=10, poll_interval=2):
    """Return the UTC-minus-local offset read from the page clocks, or None.

    The page is polled up to ``max_polls`` times, ``poll_interval`` seconds
    apart. ``None`` is returned when no pair of parseable HH:MM values shows
    up within that budget.
    """
    clock_pattern = re.compile(r'\d{2}:\d{2}')
    for _ in range(max_polls):
        # Re-read the live DOM on every poll. A snapshot taken before the
        # clocks rendered never changes, so polling it could loop forever.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        utc_span = soup.find('span', class_='text-base')
        local_span = soup.find('span', class_='clock-time ng-binding')
        utc_text = utc_span.text.strip() if utc_span else ''
        local_text = local_span.text.strip() if local_span else ''

        if clock_pattern.fullmatch(utc_text) and clock_pattern.fullmatch(local_text):
            try:
                # strptime fills in a dummy date; only the difference matters.
                return datetime.strptime(utc_text, "%H:%M") - datetime.strptime(local_text, "%H:%M")
            except ValueError:
                # Digits matched the shape but are not a real time (e.g. 99:99).
                pass
        time.sleep(poll_interval)
    return None


def scrape_departures(airport_list, num_rows=None, headless=False):
    """Scrape the departures table of every airport not yet checkpointed.

    For each airport the function opens ``<Link>/departures`` in Edge,
    dismisses the cookie banner if present, expands the earlier/later flight
    lists, reads the table rows, tags them with their date heading and the
    UTC/local offset, writes them to a per-airport CSV and records the
    airport in a per-day checkpoint file so a re-run skips it.

    Args:
        airport_list: DataFrame with ``Name`` and ``Link`` columns.
        num_rows: If given, only the first ``num_rows`` airports are used.
        headless: Run Edge without a window. Defaults to False, which matches
            the previous behavior: the headless options were built but never
            handed to the driver.

    Airports whose clock values never appear are reported and skipped
    without being checkpointed, so a later run retries them.
    """
    if num_rows is not None:
        airport_list = airport_list.head(num_rows)

    current_date = datetime.now().strftime('%Y-%m-%d')

    # The checkpoint lives next to the day's CSV files.
    checkpoint_file_path = f'c:/Users/httyd/Desktop/capstone/airports/Data/{current_date}/DEPcheckpoint.csv'
    if os.path.isfile(checkpoint_file_path):
        processed_airports = set(pd.read_csv(checkpoint_file_path)['Name'])
    else:
        processed_airports = set()

    options = webdriver.EdgeOptions()
    if headless:
        options.add_argument('--headless')

    for _, airport in airport_list.iterrows():
        if airport['Name'] in processed_airports:
            continue

        departure_link = airport['Link'] + '/departures'
        data = []

        driver = webdriver.Edge(options=options)
        try:
            driver.get(departure_link)
            time.sleep(4)

            # Dismiss the cookie-consent banner when it is shown.
            try:
                button = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
                driver.execute_script("arguments[0].click();", button)
            except NoSuchElementException:
                pass

            time.sleep(3)

            # Expand the list in both directions (each helper clicks at most
            # its default max_attempts times).
            click_load_earlier_until_disappear(driver)
            click_load_later_until_disappear(driver)

            time_difference = _read_clock_offset(driver)
            if time_difference is None:
                print(f"Skipping {airport['Name']}: clock values not found on the page")
                continue

            # Snapshot the fully expanded page for table extraction.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', class_='table table-condensed table-hover data-table m-n-t-15')
            if table:
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if not cells:
                        continue
                    row_data = [cell.text.strip() for cell in cells]
                    # Pad short rows so every row has at least 7 fields.
                    if len(row_data) < 7:
                        row_data.extend([''] * (7 - len(row_data)))
                    row_data.append(airport['Name'])
                    data.append(row_data)
        finally:
            # Always release the browser, including on errors and skips.
            driver.quit()

        data = get_data_with_date(data, time_difference)
        save_data_to_csv(data, airport['Name'], current_date)

        # Persist progress after every airport so an interrupted run can resume.
        processed_airports.add(airport['Name'])
        checkpoint_df = pd.DataFrame({'Name': list(processed_airports)})
        checkpoint_df.to_csv(checkpoint_file_path, index=False)

# Load the airports to scrape. The CSV has no header row, so the column
# names are supplied here.
airport_list = pd.read_csv(
    "airport_list.csv",
    names=["Name", "Link"],
    header=None,
)

# None scrapes every row; an integer limits the run to the first N airports.
num_rows_to_scrape = None

# Run the scrape over the selected airports.
scrape_departures(airport_list, num_rows=num_rows_to_scrape)


KeyboardInterrupt: 

In [10]:
from confluent_kafka import Producer
import json

# Settings handed to confluent_kafka.Producer: the broker to connect to.
producer_config = {"bootstrap.servers": "localhost:9092"}

def produce_to_kafka(data_df):
    """Publish every row of ``data_df`` as a JSON message to 'Departures'.

    Args:
        data_df: pandas DataFrame; each row is serialized with
            ``Series.to_json()`` and sent as one UTF-8 encoded message.

    The function blocks until all queued messages have been delivered or
    have failed. Each delivery result is printed by the callback.
    """
    p = Producer(producer_config)

    def delivery_report(err, msg):
        """ Called once for each message produced to indicate delivery result.
            Triggered by poll() or flush(). """
        if err is not None:
            print('Message delivery failed: {}'.format(err))
        else:
            print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))

    for _, row in data_df.iterrows():
        payload = row.to_json().encode('utf-8')

        # Serve delivery callbacks from earlier produce() calls.
        p.poll(0)

        try:
            # Asynchronous: this only enqueues the message locally.
            p.produce('Departures', payload, callback=delivery_report)
        except BufferError:
            # The local queue is full (large frames outrun the broker).
            # Drain it completely, then enqueue this row again.
            p.flush()
            p.produce('Departures', payload, callback=delivery_report)

    # Wait for outstanding messages and their delivery callbacks.
    p.flush()

# After getting airport_data_with_date from scraping
# NOTE(review): airport_data_with_date is not defined in the cells shown here.
# It is presumably a pandas DataFrame built in another cell, since
# produce_to_kafka calls .iterrows() on its argument; confirm before re-running.
produce_to_kafka(airport_data_with_date)


Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departures [0]
Message delivered to Departu

In [9]:
from confluent_kafka import Consumer, KafkaError,KafkaException
import json

# Settings handed to confluent_kafka.Consumer: broker address, consumer
# group, and where to start reading when the group has no stored offset.
consumer_config = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "my_consumer_group",
    "auto.offset.reset": "earliest",
}

def consume_and_save_to_json(file_path):
    """Read the 'Departures' topic and dump the decoded messages to JSON.

    Args:
        file_path: Destination of the JSON file (overwritten).

    Messages are collected until the end of a partition is reported or the
    user interrupts with Ctrl-C. Whatever was collected up to that point is
    written to ``file_path``, including when an error ends the loop.

    Raises:
        KafkaException: For any consumer error other than end-of-partition.
    """
    # librdkafka only emits _PARTITION_EOF when 'enable.partition.eof' is on.
    # Without it the break below never fires and the loop runs until Ctrl-C.
    # An explicit value in consumer_config still takes precedence.
    c = Consumer({'enable.partition.eof': True, **consumer_config})

    # Subscribe to the 'Departures' topic
    c.subscribe(['Departures'])

    # List to hold received data
    received_data = []

    try:
        while True:
            msg = c.poll(timeout=1.0)
            if msg is None:
                # Nothing arrived within the timeout; keep waiting.
                continue

            error = msg.error()
            if error:
                if error.code() == KafkaError._PARTITION_EOF:
                    # Reached the end of a partition: stop consuming.
                    break
                raise KafkaException(error)

            payload = msg.value()
            if payload is None:
                # Message without a body (e.g. a tombstone): nothing to decode.
                continue

            try:
                received_data.append(json.loads(payload.decode('utf-8')))
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # Skip malformed messages instead of aborting the whole run.
                print(f"Error decoding JSON: {e}")

    except KeyboardInterrupt:
        # Manual stop: fall through and save what has been received.
        pass

    finally:
        # Close the consumer
        c.close()

        # Save received data to a JSON file
        with open(file_path, 'w') as json_file:
            json.dump(received_data, json_file, indent=4)

# Destination file for the consumed messages.
json_file_path = 'Departures.json'

# Consume from Kafka and write the collected messages to that file.
consume_and_save_to_json(file_path=json_file_path)


In [7]:
import os
import csv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import re
from datetime import datetime
import time

# Define the function to click the "Load earlier flights" button until it disappears or after a certain number of attempts
def click_load_earlier_until_disappear(driver, max_attempts=5):
    """Press the "Load earlier flights" button until it is gone.

    Args:
        driver: Selenium WebDriver showing the flights page.
        max_attempts: Upper bound on the number of clicks (default 5).

    The button is clicked through JavaScript and each successful click is
    followed by a 2-second pause. The loop ends early as soon as the button
    cannot be found or cannot be interacted with.
    """
    button_xpath = '//button[contains(@class, "btn-flights-load") and contains(text(), "Load earlier flights")]'

    for _attempt in range(max_attempts):
        try:
            load_button = driver.find_element(By.XPATH, button_xpath)
            driver.execute_script("arguments[0].click();", load_button)
        except (NoSuchElementException, ElementNotInteractableException):
            # Button is gone or unusable: stop clicking.
            return
        else:
            # Pause after the click before looking for the button again.
            time.sleep(2)

# Define the function to click the "Load later flights" button until it disappears or after a certain number of attempts
def click_load_later_until_disappear(driver, max_attempts=5):
    """Press the "Load later flights" button until it is gone.

    Args:
        driver: Selenium WebDriver showing the flights page.
        max_attempts: Upper bound on the number of clicks (default 5).

    The button is clicked through JavaScript and each successful click is
    followed by a 2-second pause. The loop ends early as soon as the button
    cannot be found or cannot be interacted with.
    """
    button_xpath = '//button[contains(@class, "btn-flights-load") and contains(text(), "Load later flights")]'

    for _attempt in range(max_attempts):
        try:
            load_button = driver.find_element(By.XPATH, button_xpath)
            driver.execute_script("arguments[0].click();", load_button)
        except (NoSuchElementException, ElementNotInteractableException):
            # Button is gone or unusable: stop clicking.
            return
        else:
            # Pause after the click before looking for the button again.
            time.sleep(2)

def save_data_to_csv(data, airport_name, current_date,
                     base_dir='c:/Users/httyd/Desktop/capstone/airports/Data'):
    """Write one airport's scraped rows to a dated CSV file.

    Args:
        data: Iterable of rows (each an iterable of cell values).
        airport_name: Airport name; characters that are illegal in Windows
            file names (``\\ / : " * ? < > |``) are stripped before use.
        current_date: Date string used both as the sub-folder name and as a
            suffix of the file name.
        base_dir: Root output folder. Defaults to the path that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The path of the CSV file that was written.
    """
    # Remove characters that cannot appear in a file name.
    safe_name = re.sub(r'[\\/:"*?<>|]+', '', airport_name)

    # One folder per scrape date; create it on first use.
    folder_path = f'{base_dir}/{current_date}'
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, f"Departures_{safe_name}_{current_date}.csv")
    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(data)
    return file_path

def get_data_with_date(data, time_difference):
    """Tag every flight row with the date heading that precedes it.

    Rows whose first cell starts with a weekday name followed by a comma are
    treated as date headings. Every later non-heading row gets two values
    appended in place: the heading text plus the current year, and
    ``time_difference``. Rows that appear before the first heading are left
    untouched.

    Args:
        data: List of mutable rows (lists of strings); modified in place.
        time_difference: Value appended unchanged to each tagged row.

    Returns:
        The same ``data`` list that was passed in.
    """
    weekday_prefixes = tuple(
        f"{day}," for day in
        ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    )
    year_suffix = " " + str(datetime.now().year)

    active_heading = None  # most recent date heading, with the year attached
    for record in data:
        if record[0].startswith(weekday_prefixes):
            active_heading = record[0] + year_suffix
            continue
        if active_heading is None:
            # No heading seen yet: nothing to attach to this row.
            continue
        record.extend((active_heading, time_difference))

    return data



def scrape_departures(airport_list, num_rows=None):
    """Open each unprocessed airport's departures page and submit the login form.

    This variant only exercises the login flow: it loads the page, dismisses
    the cookie banner if present, opens the auth dialog, fills in the
    credentials and clicks the login button. It extracts no flight data and
    writes no checkpoint.

    Args:
        airport_list: DataFrame with ``Name`` and ``Link`` columns.
        num_rows: If given, only the first ``num_rows`` airports are used.

    Raises:
        RuntimeError: If an airport needs processing but the
            ``SCRAPER_EMAIL`` / ``SCRAPER_PASSWORD`` environment variables
            are not set.
    """
    if num_rows is not None:
        airport_list = airport_list.head(num_rows)

    current_date = datetime.now().strftime('%Y-%m-%d')

    # Airports already listed in today's checkpoint file are skipped.
    checkpoint_file_path = f'c:/Users/httyd/Desktop/capstone/airports/Data/{current_date}/DEPcheckpoint.csv'
    if os.path.isfile(checkpoint_file_path):
        processed_airports = set(pd.read_csv(checkpoint_file_path)['Name'])
    else:
        processed_airports = set()

    # Credentials come from the environment so that no secret is stored in
    # the notebook or in version control.
    email = os.environ.get('SCRAPER_EMAIL')
    password = os.environ.get('SCRAPER_PASSWORD')

    for _, airport in airport_list.iterrows():
        if airport['Name'] in processed_airports:
            continue

        if not email or not password:
            raise RuntimeError(
                'Set the SCRAPER_EMAIL and SCRAPER_PASSWORD environment variables before logging in'
            )

        departure_link = airport['Link'] + '/departures'

        driver = webdriver.Edge()
        try:
            driver.get(departure_link)
            time.sleep(4)

            # Dismiss the cookie-consent banner when it is shown.
            try:
                button = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
                driver.execute_script("arguments[0].click();", button)
            except NoSuchElementException:
                pass

            time.sleep(5)

            # Open the login dialog.
            button = driver.find_element(By.ID, "auth-button")
            driver.execute_script("arguments[0].click();", button)

            # NOTE(review): absolute XPaths depend on the exact page layout
            # (div[10]); they break if the DOM changes.
            EmailField = driver.find_element(By.XPATH, "/html/body/div[10]/div/div/div/form/div[1]/div/input")
            passwordField = driver.find_element(By.XPATH, "/html/body/div[10]/div/div/div/form/div[2]/div/input")
            EmailField.send_keys(email)
            passwordField.send_keys(password)
            login = driver.find_element(By.XPATH, "/html/body/div[10]/div/div/div/form/button")
            driver.execute_script("arguments[0].click();", login)
        finally:
            # Release the browser; otherwise one window leaks per airport.
            driver.quit()
        
        
       

# Load the airports to scrape. The CSV has no header row, so the column
# names are supplied here.
airport_list = pd.read_csv(
    "airport_list.csv",
    names=["Name", "Link"],
    header=None,
)

# None scrapes every row; an integer limits the run to the first N airports.
num_rows_to_scrape = None

# Run the scrape over the selected airports.
scrape_departures(airport_list, num_rows=num_rows_to_scrape)


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: MicrosoftEdge=122.0.2365.92)
Stacktrace:
	GetHandleVerifier [0x00007FF7E9F6B812+63538]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7E9EEF692+238274]
	(No symbol) [0x00007FF7E9D23407]
	(No symbol) [0x00007FF7E9D00AB7]
	(No symbol) [0x00007FF7E9D8F019]
	(No symbol) [0x00007FF7E9DA1882]
	(No symbol) [0x00007FF7E9D88A83]
	(No symbol) [0x00007FF7E9D5E55E]
	(No symbol) [0x00007FF7E9D5D71C]
	(No symbol) [0x00007FF7E9D5E111]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF7EA141CFC+1102764]
	(No symbol) [0x00007FF7E9DD25F6]
	(No symbol) [0x00007FF7E9E63DAC]
	(No symbol) [0x00007FF7E9E5BDA8]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF7EA140C45+1098485]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7E9EF9E51+281217]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7E9EF45D4+258564]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7E9EF470F+258879]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7E9EE8EE1+211729]
	BaseThreadInitThunk [0x00007FFF65AD257D+29]
	RtlUserThreadStart [0x00007FFF66B6AA58+40]


In [4]:
import os
import csv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import re
from datetime import datetime
import time

# Define the function to click the "Load earlier flights" button until it disappears or after a certain number of attempts
def click_load_earlier_until_disappear(driver, max_attempts=5):
    """Press the "Load earlier flights" button until it is gone.

    Args:
        driver: Selenium WebDriver showing the flights page.
        max_attempts: Upper bound on the number of clicks (default 5).

    The button is clicked through JavaScript and each successful click is
    followed by a 2-second pause. The loop ends early as soon as the button
    cannot be found or cannot be interacted with.
    """
    button_xpath = '//button[contains(@class, "btn-flights-load") and contains(text(), "Load earlier flights")]'

    for _attempt in range(max_attempts):
        try:
            load_button = driver.find_element(By.XPATH, button_xpath)
            driver.execute_script("arguments[0].click();", load_button)
        except (NoSuchElementException, ElementNotInteractableException):
            # Button is gone or unusable: stop clicking.
            return
        else:
            # Pause after the click before looking for the button again.
            time.sleep(2)

# Define the function to click the "Load later flights" button until it disappears or after a certain number of attempts
def click_load_later_until_disappear(driver, max_attempts=5):
    """Press the "Load later flights" button until it is gone.

    Args:
        driver: Selenium WebDriver showing the flights page.
        max_attempts: Upper bound on the number of clicks (default 5).

    The button is clicked through JavaScript and each successful click is
    followed by a 2-second pause. The loop ends early as soon as the button
    cannot be found or cannot be interacted with.
    """
    button_xpath = '//button[contains(@class, "btn-flights-load") and contains(text(), "Load later flights")]'

    for _attempt in range(max_attempts):
        try:
            load_button = driver.find_element(By.XPATH, button_xpath)
            driver.execute_script("arguments[0].click();", load_button)
        except (NoSuchElementException, ElementNotInteractableException):
            # Button is gone or unusable: stop clicking.
            return
        else:
            # Pause after the click before looking for the button again.
            time.sleep(2)

def save_data_to_csv(data, airport_name, current_date,
                     base_dir='c:/Users/httyd/Desktop/capstone/airports/Data'):
    """Write one airport's scraped rows to a dated CSV file.

    Args:
        data: Iterable of rows (each an iterable of cell values).
        airport_name: Airport name; characters that are illegal in Windows
            file names (``\\ / : " * ? < > |``) are stripped before use.
        current_date: Date string used both as the sub-folder name and as a
            suffix of the file name.
        base_dir: Root output folder. Defaults to the path that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The path of the CSV file that was written.
    """
    # Remove characters that cannot appear in a file name.
    safe_name = re.sub(r'[\\/:"*?<>|]+', '', airport_name)

    # One folder per scrape date; create it on first use.
    folder_path = f'{base_dir}/{current_date}'
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, f"Departures_{safe_name}_{current_date}.csv")
    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(data)
    return file_path

def get_data_with_date(data, time_difference):
    """Tag every flight row with the date heading that precedes it.

    Rows whose first cell starts with a weekday name followed by a comma are
    treated as date headings. Every later non-heading row gets two values
    appended in place: the heading text plus the current year, and
    ``time_difference``. Rows that appear before the first heading are left
    untouched.

    Args:
        data: List of mutable rows (lists of strings); modified in place.
        time_difference: Value appended unchanged to each tagged row.

    Returns:
        The same ``data`` list that was passed in.
    """
    weekday_prefixes = tuple(
        f"{day}," for day in
        ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    )
    year_suffix = " " + str(datetime.now().year)

    active_heading = None  # most recent date heading, with the year attached
    for record in data:
        if record[0].startswith(weekday_prefixes):
            active_heading = record[0] + year_suffix
            continue
        if active_heading is None:
            # No heading seen yet: nothing to attach to this row.
            continue
        record.extend((active_heading, time_difference))

    return data



def _read_clock_offset(driver, max_polls=10, poll_interval=2):
    """Return the UTC-minus-local offset read from the page clocks, or None.

    The page is polled up to ``max_polls`` times, ``poll_interval`` seconds
    apart. ``None`` is returned when no pair of parseable HH:MM values shows
    up within that budget.
    """
    clock_pattern = re.compile(r'\d{2}:\d{2}')
    for _ in range(max_polls):
        # Re-read the live DOM on every poll. A snapshot taken before the
        # clocks rendered never changes, so polling it could loop forever.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        utc_span = soup.find('span', class_='text-base')
        local_span = soup.find('span', class_='clock-time ng-binding')
        utc_text = utc_span.text.strip() if utc_span else ''
        local_text = local_span.text.strip() if local_span else ''

        if clock_pattern.fullmatch(utc_text) and clock_pattern.fullmatch(local_text):
            try:
                # strptime fills in a dummy date; only the difference matters.
                return datetime.strptime(utc_text, "%H:%M") - datetime.strptime(local_text, "%H:%M")
            except ValueError:
                # Digits matched the shape but are not a real time (e.g. 99:99).
                pass
        time.sleep(poll_interval)
    return None


def scrape_departures(airport_list, num_rows=None, headless=True):
    """Scrape the departures table of every airport not yet checkpointed.

    For each airport the function opens ``<Link>/departures`` in Edge,
    dismisses the cookie banner if present, expands the earlier/later flight
    lists, reads the table rows, tags them with their date heading and the
    UTC/local offset, writes them to a per-airport CSV and records the
    airport in a per-day checkpoint file so a re-run skips it.

    Args:
        airport_list: DataFrame with ``Name`` and ``Link`` columns.
        num_rows: If given, only the first ``num_rows`` airports are used.
        headless: Run Edge without a window (default True, as before).

    Airports whose clock values never appear are reported and skipped
    without being checkpointed, so a later run retries them.
    """
    if num_rows is not None:
        airport_list = airport_list.head(num_rows)

    current_date = datetime.now().strftime('%Y-%m-%d')

    # The checkpoint lives next to the day's CSV files.
    checkpoint_file_path = f'c:/Users/httyd/Desktop/capstone/airports/Data/{current_date}/DEPcheckpoint.csv'
    if os.path.isfile(checkpoint_file_path):
        processed_airports = set(pd.read_csv(checkpoint_file_path)['Name'])
    else:
        processed_airports = set()

    options = webdriver.EdgeOptions()
    if headless:
        options.add_argument('--headless')

    for _, airport in airport_list.iterrows():
        if airport['Name'] in processed_airports:
            continue

        departure_link = airport['Link'] + '/departures'
        data = []

        driver = webdriver.Edge(options=options)
        try:
            driver.get(departure_link)
            time.sleep(4)

            # Dismiss the cookie-consent banner when it is shown.
            try:
                button = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
                driver.execute_script("arguments[0].click();", button)
            except NoSuchElementException:
                pass

            # Expand the list in both directions (each helper clicks at most
            # its default max_attempts times).
            click_load_earlier_until_disappear(driver)
            click_load_later_until_disappear(driver)

            time_difference = _read_clock_offset(driver)
            if time_difference is None:
                print(f"Skipping {airport['Name']}: clock values not found on the page")
                continue

            # Snapshot the fully expanded page for table extraction.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', class_='table table-condensed table-hover data-table m-n-t-15')
            if table:
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if not cells:
                        continue
                    row_data = [cell.text.strip() for cell in cells]
                    # Pad short rows so every row has at least 7 fields.
                    if len(row_data) < 7:
                        row_data.extend([''] * (7 - len(row_data)))
                    row_data.append(airport['Name'])
                    data.append(row_data)
        finally:
            # Always release the browser, including on errors and skips.
            driver.quit()

        data = get_data_with_date(data, time_difference)
        save_data_to_csv(data, airport['Name'], current_date)

        # Persist progress after every airport so an interrupted run can resume.
        processed_airports.add(airport['Name'])
        checkpoint_df = pd.DataFrame({'Name': list(processed_airports)})
        checkpoint_df.to_csv(checkpoint_file_path, index=False)
        

# Load the airports to scrape. The CSV has no header row, so the column
# names are supplied here.
airport_list = pd.read_csv(
    "airport_list.csv",
    names=["Name", "Link"],
    header=None,
)

# None scrapes every row; an integer limits the run to the first N airports.
num_rows_to_scrape = None

# Run the scrape over the selected airports.
scrape_departures(airport_list, num_rows=num_rows_to_scrape)


KeyboardInterrupt: 