# In [3]:
import os
import csv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import re

def save_data_to_csv(data, airport_name, current_date,
                     base_dir='c:/Users/httyd/Desktop/capstone/airports/Data'):
    """Write the scraped rows for one airport to a dated CSV file.

    The file is created at
    ``<base_dir>/<current_date>/Weather_<airport_name>_<current_date>.csv``;
    the dated folder is created when missing and an existing file is
    overwritten.

    Args:
        data: Iterable of dicts, one per CSV row. Rows may have different
            keys; a key missing from a row is written as an empty cell.
        airport_name: Airport name used in the file name. Backslash, slash,
            colon, double quote, ``*``, ``?``, ``<``, ``>`` and ``|`` are
            removed from it first.
        current_date: Date string used for both the folder and file name.
        base_dir: Root output folder. Defaults to the original hard-coded
            location so existing callers are unaffected.
    """
    # Remove characters that are not allowed in file names
    airport_name = re.sub(r'[\\/:"*?<>|]+', '', airport_name)

    # Create the dated output folder if it doesn't exist
    folder_path = f'{base_dir}/{current_date}'
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, f"Weather_{airport_name}_{current_date}.csv")

    # Collect column names in first-seen order. A set would make the column
    # order arbitrary and different from one run to the next.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


def save_checkpoint(processed_airports, current_date,
                    base_dir='c:/Users/httyd/Desktop/capstone/airports/Data'):
    """Persist the names of already-processed airports for ``current_date``.

    The names are written as a single ``Name`` column to
    ``<base_dir>/<current_date>/checkpoint.csv``.

    Args:
        processed_airports: Iterable (typically a set) of airport names.
        current_date: Date string selecting the dated folder.
        base_dir: Root output folder. Defaults to the original hard-coded
            location so existing callers are unaffected.
    """
    folder_path = f'{base_dir}/{current_date}'
    # The folder may not exist yet if no data file has been written today.
    os.makedirs(folder_path, exist_ok=True)
    checkpoint_file_path = f'{folder_path}/checkpoint.csv'

    # Sort for a stable file; key=str keeps sorting safe for non-string names.
    checkpoint_df = pd.DataFrame({'Name': sorted(processed_airports, key=str)})

    # Write to a temporary file and swap it in, so an interrupted run cannot
    # leave a truncated checkpoint behind.
    temp_file_path = checkpoint_file_path + '.tmp'
    checkpoint_df.to_csv(temp_file_path, index=False)
    os.replace(temp_file_path, checkpoint_file_path)

def load_checkpoint(current_date,
                    base_dir='c:/Users/httyd/Desktop/capstone/airports/Data'):
    """Return the set of airport names already processed for ``current_date``.

    Reads the ``Name`` column of ``<base_dir>/<current_date>/checkpoint.csv``.

    Args:
        current_date: Date string selecting the dated folder.
        base_dir: Root output folder. Defaults to the original hard-coded
            location so existing callers are unaffected.

    Returns:
        A set of the values in the ``Name`` column. The set is empty when
        the file is missing, is empty, or has no ``Name`` column, so that
        scraping starts from scratch instead of failing.
    """
    checkpoint_file_path = f'{base_dir}/{current_date}/checkpoint.csv'
    if not os.path.isfile(checkpoint_file_path):
        return set()

    try:
        checkpoint_df = pd.read_csv(checkpoint_file_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. left by an interrupted write) has nothing
        # to parse; treat it as "nothing processed yet".
        return set()

    if 'Name' not in checkpoint_df.columns:
        return set()
    return set(checkpoint_df['Name'])

def _parse_detail_rows(soup):
    """Return one dict per ``<ul>`` inside the ``<tr class="slave">`` rows."""
    records = []
    for row in soup.find_all('tr', class_='slave'):
        for ul in row.find_all('ul'):
            record = {}
            for li in ul.find_all('li'):
                key, separator, value = li.text.strip().partition(':')
                if not separator:
                    # Not a "key: value" item; skip it instead of aborting
                    # the whole airport.
                    continue
                record[key.strip()] = value.strip()
            records.append(record)
    return records


def _attach_dates(soup, records):
    """Set ``Date`` on each record from the matching "master expandable" row."""
    date_rows = soup.find_all('tr', class_='master expandable')
    # Records and date rows are paired by position; extra items on either
    # side are ignored.
    # NOTE(review): this assumes one record per master row -- confirm against
    # the page layout if a slave row can hold several <ul> elements.
    for record, date_row in zip(records, date_rows):
        cells = date_row.find_all('td')
        if len(cells) > 1:
            # The date is taken from the second column (index 1).
            record['Date'] = cells[1].text.strip()


def _attach_index(soup, records):
    """Set ``index`` on every record from the ``chart-center`` span text."""
    for container in soup.find_all('div', class_='chart-center'):
        span = container.find('span')
        if span is None:
            continue
        value = span.text.strip()
        # When several containers exist, the last one wins.
        for record in records:
            record['index'] = value


def scrape_weather(airport_list, num_rows=None):
    """Scrape the weather page of each airport and save one CSV per airport.

    For every row of ``airport_list`` not yet recorded in today's checkpoint,
    the page at ``<Link>/weather`` is opened in a headless Edge browser, the
    cookie-consent button is clicked when present, and the page is parsed.
    The rows are saved with ``save_data_to_csv`` and the checkpoint is
    updated after each airport, so an interrupted run can be resumed.

    Args:
        airport_list: DataFrame with ``Name`` and ``Link`` columns.
        num_rows: When not ``None``, only the first ``num_rows`` rows of
            ``airport_list`` are considered. ``None`` considers all rows.
    """
    # NOTE: no login is performed. If one is ever required, read the
    # credentials from the environment or a config file, never from source.
    current_date = time.strftime('%Y-%m-%d')
    processed_airports = load_checkpoint(current_date)

    options = webdriver.EdgeOptions()
    options.add_argument('--headless')

    if num_rows is not None:
        airport_list = airport_list.head(num_rows)

    for _, airport in airport_list.iterrows():
        airport_name = airport['Name']
        if airport_name in processed_airports:
            continue

        weather_link = airport['Link'] + '/weather'

        driver = webdriver.Edge(options=options)
        try:
            driver.get(weather_link)
            time.sleep(4)  # give the page time to render

            try:
                button = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
            except NoSuchElementException:
                pass  # no consent button on this page; move on
            else:
                driver.execute_script("arguments[0].click();", button)
                time.sleep(3)

            time.sleep(3)
            html = driver.page_source
        finally:
            # Always close the browser, even when loading the page fails,
            # so failed airports do not leave browser processes behind.
            driver.quit()

        soup = BeautifulSoup(html, 'html.parser')

        data = _parse_detail_rows(soup)
        for row in data:
            row['Airport_name'] = airport_name
        _attach_dates(soup, data)
        _attach_index(soup, data)

        save_data_to_csv(data, airport_name, current_date)

        # Record progress after every airport so a restart skips it.
        processed_airports.add(airport_name)
        save_checkpoint(processed_airports, current_date)

# Load the airport list. The file has no header row, so the two columns are
# named here.
airport_list = pd.read_csv(
    "airport_list.csv",
    header=None,
    names=["Name", "Link"],
)

# Optional row limit handed to scrape_weather as num_rows; None requests no limit.
num_rows_to_scrape = None

# Run the scraper over the loaded list.
scrape_weather(airport_list, num_rows=num_rows_to_scrape)


# Notebook output residue: KeyboardInterrupt: