# Extraction of the dataset 

Code to log onto the airdata website and extract information to be stored onto a database. 

In [1]:
import selenium 

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

import json, os

In [2]:
import airdata_login

In [3]:
# extract params
login_URL = airdata_login.login_page_URL
driver = airdata_login.driver 

In [4]:
# set login info
# don't have to repeat this if already logged in once 
cookies_filename = 'login_11_2_2023_cookies.json'
airdata_login.new_login_to_airdata(driver,login_URL,savecookies=True,cookies_filename=cookies_filename, login_details= ('pietro.cicuta@gmail.com','cicutadrone1'))

Cookies successfully saved.


In [5]:
# testing load_cookies 
cookies_filename = 'login_11_2_2023_cookies.json'
driver.get(login_URL)
airdata_login.load_cookies(driver, cookies_filename,login_URL)

now navigate to the flight log page

In [6]:
flight_log_URL = 'https://app.airdata.com/flight/last'
driver.get(flight_log_URL)


After inspection of the HTML of the flight logs page, it seems that the links are stored in a  DataTables structure that includes pagination for flight logs, along with a date and time for each flight. Goal is to extract the relevent info about each flight and keep track of the date and time of flight too so will store this in a pandas dataframe 

In [7]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [8]:
flight_data_df = pd.DataFrame(columns=['Page URL', 'CSV Filename']).set_index(pd.Index([], name='Time'))

flight_data_df

Unnamed: 0_level_0,Page URL,CSV Filename
Time,Unnamed: 1_level_1,Unnamed: 2_level_1


In [9]:
from datetime import datetime

In [13]:
# create a WebDriverWait object 
wait = WebDriverWait(driver, 10)  # 10 seconds wait time

driver.get(flight_log_URL)
flight_data_df = pd.DataFrame(columns=['Page URL', 'CSV Filename', 'CSV download link']).set_index(pd.Index([], name='Time'))

def handle_csv_download():
    try:
        csv_link = wait.until(EC.element_to_be_clickable((By.ID, "csv_link")))
        # If the CSV link found, click it to download the file
        driver.execute_script("arguments[0].click();", csv_link)
    except TimeoutException:
        print("CSV link not found for the current flight detail page.")

# Function to extract flight details to then be stored in the dataframe
def process_flight_row(row):
    flight_info = row.find_elements(By.TAG_NAME, "td")
    if flight_info and flight_info[1].text != 'OVERVIEW':
        flight_date_time_text = flight_info[1].text.replace('st', '').replace('nd', '').replace('rd', '').replace('th', '')
        flight_dt_object = datetime.strptime(flight_date_time_text, "%b %d, %Y %I:%M%p")
        flight_datetime_index = flight_dt_object.strftime('%Y-%m-%d %H:%M:%S')

        # Open flight detail in a new tab
        driver.execute_script("window.open(arguments[0]);", flight_info[1].find_element(By.TAG_NAME, "a").get_attribute('href'))
        driver.switch_to.window(driver.window_handles[1])
        flight_URL = driver.current_url
        
        # extract the hyperlink for csv download
        original_link_element = row.find_element(By.XPATH, ".//a[contains(@title, 'Download Original Flight Log File')]")
        csv_download_link = 'https://app.airdata.com/'+ original_link_element.get_attribute('href') 
        
        csv_filename_format = flight_dt_object.strftime("%b-%dth-%Y-%I-%M%p") + "-Flight-Airdata.csv"
        
        flight_data_df.loc[flight_datetime_index] = [flight_URL, csv_filename_format,csv_download_link]
        driver.close()  # Close the current tab
        driver.switch_to.window(driver.window_handles[0])  # Switch back to the main tab

# iterate through the sidebar
for i in range(1, 4):  
    next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"#item_list_dt_paginate .paginate_button:nth-child({i})")))
    next_button.click()
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#item_list_dt tbody tr")))
    flight_log_rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#item_list_dt tbody tr")))

    # now iterate through the row of links to each flight log
    for row in flight_log_rows:
        process_flight_row(row)
    # Optional: Add a short pause to ensure the page has fully loaded and to avoid being detected as a bot
    time.sleep(2)
    
flight_data_df.head()      

ElementClickInterceptedException: Message: element click intercepted: Element <a class="paginate_button previous disabled" aria-controls="item_list_dt" data-dt-idx="0" tabindex="0" id="item_list_dt_previous">...</a> is not clickable at point (34, 211). Other element would receive the click: <div id="div_empty2" style="display: inline-block; position: absolute; height: 22px; width: 206px; z-index: 99; top: 201px; background-color: white; left: 21.8047px;">...</div>
  (Session info: chrome-headless-shell=121.0.6167.160)
Stacktrace:
0   chromedriver                        0x0000000107106168 chromedriver + 4673896
1   chromedriver                        0x00000001070fd9c3 chromedriver + 4639171
2   chromedriver                        0x0000000106cf1fdd chromedriver + 397277
3   chromedriver                        0x0000000106d44dee chromedriver + 736750
4   chromedriver                        0x0000000106d42b6a chromedriver + 727914
5   chromedriver                        0x0000000106d403ca chromedriver + 717770
6   chromedriver                        0x0000000106d3f1a5 chromedriver + 713125
7   chromedriver                        0x0000000106d326a7 chromedriver + 661159
8   chromedriver                        0x0000000106d608c2 chromedriver + 850114
9   chromedriver                        0x0000000106d32038 chromedriver + 659512
10  chromedriver                        0x0000000106d60a7e chromedriver + 850558
11  chromedriver                        0x0000000106d7f796 chromedriver + 976790
12  chromedriver                        0x0000000106d60663 chromedriver + 849507
13  chromedriver                        0x0000000106d301cf chromedriver + 651727
14  chromedriver                        0x0000000106d311ae chromedriver + 655790
15  chromedriver                        0x00000001070c6380 chromedriver + 4412288
16  chromedriver                        0x00000001070cb798 chromedriver + 4433816
17  chromedriver                        0x00000001070aad71 chromedriver + 4300145
18  chromedriver                        0x00000001070cc4e6 chromedriver + 4437222
19  chromedriver                        0x000000010709cd3c chromedriver + 4242748
20  chromedriver                        0x00000001070ec208 chromedriver + 4567560
21  chromedriver                        0x00000001070ec3be chromedriver + 4567998
22  chromedriver                        0x00000001070fd603 chromedriver + 4638211
23  libsystem_pthread.dylib             0x00007ff80916a1d3 _pthread_start + 125
24  libsystem_pthread.dylib             0x00007ff809165bd3 thread_start + 15


In [None]:
num_pages = flight_data_df.shape[0]
num_pages # check that all pages have beben included

66

In [None]:
next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"#item_list_dt_paginate .paginate_button:nth-child({i+2})")))
next_button.click()  

In [None]:
# debugging... 
driver.get('https://app.airdata.com/flight/7bfdbc1996afe83aa106bf5ed2207e0e/GENERALOverview')

In [None]:
csv_link = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "csv_link")))
csv_link.click()