# Retrieving airport routes

For each airport previously found, another Python script opens its Wikipedia URL and parses the HTML
content of the page to find the "Airlines and destinations" section.

The parsed content evolves with community updates; the data used in this notebook was scraped in April 2023.

In [1]:
import numpy as np
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
from string import Template
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Read every airport table into its own DataFrame. The files are read in the
# order listed, and each result is bound to the name at the same position in
# the target tuple.
(
    nam_airport_df,
    sam_airport_df,
    eu_airport_df,
    af_airport_df,
    as_airport_df,
    oc_airport_df,
    special_df,
    missing_df,
) = map(
    pd.read_csv,
    (
        'data/n_amer_arpt_v2.csv',
        'data/s_amer_arpt_v2.csv',
        'data/eu_arpt_v2.csv',
        'data/af_arpt_v2.csv',
        'data/as_arpt_v2.csv',
        'data/oc_arpt_v2.csv',
        'data/pb_arpt_v2.csv',
        'data/missing_arpt_v2.csv',
    ),
)

In [3]:
# One 'wdpa_link' column per airport table, kept in a fixed order. The crawl
# cell iterates this list and requests every link it contains.
airport_df_list = [
    frame['wdpa_link']
    for frame in (
        nam_airport_df,
        sam_airport_df,
        eu_airport_df,
        af_airport_df,
        as_airport_df,
        oc_airport_df,
        special_df,
        missing_df,
    )
]


In [None]:
   
relations = []
### Build one long list of route rows made only of Wikipedia links, instead of
### opening every airline/destination page to look up its IATA code.
### The next step merges this list with the airport dataframes on the wiki link
### to attach the IATA/ICAO codes and the other features.

WIKI_ROOT = 'https://en.wikipedia.org'
REQUEST_HEADERS = {'User-Agent': 'AirProjectBot/0.0 (antoine732@hotmail.fr)'}
# Seconds. Without a timeout a single stalled connection hangs the whole crawl.
REQUEST_TIMEOUT = 30
# (marker, label) pairs, tested in this order so a later match overrides an
# earlier one.
REGULARITY_MARKERS = (
    ('Seasonal:', 'Seasonal'),
    ('Seasonal charter:', 'Seasonal charter'),
)


def _wiki_url(tag):
    """Return the absolute Wikipedia URL *tag* points to, or None.

    None is returned when the tag has no ``href`` or the ``href`` does not
    contain '/wiki/'.
    """
    href = tag.get('href')
    if href and '/wiki/' in href:
        return WIKI_ROOT + href
    return None


def _find_route_table(soup):
    """Return the route table of a parsed airport page, or None if absent.

    Most pages have a single airline list under "Airlines and destinations".
    When cargo operations are also prominent the section is split, and only
    the table following the "Passenger" heading is wanted for now; in that
    case there is no fallback to the main section.

    The anchor is looked up by id only, whatever element carries it, so pages
    where the id sits on the heading element itself are handled as well as
    pages where it sits on a ``<span>``.
    """
    anchor = soup.find(id='Passenger')
    if anchor is None:
        anchor = soup.find(id='Airlines_and_destinations')
    if anchor is None:
        return None
    return anchor.find_next('table', {'class': 'wikitable sortable'})


def _parse_route_table(table, airport_link):
    """Return ``[airline, airport_link, destination, regularity]`` rows.

    ``airline`` and ``destination`` are absolute Wikipedia URLs. ``airline``
    is None when the airline cell holds no wiki link, so that a row is never
    attributed to the airline of a previous row. ``regularity`` is 'Regular'
    until a 'Seasonal:' / 'Seasonal charter:' marker is met inside the
    destination cell; it then applies to the destinations that follow in
    that cell.
    """
    rows = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Header rows have no <td>; rows with a single <td> have no
        # destination cell to read.
        if len(cells) < 2:
            continue

        airline = None
        for anchor in cells[0].find_all('a'):
            # The last wiki link of the cell wins.
            airline = _wiki_url(anchor) or airline

        regular = 'Regular'
        for element in cells[1].find_all():
            for marker, label in REGULARITY_MARKERS:
                # `in` on a tag tests its direct children, so this matches an
                # element (e.g. a bold label) whose text is exactly the marker.
                if marker in element:
                    regular = label
            destination = _wiki_url(element)
            if destination:
                rows.append([airline, airport_link, destination, regular])
    return rows


# One session for the whole crawl: shared headers, connection reuse, and
# automatic retries with back-off on transient failures.
session = requests.Session()
session.headers.update(REQUEST_HEADERS)
retry_policy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
session.mount('https://', HTTPAdapter(max_retries=retry_policy))
session.mount('http://', HTTPAdapter(max_retries=retry_policy))

# Links that could not be fetched, kept so they can be inspected or re-run.
failed_links = []

for airport_links in tqdm(airport_df_list):
    print('Starting continent parsings')
    for airport_link in tqdm(airport_links):
        try:
            response = session.get(airport_link, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as error:
            # A single bad page must not abort a crawl of thousands of pages.
            failed_links.append(airport_link)
            print(f'Skipping {airport_link}: {error}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        table = _find_route_table(soup)
        if table is not None:
            relations.extend(_parse_route_table(table, airport_link))
    print('End of continent parsing. Next Continent')
print('End of all loops')
session.close()
    



    


In [5]:
# Turn the collected route rows into a DataFrame and write it to disk.
# No column names are given, so the columns are labelled by position; each row
# appended by the crawl is [airline, airport link, destination, regularity].
relation_df = pd.DataFrame(data=relations)
relation_df.to_csv(path_or_buf='data/wikipedia_relations_new24_04.csv')