**Fetching all USA airports information

In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys


from typing import Dict, List
from tqdm import tqdm_notebook
from fake_useragent import UserAgent

In [2]:
url_main = "https://www.flightconnections.com/"
url_codes = "airport-codes"

In [3]:
def get_soup(url):
    res = requests.get(url, headers={'User-Agent': UserAgent().chrome})
    html = res.content
    soup = BeautifulSoup(html, 'lxml')
    return soup

In [4]:
soup = get_soup(url_main + url_codes)

In [5]:
airport_list = soup.findAll("ul", attrs={"class":"airport-list"})
airport_list = [airport_group.findAll("li") for airport_group in airport_list]
airport_list = sum(airport_list, [])

In [6]:
def get_airport_info(airport)->Dict[str,str]:
    link = airport.find("a").get("href")
    link = link.replace("-to-", "-from-")
    code = airport.find("span", attrs={"class":"airport-code"}).text
    name = airport.find("span", attrs={"class":"airport-name"}).text
    city_country = airport.find("span", attrs={"class":"airport-city-country"}).text
    cc = city_country.split(", ")
    city = cc[0]
    country = cc[1]
    return {"link":link, "code":code, "name":name, "city":city, "country":country}

In [7]:
airport_info = []

for airport in tqdm_notebook(airport_list):
    airport = get_airport_info(airport)
    if airport["country"] == "United States":
        airport_info.append(airport)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for airport in tqdm_notebook(airport_list):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3754.0), HTML(value='')))




In [8]:
result = pd.DataFrame(airport_info)
result.to_csv("../data/airport_info.csv")

In [9]:
airport_info = pd.read_csv("../data/airport_info.csv", index_col=0)
airport_info.head()

Unnamed: 0,link,code,name,city,country
0,/flights-from-allentown-abe,ABE,Lehigh Valley International Airpo,Allentown,United States
1,/flights-from-abilene-abi,ABI,Abilene Regional Airport,Abilene,United States
2,/flights-from-ambler-abl,ABL,Ambler Airport (FAA: AFM),Ambler,United States
3,/flights-from-albuquerque-abq,ABQ,Albuquerque International Sunport,Albuquerque,United States
4,/flights-from-aberdeen-abr,ABR,Aberdeen Regional Airport,Aberdeen,United States


## Visiting every airport link in USA and finding connections in USA

In [10]:
def get_airport_info(destination, source_airport)->Dict[str,str]:
    name = destination.find("div", attrs={"class":"popular-destination-airport-name"}).text
    destination_code = re.findall("\((.*?)\)", name)[0]
    country = destination.find("img").get("title")
    
    destination_flights_per_month = destination.find("span", attrs={'class':"float-right"}).text.strip()
    destination_flights_per_month = re.findall('\d+', destination_flights_per_month)[0]
    if destination_flights_per_month.isdigit():
        destination_flights_per_month = int(destination_flights_per_month)
    
    return {"source_code":source_airport, "destination_code": destination_code, "flights_per_month": destination_flights_per_month, "country":country}

In [11]:
def get_destinations_info(url_airport, source_airport)->List[Dict]:
#     print(url_main)
#     print(url_airport)
    flights_info = get_soup(url_main + url_airport)
    
    popular_destinations = flights_info.findAll("a", attrs={"class":"popular-destination btn"})
    popular_destinations_hidden = soup.findAll("a", attrs={"class":"popular-destination btn hide"})
    popular_destinations = popular_destinations_hidden + popular_destinations
#     print(popular_destinations)
    
    destinations_info = []

    for destination in tqdm_notebook(popular_destinations):
        destination = get_airport_info(destination, source_airport)
        if destination["country"] == "United States":
            destinations_info.append(destination)
    return destinations_info

In [None]:
full_destination_routes = []

exceptions = []

for row in tqdm_notebook(airport_info.itertuples(), total=airport_info.shape[0]):
    url_airport = row.link
#     print(url_airport)
    source_airport = row.code
#     print(source_airport)
    try:
        destinations = get_destinations_info(url_airport, source_airport)
        full_destination_routes.extend(destinations)
    except Exception as e:
        if e == KeyboardInterrupt:
            break            
        else:
            print(e, url_airport, source_code)
        exceptions.append([e, url_airport, source_code])
    time.sleep(0.5)

In [13]:
result = pd.DataFrame(full_destination_routes)
result.to_csv("../data/routes.csv")

In [14]:
result.shape

(3748, 4)

In [15]:
result

Unnamed: 0,source_code,destination_code,flights_per_month,country
0,ABE,DTW,88,United States
1,ABE,CLT,79,United States
2,ABE,ORD,56,United States
3,ABE,ATL,42,United States
4,ABE,SFB,31,United States
...,...,...,...,...
3743,YAK,JNU,30,United States
3744,YAK,CDV,30,United States
3745,YKM,SEA,62,United States
3746,YUM,PHX,98,United States
