# Scraping airport data and using threading to make it not take all year.

My dad wanted the distances from a few airports to many other airports, with the resulting data written to an Excel file. Along the way I'll explore threading a bit to see how much it speeds up relatively lengthy web-scraping tasks.

In [2]:
from html_table_parser.parser import HTMLTableParser
from pprint import pprint
import urllib.request
import pandas as pd
import numpy as np
import stringcase
import re
import concurrent.futures 
import time

In [5]:
def get_airport(airport):
    """
    Fetch the great-circle distance table for one airport as a dataframe.

    Downloads the aircalculator.com "great_circle_flight_routes" page for
    `airport`, parses the first HTML table on the page, uses the table's
    first row as the column headers, and drops the table's last column.

    args: airport = airport IATA code

    returns: pandas DataFrame holding the rows after the header row, with
             every column except the last one

    raises: urllib.error.URLError (or a subclass) if the request fails;
            presumably IndexError if the page yields no table -- TODO confirm
            how HTMLTableParser exposes an empty result
    """
    url = f'https://www.aircalculator.com/great_circle_flight_routes.php?from={airport}'

    # NOTE: the backslash continuations sit inside the string literal, so the
    # indentation of the continued lines is part of the header value that is
    # sent. Left unchanged on purpose.
    agent = \
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)\
        AppleWebKit/537.36\
        (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    req = urllib.request.Request(url=url,
                                 headers={'User-Agent': agent})

    # Close the HTTP response as soon as the body has been read; this function
    # is run from many threads, so leaked connections add up quickly.
    with urllib.request.urlopen(req) as f:
        xhtml = f.read().decode('utf-8')

    p = HTMLTableParser()
    p.feed(xhtml)

    df = pd.DataFrame(p.tables[0])
    df.columns = df.iloc[0]      # first parsed row carries the column headers
    df = df.iloc[1:, 0:-1]       # drop the header row and the last column
    return df
    

In [6]:
# Quick check: fetch the distance table for DAC and display it as the cell output.
get_airport('DAC')

Unnamed: 0,IATA Code,Airport Name,Airport Location,Miles from DAC
1,AAA,Anaa Airport,"Anaa, Tuamotus, French Polynesia",8815
2,AAB,Arrabury Airport,"Arrabury, Queensland, Australia",4859
3,AAC,El Arish International Airport,"El Arish, Egypt",3468
4,AAD,Ad-Dabbah Airport,"Al Dabbah, Sudan",3831
5,AAE,Rabah Bitat Airport,"Annaba, Algeria",4859
...,...,...,...,...
6234,ZWS,Stuttgart Hauptbahnhof,"Stuttgart, Germany",4606
6235,ZYI,Zunyi Xinzhou Airport,"Zunyi, Guizhou, China",1064
6236,ZYL,MAG Osmani International Airport,"Sylhet, Bangladesh",120
6237,ZZU,Mzuzu Airport,"Mzuzu, Malawi",4520


In [10]:
# A small sample of airports to test performance: the first 10 unique IATA
# codes from the DAC table (taken in table order, not drawn at random).

airports_list = get_airport('DAC')['IATA Code'].unique()[0:10]

## Airport information

In [6]:
def airport_info(airport):
    """
    Fetch the information table for one airport as a single-row-style dataframe.

    Downloads the aircalculator.com "flightplan" page from `airport` to AAA,
    parses the first HTML table on the page, keeps its first nine rows,
    transposes them, and uses the first transposed row as the column headers.

    args: airport = airport IATA code

    returns: pandas DataFrame with the transposed rows that follow the
             header row

    raises: urllib.error.URLError (or a subclass) if the request fails;
            presumably IndexError if the page yields no table -- TODO confirm
            how HTMLTableParser exposes an empty result
    """
    # The destination is fixed to AAA; presumably only the origin airport's
    # details are of interest here -- TODO confirm.
    url = f'https://www.aircalculator.com/flightplan.php?from={airport}&to=AAA'

    # NOTE: the backslash continuations sit inside the string literal, so the
    # indentation of the continued lines is part of the header value that is
    # sent. Left unchanged on purpose.
    agent = \
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)\
        AppleWebKit/537.36\
        (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    req = urllib.request.Request(url=url,
                                 headers={'User-Agent': agent})

    # Close the HTTP response as soon as the body has been read instead of
    # leaving the connection open until garbage collection.
    with urllib.request.urlopen(req) as f:
        xhtml = f.read().decode('utf-8')

    p = HTMLTableParser()
    p.feed(xhtml)

    df = pd.DataFrame(p.tables[0])
    df = df.iloc[0:9]            # only the first nine rows of the table are used
    df = df.T                    # rows become columns
    df.columns = df.iloc[0]      # first transposed row carries the labels
    return df.iloc[1:]
    

# Excel writer

In [8]:
def write_excel(air_list, path='dataframes.xlsx'):
    """
    Write one worksheet of distance data per airport to an Excel workbook.

    pass a list of airport IATA codes

    args: air_list = iterable of airport IATA codes; each code is used as the
                     name of the sheet holding that airport's table
          path = output workbook path (defaults to 'dataframes.xlsx')

    returns: None. Nothing is downloaded or written when air_list is empty.
    """
    codes = list(air_list)
    if not codes:
        return

    # Download everything before opening the workbook, so a failed request
    # does not leave a half-written file behind.
    # executor.map yields results in input order, which keeps every frame
    # paired with its own IATA code (as_completed would yield them in
    # completion order and mislabel the sheets).
    with concurrent.futures.ThreadPoolExecutor() as executor:
        data_frames = list(executor.map(get_airport, codes))

    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
        for df, IATA in zip(data_frames, codes):
            df.to_excel(writer, sheet_name=str(IATA))

# Checking performance

## No threads

In [11]:
# Baseline: fetch every airport in airports_list one after another and time
# the whole loop. The returned dataframes are discarded; only the elapsed
# time matters here.
t1 = time.perf_counter()

for port in airports_list:
    get_airport(port)

t2 = time.perf_counter()

print(f'Finished in {t2-t1} seconds')

Finished in 76.85551009999995 seconds


## With threading

In [12]:
# Same downloads as the baseline, but spread over a thread pool.
t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved. Draining the iterator makes a failed download stop the cell
    # instead of silently producing a misleading timing.
    list(executor.map(get_airport, airports_list))

t2 = time.perf_counter()

print(f'Finished in {t2-t1} seconds')

Finished in 16.740458399999852 seconds


## Threading + list comprehension

In [22]:
# Thread-pool variant that keeps the results: submit one task per airport,
# then collect each dataframe as its future finishes.
t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    # One future per airport, in the same order as airports_list.
    results = [executor.submit(get_airport,port) for port in airports_list]
    
    # as_completed yields futures as they finish, so data_frames is in
    # completion order and does not necessarily line up with airports_list.
    data_frames = [df.result() for df in concurrent.futures.as_completed(results)]

t2 = time.perf_counter()

print(f'Finished in {t2-t1} seconds')

Finished in 18.71802600000001 seconds
