In [1]:
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
# Base URL of the TSA passenger-throughput listing; web_scrape appends the
# zero-based page number to the trailing `page=` query parameter.
urlbase = 'https://www.tsa.gov/coronavirus/passenger-throughput?page='
# Number of listing pages to request (pages 0 .. pages - 1).
pages = 2

In [5]:
def web_scrape(url, pages):
    """Return one DataFrame holding the throughput table from every requested TSA page.

    Parameters
    ----------
    url : str
        Base URL ending in the page query parameter; the zero-based page
        number is appended to it for each request.
    pages : int
        Number of pages to scrape (pages ``0`` to ``pages - 1``).

    Returns
    -------
    pandas.DataFrame
        The tables of all pages stacked in request order with a fresh
        ``0..n-1`` row index. Empty when ``pages`` is 0 or negative.

    Raises
    ------
    requests.HTTPError
        If a page responds with an HTTP error status.
    IndexError
        If a page contains no table with the expected CSS classes.
    """
    table_classes = 'views-table views-view-table cols-3'
    frames = []
    for p in range(pages):
        page_url = url + str(p)  # the page number selects which slice of the listing is served
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(page_url, timeout=30)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        tables = soup.find_all('table', attrs={'class': table_classes})
        if not tables:
            raise IndexError('no throughput table found at ' + page_url)
        # read_html expects a file-like object; passing literal HTML strings is deprecated.
        frames.append(pd.read_html(StringIO(str(tables[0])))[0])
    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame for pages <= 0.
        return pd.DataFrame()
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and growing a frame inside the loop is quadratic.
    return pd.concat(frames, ignore_index=True)

In [6]:
# Scrape the first `pages` pages of the TSA throughput listing into a single DataFrame.
tsa = web_scrape(urlbase, pages)

In [7]:
# Preview the scraped table (pandas truncates the middle rows in the display).
tsa

Unnamed: 0,Date,Total Traveler Throughput,Total Traveler Throughput (1 Year Ago - Same Weekday)
0,12/27/2020,1284599,2575985
1,12/26/2020,1128773,2470786
2,12/25/2020,616469,2582580
3,12/24/2020,846520,2552194
4,12/23/2020,1191123,1937235
...,...,...,...
297,3/5/2020,2130015,2402692
298,3/4/2020,1877401,2143619
299,3/3/2020,1736393,1979558
300,3/2/2020,2089641,2257920


In [8]:
# Inspect dtypes and null counts: Date is still a plain object (string) column at this point.
tsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 3 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   Date                                                   302 non-null    object
 1   Total Traveler Throughput                              302 non-null    int64 
 2   Total Traveler Throughput (1 Year Ago - Same Weekday)  302 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.2+ KB


In [9]:
# Parse the month/day/year strings into datetime values.
# NOTE: this overwrites the Date column of `tsa` in place.
tsa['Date'] = pd.to_datetime(tsa['Date'], format = '%m/%d/%Y')

In [10]:
# Confirm the conversion: Date should now report datetime64[ns].
tsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 3 columns):
 #   Column                                                 Non-Null Count  Dtype         
---  ------                                                 --------------  -----         
 0   Date                                                   302 non-null    datetime64[ns]
 1   Total Traveler Throughput                              302 non-null    int64         
 2   Total Traveler Throughput (1 Year Ago - Same Weekday)  302 non-null    int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 7.2 KB


In [11]:
# Spot-check the first two rows after the date conversion.
tsa.head(2)

Unnamed: 0,Date,Total Traveler Throughput,Total Traveler Throughput (1 Year Ago - Same Weekday)
0,2020-12-27,1284599,2575985
1,2020-12-26,1128773,2470786


In [23]:
# Persist the cleaned TSA table for later use.
# With the default index=True the row index is also written as an unnamed first column.
# Assumes ../data exists relative to the notebook's working directory -- TODO confirm.
tsa.to_csv('../data/tsa.csv')

In [28]:
# URL of the BTS TranStats traffic page.
# NOTE: this rebinds `urlbase`, which held the TSA URL earlier in the notebook;
# re-running the TSA scrape after this cell would hit the wrong site.
urlbase = 'https://www.transtats.bts.gov/TRAFFIC/'

In [29]:
def web_scrape(url):
    """Return the BTS 'SystemTable2' summary table found at ``url`` as a DataFrame.

    NOTE: this definition reuses the name of the two-argument TSA scraper
    defined earlier in the notebook and replaces it in the kernel once this
    cell has run.

    Parameters
    ----------
    url : str
        URL of the BTS page; a single request is made (no pagination).

    Returns
    -------
    pandas.DataFrame
        The first table whose ``name`` attribute is ``SystemTable2``, with a
        fresh ``0..n-1`` row index.

    Raises
    ------
    requests.HTTPError
        If the page responds with an HTTP error status.
    IndexError
        If the page contains no table named ``SystemTable2``.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    tables = soup.find_all('table', attrs={'name': 'SystemTable2'})
    if not tables:
        raise IndexError('no SystemTable2 table found at ' + url)
    # read_html expects a file-like object; passing literal HTML strings is deprecated.
    frame = pd.read_html(StringIO(str(tables[0])))[0]
    # DataFrame.append was removed in pandas 2.0; resetting the index gives the
    # same 0..n-1 row labels the old append(ignore_index=True) produced.
    return frame.reset_index(drop=True)

In [30]:
# Scrape the BTS summary table; this calls the single-argument web_scrape defined in the cell above.
year_stats = web_scrape(urlbase)

In [31]:
# Display the BTS table; its columns form a two-level header (reporting period, then schedule type).
year_stats

Unnamed: 0_level_0,System,October 2019 - September 2020,October 2019 - September 2020,October 2019 - September 2020,October 2018 - September 2019,October 2018 - September 2019,October 2018 - September 2019
Unnamed: 0_level_1,System,Scheduled,Non-Scheduled,Total,Scheduled,Non-Scheduled,Total
0,Revenue Passenger Enplanements (000),511205.0,3132.0,514337.0,916879.0,3606.0,920485.0
1,Revenue Passenger Miles (000),550372500.0,4494582.0,554867100.0,1044065000.0,5095342.0,1049160000.0
2,Available Seat Miles (000),791543700.0,11429163.0,802972900.0,1235277000.0,11630630.0,1246908000.0
3,Passenger Load Factor (%),69.53,39.32,69.1,84.52,43.8,84.14
4,Revenue Freight Ton Miles (000),4497104.0,312378.0,4809482.0,7522249.0,275703.0,7797952.0
5,Total Revenue Ton Miles (000),60185690.0,779852.0,60965540.0,112836900.0,785252.0,113622100.0
6,Available Ton Miles (000),108677100.0,3196511.0,111873600.0,172571100.0,2792852.0,175364000.0
7,Ton Miles Load Factor (%),55.38,24.39,54.49,65.38,28.11,64.79
8,Revenue Departures Performed,6520930.0,115323.0,6636253.0,9410328.0,150807.0,9561135.0
9,Revenue Aircraft Miles Flown (000),5170186.0,71320.0,5241506.0,7849730.0,71511.0,7921241.0
