# TSA passenger checkpoint throughput

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import datetime as dt
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
import altair as alt
import altair_latimes as lat
import numpy as np

In [4]:
# Register the theme object from altair_latimes under the name "latimes",
# then make it the active Altair theme.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
# pandas display settings: show up to 1,000 columns and rows, and do not
# truncate the text inside a cell.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
# Remove Altair's limit on the number of rows a chart's data may contain.
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_colwidth", None)

In [6]:
# TSA page that is downloaded below and parsed for its throughput table.
url = "https://www.tsa.gov/coronavirus/passenger-throughput"

In [7]:
# HTTP headers sent with the page request.
# The User-Agent is built from adjacent string literals so that its parts are
# separated by single spaces; a backslash continuation inside one literal
# would embed the following line's indentation in the header value.
header = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

In [8]:
# Download the page. The timeout (seconds) keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops here on an HTTP error
# instead of handing an error page to the table parser.
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()

In [9]:
# Parse every HTML table in the response into a list of DataFrames.
# read_html is given a file-like object because passing a literal HTML string
# is deprecated in recent pandas releases.
tsa_dfs = pd.read_html(StringIO(r.text))
# The first table is the throughput table. read_html already returns
# DataFrames, so take a copy rather than re-wrapping it; later edits to
# tsa_tables then leave tsa_dfs untouched.
tsa_tables = tsa_dfs[0].copy()

In [10]:
tsa_tables.head()

Unnamed: 0,Date,2021 Traveler Throughput,2020 Traveler Throughput,2019 Traveler Throughput
0,5/30/2021,1650454.0,352947,2555578
1,5/29/2021,1605810.0,268867,2117180
2,5/28/2021,1959593.0,327133,2570613
3,5/27/2021,1854534.0,321776,2485770
4,5/26/2021,1618169.0,261170,2269035


In [11]:
# Shorten the headers to "date" plus the bare year. Names are assigned by
# position, so they rely on the scraped table's column order being
# Date, 2021, 2020, 2019; a different column count raises a ValueError.
tsa_tables.columns = ["date", "2021", "2020", "2019"]

In [12]:
tsa_tables.head()

Unnamed: 0,date,2021,2020,2019
0,5/30/2021,1650454.0,352947,2555578
1,5/29/2021,1605810.0,268867,2117180
2,5/28/2021,1959593.0,327133,2570613
3,5/27/2021,1854534.0,321776,2485770
4,5/26/2021,1618169.0,261170,2269035


In [13]:
tsa_tables.tail()

Unnamed: 0,date,2021,2020,2019
360,6/4/2020,,391882,2623947
361,6/3/2020,,304436,2370152
362,6/2/2020,,267742,2247421
363,6/1/2020,,353261,2499002
364,5/31/2020,,352947,2555578


In [14]:
# Drop the newest row, which carries index label 0 in the scraped table.
# Dropping by label instead of by position keeps the cell idempotent: running
# it again removes nothing further. drop() returns a new frame.
# Assumes the default 0..n-1 index produced by read_html.
# NOTE(review): the reason the newest day is excluded is not recorded here — confirm.
tsa_tables = tsa_tables.drop(index=0, errors="ignore")

### Include previously collected data

In [15]:
# Load the previously collected data, parsing its "date" column as datetimes.
archive = pd.read_csv(
    "data/raw/tsa_tables_before_pandemic.csv",
    parse_dates=["date"],
)

In [16]:
# Discard the "Unnamed: 0" column (presumably an index saved with the CSV —
# verify). Reassigning with errors="ignore" instead of dropping in place lets
# the cell be re-run without raising a KeyError.
archive = archive.drop(columns=["Unnamed: 0"], errors="ignore")

In [17]:
# Stack the freshly scraped rows on top of the archived rows; each frame keeps
# its own index labels.
df = pd.concat(objs=[tsa_tables, archive], axis=0)

---

In [23]:
# Convert the whole "date" column to datetimes so the .dt accessor can be used on it.
df["date"] = pd.to_datetime(df["date"])

In [24]:
# Zero-padded "MM-DD" string (e.g. "05-29") giving the calendar day without the year.
df["month_day"] = df["date"].dt.strftime("%m-%d")

In [25]:
df.head(10)

Unnamed: 0,date,2021,2020,2019,month_day
1,2021-05-29,1605810.0,268867,2117180,05-29
2,2021-05-28,1959593.0,327133,2570613,05-28
3,2021-05-27,1854534.0,321776,2485770,05-27
4,2021-05-26,1618169.0,261170,2269035,05-26
5,2021-05-25,1470840.0,264843,2453649,05-25
6,2021-05-24,1747353.0,340769,2512237,05-24
7,2021-05-23,1863697.0,267451,2070716,05-23
8,2021-05-22,1550044.0,253190,2124825,05-22
9,2021-05-21,1820433.0,348673,2792670,05-21
10,2021-05-20,1728496.0,318449,2673635,05-20


In [32]:
# Reshape wide -> long: each input row yields one output row per year column,
# with the column name stored under "year" and its value under "travelers".
tsa_tables_melt = df.melt(
    id_vars=["month_day"],
    value_vars=["2021", "2020", "2019"],
    var_name="year",
    value_name="travelers",
)

In [33]:
tsa_tables_melt.head()

Unnamed: 0,month_day,year,travelers
0,05-29,2021,1605810.0
1,05-28,2021,1959593.0
2,05-27,2021,1854534.0
3,05-26,2021,1618169.0
4,05-25,2021,1470840.0


In [34]:
tsa_tables_melt.tail()

Unnamed: 0,month_day,year,travelers
1261,03-05,2019,2402692.0
1262,03-04,2019,2143619.0
1263,03-03,2019,1979558.0
1264,03-02,2019,2257920.0
1265,03-01,2019,2301439.0


In [35]:
tsa_tables_melt

Unnamed: 0,month_day,year,travelers
0,05-29,2021,1605810.0
1,05-28,2021,1959593.0
2,05-27,2021,1854534.0
3,05-26,2021,1618169.0
4,05-25,2021,1470840.0
...,...,...,...
1261,03-05,2019,2402692.0
1262,03-04,2019,2143619.0
1263,03-03,2019,1979558.0
1264,03-02,2019,2257920.0
