In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
basic_directory = Path.cwd()                            # directory of the file - folder "code"
data_directory = basic_directory.parent / "data"        # go one level up and choose folder "data"

# It's a common convention to add a _df suffix to a variable name to indicate it's a DataFrame.
flights_df  = pd.read_csv(data_directory / "flights.csv")
airlines_df = pd.read_csv(data_directory / "airlines.csv")
airports_df = pd.read_csv(data_directory / "airports.csv")
planes_df   = pd.read_csv(data_directory / "planes.csv")
weather_df  = pd.read_csv(data_directory / "weather.csv")

# Convert to datetime
flights_df['time_hour'] = pd.to_datetime(flights_df['time_hour'])
weather_df['time_hour'] = pd.to_datetime(weather_df['time_hour'])

# Remove timezone
weather_df['time_hour'] = weather_df['time_hour'].dt.tz_localize(None)

'''
Convert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.
This was necessary because a merge on date/time types was not possible otherwise.

The dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.
'''

"\nConvert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.\nThis was necessary because a merge on date/time types was not possible otherwise.\n\nThe dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.\n"

In [None]:
# Airlines with CASE logic

airlines_part = (
    airlines_df[['name']]
    .head(15)
    .assign(
        entity_type=lambda df: np.where(
            df['name'].str[0].str.upper().isin(['A','E','I','O','U']),  # .str[0] - takes the first letter of each airline name
            'airlines_vowel',                                           # .str.upper() - capitalizes this letter (so that "A" and "a" are not different)
            'airlines_consonant'                                        # .isin([‘A’,'E',‘I’,'O',‘U’]) - checks whether the word begins with a vowel
        )
    )
)

airports_part = (
    airports_df[['name']]
    .head(5)
    .assign(entity_type='airports')
)

planes_part = (
    planes_df[['manufacturer']]
    .head(5)
    .rename(columns={'manufacturer':'name'})    # Renamed 'manufacturer' to 'name' so that all DataFrames have a common column for names
    .assign(entity_type='manufacturer')
)

union_all = pd.concat([airlines_part, airports_part, planes_part], ignore_index=True)
print(union_all)
print(union_all.shape)

                             name         entity_type
0               Endeavor Air Inc.      airlines_vowel
1          American Airlines Inc.      airlines_vowel
2            Alaska Airlines Inc.      airlines_vowel
3                 JetBlue Airways  airlines_consonant
4            Delta Air Lines Inc.  airlines_consonant
5        ExpressJet Airlines Inc.      airlines_vowel
6          Frontier Airlines Inc.  airlines_consonant
7     AirTran Airways Corporation      airlines_vowel
8          Hawaiian Airlines Inc.  airlines_consonant
9                       Envoy Air      airlines_vowel
10          SkyWest Airlines Inc.  airlines_consonant
11          United Air Lines Inc.      airlines_vowel
12                US Airways Inc.      airlines_vowel
13                 Virgin America  airlines_consonant
14         Southwest Airlines Co.  airlines_consonant
15              Lansdowne Airport            airports
16  Moton Field Municipal Airport            airports
17            Schaumburg Reg

In [11]:
# Airlines that fly from all three NYC airports

intersect = flights_df.loc[flights_df["origin"]=="JFK", "carrier"] \
    .isin(flights_df.loc[flights_df["origin"]=="LGA", "carrier"]) \
    & flights_df.loc[flights_df["origin"]=="JFK", "carrier"] \
    .isin(flights_df.loc[flights_df["origin"]=="EWR", "carrier"])

result = flights_df.loc[flights_df["origin"]=="JFK", "carrier"][intersect].unique()

print(result)
print(len(result))

# flights_df.loc[flights_df["origin"]=="JFK", "carrier"] - takes the "carrier" column from rows where ["origin"]=="JFK"
# .isin(flights_df.loc[flights_df["origin"]=="LGA", "carrier"]) - returns a True/False array where "carrier" from ["origin"]=="JFK" also appears among ["origin"]=="LGA"
# Repeated once more, but with "JFK" and "EWR"

# flights_df.loc[flights_df["origin"]=="JFK", "carrier"][intersect].unique()
# Again, takes the carriers from JFK
# [intersect] - filters only the rows where intersect == True
# .unique() - returns the unique carrier values (it is used to avoid repeating the airline names multiple times)

'''
# flights_df["origin"]=="JFK" - creates a boolean array (True/False) for all values in 'origin'. True if "JFK".
# flights_df.loc[...] - selects only those rows where flights_df["origin"]=="JFK" is True.
# ..., "carrier"] - returns only the "carrier" column from the selection.

# .isin() checks if a value is present in another list (e.g., if a carrier from JFK is also in LGA).

# .unique() is used to avoid repeating the airline names multiple times.
'''

'''
# An alternative to .isin, which is more convenient here.

jfk = set(flights_df.loc[flights_df["origin"]=="JFK", "carrier"])
lga = set(flights_df.loc[flights_df["origin"]=="LGA", "carrier"])
ewr = set(flights_df.loc[flights_df["origin"]=="EWR", "carrier"])

# The intersection of three sets
intersect_carriers = jfk & lga & ewr
print(intersect_carriers)

'''


['B6' 'AA' 'UA' 'DL' '9E' 'US' 'MQ' 'EV']
8


'\n# An alternative to .isin, which is more convenient here.\n\njfk = set(flights_df.loc[flights_df["origin"]=="JFK", "carrier"])\nlga = set(flights_df.loc[flights_df["origin"]=="LGA", "carrier"])\newr = set(flights_df.loc[flights_df["origin"]=="EWR", "carrier"])\n\n# The intersection of three sets\nintersect_carriers = jfk & lga & ewr\nprint(intersect_carriers)\n\n'

**Key Finding:** 
Only 8 of the 16 airlines listed in the database operate flights from all three major New York airports. 
This suggests that operating from all three major New York airports is relatively uncommon among airlines

In [7]:
# Destinations served by both American Airlines and Delta

aa_dests = set(flights_df.loc[flights_df['carrier']=="AA", "dest"])
dl_dests = set(flights_df.loc[flights_df['carrier']=="DL", "dest"])
intersect = aa_dests & dl_dests
print(intersect)
# The data type is set, shown with {} brackets
# Arbitrary output order

intersect1 = flights_df.loc[flights_df['carrier']=="AA", "dest"] \
            .isin(flights_df.loc[flights_df['carrier']=="DL", "dest"])
intersect1 = flights_df.loc[flights_df['carrier']=="AA", "dest"][intersect1].unique()
print(intersect1)
# The data type is numpy.ndarray, shown with [] brackets
# Keeps the order of the source data

print(set(intersect1) == intersect)
# Different ordering rules (set() -- arbitrary order, .unique() -- original order)
# The values are identical - only the display order changes

{'MIA', 'FLL', 'LAX', 'BOS', 'MCO', 'SFO', 'AUS', 'PBI', 'STT', 'SEA', 'TPA', 'SAN', 'LAS', 'SJU'}
['BOS' 'SJU' 'MIA' 'SFO' 'LAX' 'STT' 'MCO' 'AUS' 'LAS' 'SEA' 'SAN' 'FLL'
 'TPA' 'PBI']
True


**Key Finding**: 
American Airlines (AA) and Delta Air Lines (DL) serve 14 common destinations

**Insight**:
This significant route overlap between two major airlines indicates intense competition on popular routes including transcontinental (LAX, SFO, SEA) and Florida (MIA, FLL, MCO) markets

In [21]:
#EXCEPT
# Which airports are arrival destinations but never departure origins?

dest_set = set(flights_df['dest'])
origin_set = set(flights_df['origin'])

result = dest_set - origin_set

print(len(result))
print(result)
'''
SELECT dest
FROM flights
EXCEPT
SELECT origin
FROM flights;
'''

99
{'TYS', 'DFW', 'JAX', 'CHO', 'SAV', 'JAC', 'ORD', 'ANC', 'IAH', 'ALB', 'MHT', 'GSP', 'RSW', 'CLE', 'GRR', 'SJC', 'MVY', 'CVG', 'BDL', 'SNA', 'DTW', 'EYW', 'PHX', 'ACK', 'LAS', 'IAD', 'SJU', 'SLC', 'PSE', 'MKE', 'AUS', 'CAE', 'EGE', 'TVC', 'MIA', 'CRW', 'XNA', 'BOS', 'ATL', 'TUL', 'MCI', 'MDW', 'BHM', 'STL', 'GSO', 'ILM', 'MEM', 'BTV', 'PDX', 'PHL', 'MSN', 'CHS', 'BQN', 'HNL', 'PVD', 'FLL', 'LAX', 'PIT', 'ORF', 'SMF', 'MSY', 'DCA', 'BUR', 'OMA', 'OKC', 'PWM', 'DAY', 'HOU', 'AVL', 'PBI', 'SEA', 'CMH', 'BZN', 'SAN', 'SDF', 'OAK', 'LGB', 'MYR', 'CLT', 'BGR', 'MCO', 'RDU', 'MSP', 'TPA', 'BWI', 'DSM', 'STT', 'SAT', 'SRQ', 'BUF', 'SYR', 'IND', 'SFO', 'DEN', 'BNA', 'CAK', 'RIC', 'ROC', 'ABQ'}


'\nSELECT dest\nFROM flights\nEXCEPT\nSELECT origin\nFROM flights;\n'

In [26]:
# Which airports are departure origins but never arrival destinations?

dest_set = set(flights_df['dest'])
origin_set = set(flights_df['origin'])

result = origin_set - dest_set
print(result)
# So it means all flights are from NYC to other US cities

{'JFK', 'EWR', 'LGA'}


**Key Finding:**
This shows that the database being analyzed collected flight statistics only from three New York airports (3) to other US airports (99)   

In [11]:
# Aircraft that are in the registry but have never flown

pl_tail = set(planes_df['tailnum'])
fl_tail = set(flights_df['tailnum'].dropna())   # .dropna() removes NaN tailnum in flights_df, IF EXISTS

result = pl_tail - fl_tail

print(len(result), "aircraft have never flown")
print(result)


998 aircraft have never flown
{'N516LR', 'N198UW', 'N828MH', 'N956DN', 'N937AT', 'N704US', 'N506MJ', 'N8602F', 'N657UA', 'N940UW', 'N646DL', 'N193DN', 'N766SK', 'N512SW', 'N630VA', 'N289CT', 'N153DL', 'N8612K', 'N673UA', 'N618AA', 'N347AA', 'N941FR', 'N643SW', 'N510SW', 'N906DA', 'N360AA', 'N772SK', 'N361VA', 'N550NW', 'N647SW', 'N585AS', 'N453WN', 'N966AT', 'N740SK', 'N642WN', 'N623SW', 'N121UW', 'N371SW', 'N551AA', 'N632AW', 'N8308K', 'N631AW', 'N700UW', 'N562AS', 'N242WN', 'N543US', 'N726SK', 'N571UW', 'N8837B', 'N505MJ', 'N245AY', 'N121DE', 'N453UW', 'N523SW', 'N547UA', 'N628SW', 'N326AT', 'N717SA', 'N8888D', 'N694DL', 'N650SW', 'N622AW', 'N536AA', 'N388SW', 'N978SW', 'N746SW', 'N415WN', 'N559AS', 'N613DL', 'N565AS', 'N638SW', 'N64809', 'N764SW', 'N201LV', 'N693CA', 'N403AS', 'N302AS', 'N441WN', 'N757AT', 'N525AS', 'N825MH', 'N368SW', 'N907JB', 'N215WN', 'N372AA', 'N955AT', 'N914FJ', 'N430WN', 'N917WN', 'N449US', 'N842MH', 'N762SK', 'N8944B', 'N249WN', 'N401WN', 'N345NW', 'N478WN',

**Key Finding:**
998 aircraft in the registry have never conducted any flights.

**Insight:**
This represents significant unused capacity in the aviation system