In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
basic_directory = Path.cwd()                            # directory of the file - folder "code"
data_directory = basic_directory.parent / "data"        # go one level up and choose folder "data"

# It's a common convention to add a _df suffix to a variable name to indicate it's a DataFrame.
flights_df  = pd.read_csv(data_directory / "flights.csv")
airlines_df = pd.read_csv(data_directory / "airlines.csv")
airports_df = pd.read_csv(data_directory / "airports.csv")
planes_df   = pd.read_csv(data_directory / "planes.csv")
weather_df  = pd.read_csv(data_directory / "weather.csv")

# Convert to datetime
flights_df['time_hour'] = pd.to_datetime(flights_df['time_hour'])
weather_df['time_hour'] = pd.to_datetime(weather_df['time_hour'])

# Remove timezone
weather_df['time_hour'] = weather_df['time_hour'].dt.tz_localize(None)

'''
Convert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.
This was necessary because a merge on date/time types was not possible otherwise.

The dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.
'''

"\nConvert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.\nThis was necessary because a merge on date/time types was not possible otherwise.\n\nThe dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.\n"

In [14]:
# Airlines with CASE logic
airlines_part = (
    airlines_df[['name']]
    .head(15)
    .assign(
        entity_type=lambda df: np.where(
            df['name'].str[0].str.upper().isin(['A','E','I','O','U']),  # .str[0] - takes the first letter of each airline name
            'airlines_vowel',                                           # .str.upper() - capitalizes this letter (so that "A" and "a" are not different)
            'airlines_consonant'                                        # .isin([‘A’,'E',‘I’,'O',‘U’]) - checks whether the word begins with a vowel
        )
    )
)

airports_part = (
    airports_df[['name']]
    .head(5)
    .assign(entity_type='airports')
)

planes_part = (
    planes_df[['manufacturer']]
    .head(5)
    .rename(columns={'manufacturer':'name'})    # Renamed 'manufacturer' to 'name' so that all DataFrames have a common column for names
    .assign(entity_type='manufacturer')
)

union_all = pd.concat([airlines_part, airports_part, planes_part], ignore_index=True)
print(union_all)
print(union_all.shape)

                             name         entity_type
0               Endeavor Air Inc.      airlines_vowel
1          American Airlines Inc.      airlines_vowel
2            Alaska Airlines Inc.      airlines_vowel
3                 JetBlue Airways  airlines_consonant
4            Delta Air Lines Inc.  airlines_consonant
5        ExpressJet Airlines Inc.      airlines_vowel
6          Frontier Airlines Inc.  airlines_consonant
7     AirTran Airways Corporation      airlines_vowel
8          Hawaiian Airlines Inc.  airlines_consonant
9                       Envoy Air      airlines_vowel
10          SkyWest Airlines Inc.  airlines_consonant
11          United Air Lines Inc.      airlines_vowel
12                US Airways Inc.      airlines_vowel
13                 Virgin America  airlines_consonant
14         Southwest Airlines Co.  airlines_consonant
15              Lansdowne Airport            airports
16  Moton Field Municipal Airport            airports
17            Schaumburg Reg

In [18]:
# Airlines that fly from all three NYC airports

intersect = flights_df.loc[flights_df["origin"]=="JFK", "carrier"] \
    .isin(flights_df.loc[flights_df["origin"]=="LGA", "carrier"]) \
    & flights_df.loc[flights_df["origin"]=="JFK", "carrier"] \
    .isin(flights_df.loc[flights_df["origin"]=="EWR", "carrier"])

result = flights_df.loc[flights_df["origin"]=="JFK", "carrier"][intersect].unique()
print(flights_df['carrier'].head(10))

# flights_df.loc[flights_df["origin"]=="JFK", "carrier"] - takes the "carrier" column from rows where ["origin"]=="JFK"
# .isin(flights_df.loc[flights_df["origin"]=="LGA", "carrier"]) - returns a True/False array where "carrier" from ["origin"]=="JFK" also appears among ["origin"]=="LGA"
# Repeated once more, but with "JFK" and "EWR"

# flights_df.loc[flights_df["origin"]=="JFK", "carrier"][intersect].unique()
# Again, takes the carriers from JFK
# [intersect] - filters only the rows where intersect == True
# .unique() - returns the unique carrier values (it is used to avoid repeating the airline names multiple times)

'''
# flights_df["origin"]=="JFK" - creates a boolean array (True/False) for all values in 'origin'. True if "JFK".
# flights_df.loc[...] - selects only those rows where flights_df["origin"]=="JFK" is True.
# ..., "carrier"] - returns only the "carrier" column from the selection.

# .isin() checks if a value is present in another list (e.g., if a carrier from JFK is also in LGA).

# .unique() is used to avoid repeating the airline names multiple times.
'''

'''
# An alternative to .isin, which is more convenient here.

jfk = set(flights_df.loc[flights_df["origin"]=="JFK", "carrier"])
lga = set(flights_df.loc[flights_df["origin"]=="LGA", "carrier"])
ewr = set(flights_df.loc[flights_df["origin"]=="EWR", "carrier"])

# The intersection of three sets
intersect_carriers = jfk & lga & ewr
print(intersect_carriers)

'''


0    UA
1    YV
2    UA
3    DL
4    B6
5    AA
6    UA
7    B6
8    EV
9    US
Name: carrier, dtype: object


'\n# An alternative to .isin, which is more convenient here.\n\njfk = set(flights_df.loc[flights_df["origin"]=="JFK", "carrier"])\nlga = set(flights_df.loc[flights_df["origin"]=="LGA", "carrier"])\newr = set(flights_df.loc[flights_df["origin"]=="EWR", "carrier"])\n\n# The intersection of three sets\nintersect_carriers = jfk & lga & ewr\nprint(intersect_carriers)\n\n'

In [23]:
# Destinations served by both American Airlines and Delta

aa_dests = set(flights_df.loc[flights_df['carrier']=="AA", "dest"])
dl_dests = set(flights_df.loc[flights_df['carrier']=="DL", "dest"])
intersect = aa_dests & dl_dests
print(intersect)
# The data type is set, shown with {} brackets

intersect1 = flights_df.loc[flights_df['carrier']=="AA", "dest"] \
            .isin(flights_df.loc[flights_df['carrier']=="DL", "dest"])
intersect1 = flights_df.loc[flights_df['carrier']=="AA", "dest"][intersect1].unique()
print(intersect1)
# The data type is numpy.ndarray, shown with [] brackets

print(set(intersect1) == intersect)
# Different ordering rules (set() -- arbitrary order, .unique() -- original order)
# The values are identical - only the display order changes

{'LAS', 'TPA', 'FLL', 'SEA', 'AUS', 'SFO', 'STT', 'PBI', 'LAX', 'SJU', 'MIA', 'BOS', 'MCO', 'SAN'}
['BOS' 'SJU' 'MIA' 'SFO' 'LAX' 'STT' 'MCO' 'AUS' 'LAS' 'SEA' 'SAN' 'FLL'
 'TPA' 'PBI']
True


In [None]:
#EXCEPT
# Which airports are arrival destinations but never departure origins?

dest_set = set(flights_df['dest'])
origin_set = set(flights_df['origin'])

result = dest_set - origin_set
print(result)

'''
SELECT dest
FROM flights
EXCEPT
SELECT origin
FROM flights;
'''

{'MHT', 'CHS', 'PIT', 'CAK', 'CAE', 'TPA', 'FLL', 'SEA', 'AUS', 'GRR', 'HNL', 'IND', 'PBI', 'BHM', 'PHX', 'SLC', 'LAS', 'SJC', 'BDL', 'ATL', 'SDF', 'STT', 'ORD', 'ORF', 'RDU', 'DAY', 'SRQ', 'DSM', 'BZN', 'MEM', 'SMF', 'MSP', 'DCA', 'IAD', 'GSP', 'LAX', 'OKC', 'XNA', 'MSY', 'ALB', 'PHL', 'RIC', 'BUF', 'TVC', 'BTV', 'BUR', 'MVY', 'SFO', 'PWM', 'HOU', 'ROC', 'AVL', 'PSE', 'DFW', 'CMH', 'TYS', 'GSO', 'MDW', 'ACK', 'CLE', 'PDX', 'SJU', 'DTW', 'CRW', 'SAV', 'BOS', 'BNA', 'BQN', 'SAN', 'EYW', 'JAC', 'SYR', 'ANC', 'ILM', 'TUL', 'SAT', 'OMA', 'MYR', 'CHO', 'BGR', 'MKE', 'MCO', 'DEN', 'MSN', 'SNA', 'MCI', 'CVG', 'OAK', 'EGE', 'STL', 'JAX', 'CLT', 'ABQ', 'BWI', 'IAH', 'MIA', 'PVD', 'RSW', 'LGB'}


'\nSELECT dest\nFROM flights\nEXCEPT\nSELECT origin\nFROM flights;\n'

In [26]:
# Which airports are departure origins but never arrival destinations?

dest_set = set(flights_df['dest'])
origin_set = set(flights_df['origin'])

result = origin_set - dest_set
print(result)
# So it means all flights are from NYC to other US cities

{'JFK', 'EWR', 'LGA'}


In [None]:
# Aircraft that are in the registry but have never flown

pl_tail = set(planes_df['tailnum'])
fl_tail = set(flights_df['tailnum'].dropna())   # .dropna() removes NaN tailnum in flights_df, IF EXISTS

result = pl_tail - fl_tail

print(len(result), "aircraft have never flown")
print(result)


998 aircraft have never flown
{'N343NB', 'N386SW', 'N602DL', 'N249WN', 'N706SW', 'N7724A', 'N8611F', 'N409WN', 'N305AS', 'N918DH', 'N1604R', 'N694DL', 'N315AS', 'N709UW', 'N828AW', 'N430US', 'N361AA', 'N485WN', 'N913FJ', 'N510MJ', 'N172DZ', 'N506AS', 'N929DN', 'N463WN', 'N206FR', 'N8301J', 'N297PQ', 'N921DN', 'N20904', 'N382SW', 'N118US', 'N760SW', 'N143DA', 'N643DL', 'N779NC', 'N829MH', 'N669SW', 'N68801', 'N797SK', 'N675MC', 'N505MJ', 'N8444F', 'N14629', 'N406US', 'N529VA', 'N594AS', 'N835MH', 'N523SW', 'N8488D', 'N663SW', 'N845MH', 'N649UA', 'N662SW', 'N641DL', 'N900EV', 'N945DN', 'N389AA', 'N708SW', 'N538AS', 'N31131', 'N556NW', 'N269WN', 'N746SK', 'N933LR', 'N66803', 'N762SK', 'N362SW', 'N529AS', 'N923DN', 'N906AT', 'N594NW', 'N7812G', 'N364SW', 'N724UW', 'N558AS', 'N295AT', 'N774NC', 'N626AW', 'N205FR', 'N508MJ', 'N814AW', 'N643SW', 'N773SA', 'N633SW', 'N350NA', 'N306AS', 'N362NW', 'N833AY', 'N664DN', 'N717JL', 'N994AT', 'N632SW', 'N383AA', 'N986AT', 'N177DZ', 'N8623A', 'N295WN',

'\nSELECT tailnum \nFROM planes\nEXCEPT\nSELECT tailnum \nFROM flights \nWHERE tailnum IS NOT NULL; -- 998 самолётов ни разу не летали\n'