In [1]:
import pandas as pd
from pathlib import Path

In [2]:
basic_directory = Path.cwd()                            # directory of the file - folder "code"
data_directory = basic_directory.parent / "data"        # go one level up and choose folder "data"

# It's a common convention to add a _df suffix to a variable name to indicate it's a DataFrame.
flights_df  = pd.read_csv(data_directory / "flights.csv")
airlines_df = pd.read_csv(data_directory / "airlines.csv")
airports_df = pd.read_csv(data_directory / "airports.csv")
planes_df   = pd.read_csv(data_directory / "planes.csv")
weather_df  = pd.read_csv(data_directory / "weather.csv")

# Convert to datetime
flights_df['time_hour'] = pd.to_datetime(flights_df['time_hour'])
weather_df['time_hour'] = pd.to_datetime(weather_df['time_hour'])

# Remove timezone
weather_df['time_hour'] = weather_df['time_hour'].dt.tz_localize(None)

'''
Convert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.
This was necessary because a merge on date/time types was not possible otherwise.

The dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.
'''

"\nConvert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.\nThis was necessary because a merge on date/time types was not possible otherwise.\n\nThe dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.\n"

In [None]:
# Find the aircraft with the most flights and its airline

highest_no = planes_df.merge(flights_df, on='tailnum', how='left')
highest_no = highest_no.merge(airlines_df, on='carrier', how='left')

result = (
    highest_no
    .groupby(['tailnum', 'carrier', 'name'], dropna=False)
    .size()                                       # Equivalent to COUNT(*)
    .reset_index(name='flight_count')             # Rename the resulting column
    .sort_values('flight_count', ascending=False) # ORDER BY flight_count DESC
)
print(result)
# The aircraft with the most flights are N711MQ and N335AA, with 18 flights

     tailnum carrier                         name  flight_count
616   N335AA      AA       American Airlines Inc.            18
1599  N711MQ      MQ                    Envoy Air            18
315   N183JB      B6              JetBlue Airways            17
629   N339AA      AA       American Airlines Inc.            17
1420  N632JB      B6              JetBlue Airways            17
...      ...     ...                          ...           ...
2257  N960AT      FL  AirTran Airways Corporation             1
2259  N961AT      FL  AirTran Airways Corporation             1
937   N426AA      AA       American Airlines Inc.             1
2264  N963AT      FL  AirTran Airways Corporation             1
2280  N969AT      FL  AirTran Airways Corporation             1

[2328 rows x 4 columns]


So, in this query in SQL NaN values are INCLUDED, so even if aircraft has never flown, it'll be in our result.
In pandas groupby NaN values ARE DROPED by default, so there were 998 less observations (these aircrafts that has never flown).

1. If I want (in pandas) to include all values - add dropna=False in groupby()
2. If I want (SQL) to drop NaN (NULL in SQL) values in SQL - just use INNER JOIN (in THIS case its useful)

In [None]:
# Before splitting into carriers, there were 3322 lines; after splitting, there were 3326 lines.
# The query above shows which aircraft and how many flew for more than one airline

result = (
    flights_df
    .groupby('tailnum')['carrier']
    .nunique()                                    # Equivalent to COUNT(DISTINCT carrier)
    .reset_index(name='carriers_count')           # Move 'tailnum' from index to a column
    .query("carriers_count > 1")                  # Equivalent to HAVING
    .sort_values('carriers_count', ascending=False)
)
print(result.head())

     tailnum  carriers_count
414   N228PQ               2
423   N232PQ               2
2770  N977AT               2
2789  N990AT               2


'\nSELECT tailnum, COUNT(DISTINCT carrier) as carriers_count\nFROM flights\nGROUP BY tailnum\nHAVING COUNT(DISTINCT carrier) > 1\nORDER BY carriers_count DESC;\n'

In [None]:
# A simple query for FULL JOIN

full_join = airlines_df.merge(flights_df, on='carrier', how='outer')

result = (
    full_join
    .loc[full_join['carrier'].isin(['UA', 'AA']) & (full_join['month']==6),     # row filter
    ['carrier', 'name', 'month', 'minute', 'dep_time']]
)
print(result)
# .isin checks if a value in a column is present in the provided list.

# equivalent with .query()
'''
result = full_join.query("carrier in ['UA', 'AA'] and month == 6")[['carrier', 'name', 'month', 'minute', 'dep_time']]
'''

     carrier                    name  month  minute  dep_time
525       AA  American Airlines Inc.      6      15    1311.0
530       AA  American Airlines Inc.      6      30    1148.0
533       AA  American Airlines Inc.      6       0     656.0
550       AA  American Airlines Inc.      6       0    1958.0
572       AA  American Airlines Inc.      6      15    1134.0
...      ...                     ...    ...     ...       ...
8796      UA   United Air Lines Inc.      6      40     900.0
8817      UA   United Air Lines Inc.      6      29       NaN
8818      UA   United Air Lines Inc.      6      47    1145.0
8826      UA   United Air Lines Inc.      6      10     605.0
8828      UA   United Air Lines Inc.      6      12    1008.0

[236 rows x 5 columns]


'\n\nresult = full_join.query("carrier in [\'UA\', \'AA\'] and month == 6")[[\'carrier\', \'name\', \'month\', \'minute\', \'dep_time\']]\n\n'

In [None]:
# A simple query for CROSS JOIN

cross_join = flights_df.merge(airlines_df, how='cross', suffixes=("_flights", "_airlines"))

result = (
    cross_join
    .loc[(cross_join['origin']=='JFK') & (cross_join['carrier_airlines']=='HA'),
         ['dest', 'origin', 'name']]
)
print(result)


# Each row from flights_df is joined with every row from airlines_df.
# If carrier_flights == "HA", it would ONLY select rows where the carrier in FLIGHTS_DF is Hawaiian Airlines.
# If carrier_airlines == "HA", it selects ALL flights but only keeps rows where the AIRLINES_DF lookup points to HAWAIIAN.

       dest origin                    name
72      TPA    JFK  Hawaiian Airlines Inc.
88      BOS    JFK  Hawaiian Airlines Inc.
104     SFO    JFK  Hawaiian Airlines Inc.
120     IAD    JFK  Hawaiian Airlines Inc.
168     CHS    JFK  Hawaiian Airlines Inc.
...     ...    ...                     ...
159880  IND    JFK  Hawaiian Airlines Inc.
159896  SAN    JFK  Hawaiian Airlines Inc.
159912  SEA    JFK  Hawaiian Airlines Inc.
159928  LAX    JFK  Hawaiian Airlines Inc.
159960  PBI    JFK  Hawaiian Airlines Inc.

[3276 rows x 3 columns]


"\nOriginal SQL: A simple query for CROSS JOIN\n\nSELECT fl.dest, fl.origin, al.name\nFROM flights AS fl\nCROSS JOIN airlines AS al\nWHERE fl.origin IN ('JFK') AND al.carrier IN ('HA');\n"

In [None]:
# A simple query for SELF JOIN

self_join = flights_df.merge(flights_df, on='month', suffixes=("_fl1", "_fl2"))

result = (
    self_join
    .query("day_fl1 != day_fl2") # Equivalent to WHERE
    [["day_fl1", "day_fl2", "month"]]
)
print(result)

# This query compares days within the same month from the flights table

# The query logic (simplified):
# It first takes "January 1st with all other days in January",
# then "January 2nd with all other days", and so on.

         day_fl1  day_fl2  month
1             27        9      6
2             27        4      6
3             27       17      6
4             27       14      6
5             27       24      6
...          ...      ...    ...
8359012       18        4     11
8359013       18        4     11
8359014       18       19     11
8359015       18        5     11
8359016       18       16     11

[8073012 rows x 3 columns]


'\nOriginal SQL: SELF JOIN (comparing a column with different values within the same table).\n\n-- This query compares days within the same month from the flights table.\nSELECT fl1.day AS day1, fl2.day AS day2, fl1.month\nFROM flights AS fl1\nINNER JOIN flights AS fl2\nON fl1.month=fl2.month\nAND fl1.day<>fl2.day\nLIMIT 40;\n/* The query logic (simplified):\nIt first takes "January 1st with all other days in January",\nthen "January 2nd with all other days", and so on.\n*/;\n'

In [4]:
# This SQL query finds pairs of flights with the same destination (dest)
# that arrive on the same day with an interval of no more than 5 minutes

same_time = flights_df.merge(flights_df, on=["dest", "day", "month"], suffixes=("_fl1", "_fl2"))

result = (
    same_time
    .loc[
         (same_time["sched_arr_time_fl1"] > same_time["sched_arr_time_fl2"]) &
         ((same_time["sched_arr_time_fl1"] - same_time["sched_arr_time_fl2"]) <= 5),
         ["flight_fl1", "flight_fl2", "origin_fl1", "origin_fl2", "dest", "sched_arr_time_fl1", "sched_arr_time_fl2"]
    ]
    .assign(difference=lambda df:    df["sched_arr_time_fl1"] - df["sched_arr_time_fl2"])  # Creating new column "difference" with value equral to diff. between "...fl1" and "...fl2"
    .sort_values("difference", ascending=True )
)
print(result)
print(result.shape)

'''
# Equivalent without .assign:

result = (
    same_time
    .loc[
         (same_time["sched_arr_time_fl1"] > same_time["sched_arr_time_fl2"]) &
         ((same_time["sched_arr_time_fl1"] - same_time["sched_arr_time_fl2"]) <= 3),
         ["flight_fl1", "flight_fl2", "origin_fl1", "origin_fl2", "dest", "sched_arr_time_fl1", "sched_arr_time_fl2"]
    ]
)
result['difference'] = result['sched_arr_time_fl1'] - result['sched_arr_time_fl2']
result = result.sort_values('difference', ascending=True)
'''

       flight_fl1  flight_fl2 origin_fl1 origin_fl2 dest  sched_arr_time_fl1  \
1051          127         695        EWR        JFK  MCO                1350   
1313          409         219        EWR        JFK  CLT                1023   
1767         4440           8        EWR        JFK  BUF                1556   
3419          763         399        JFK        JFK  LAX                1011   
5122          745        3676        EWR        LGA  CLT                 826   
4958         1047        2357        EWR        JFK  SAN                2158   
11314         177        1495        JFK        EWR  SFO                2210   
2701          935        2247        EWR        LGA  ATL                1631   
12652        5207        2095        LGA        EWR  CLT                1507   
12405         348         781        LGA        LGA  ATL                1535   
11539         501         371        JFK        LGA  FLL                 854   
4583          245         561        EWR

'\n# Equivalent without .assign:\n\nresult = (\n    same_time\n    .loc[\n         (same_time["sched_arr_time_fl1"] > same_time["sched_arr_time_fl2"]) &\n         ((same_time["sched_arr_time_fl1"] - same_time["sched_arr_time_fl2"]) <= 3),\n         ["flight_fl1", "flight_fl2", "origin_fl1", "origin_fl2", "dest", "sched_arr_time_fl1", "sched_arr_time_fl2"]\n    ]\n)\nresult[\'difference\'] = result[\'sched_arr_time_fl1\'] - result[\'sched_arr_time_fl2\']\nresult = result.sort_values(\'difference\', ascending=True)\n'