In [1]:
import pandas as pd
from pathlib import Path

In [2]:
basic_directory = Path.cwd()                            # directory of the file - folder "code"
data_directory = basic_directory.parent / "data"        # go one level up and choose folder "data"

# It's a common convention to add a _df suffix to a variable name to indicate it's a DataFrame.
flights_df  = pd.read_csv(data_directory / "flights.csv")
airlines_df = pd.read_csv(data_directory / "airlines.csv")
airports_df = pd.read_csv(data_directory / "airports.csv")
planes_df   = pd.read_csv(data_directory / "planes.csv")
weather_df  = pd.read_csv(data_directory / "weather.csv")

# Convert to datetime
flights_df['time_hour'] = pd.to_datetime(flights_df['time_hour'])
weather_df['time_hour'] = pd.to_datetime(weather_df['time_hour'])

# Remove timezone
weather_df['time_hour'] = weather_df['time_hour'].dt.tz_localize(None)

'''
Convert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.
This was necessary because a merge on date/time types was not possible otherwise.

The dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.
'''

"\nConvert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.\nThis was necessary because a merge on date/time types was not possible otherwise.\n\nThe dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.\n"

In [10]:
# Find the aircraft with the most flights and its airline

highest_no = planes_df.merge(flights_df, on='tailnum', how='left')
highest_no = highest_no.merge(airlines_df, on='carrier', how='left')

result = (
    highest_no
    .groupby(['tailnum', 'carrier', 'name'])
    .size()                                       # Equivalent to COUNT(*)
    .reset_index(name='flight_count')             # Rename the resulting column
    .sort_values('flight_count', ascending=False) # ORDER BY flight_count DESC
)
print(result)
# The aircraft with the most flights are N711MQ and N335AA, with 18 flights
'''
Original SQL: Find the aircraft with the most flights and its airline

SELECT pl.tailnum, fl.carrier, al.name AS airline_name, COUNT(*) AS flight_count
FROM planes AS pl
LEFT JOIN flights AS fl
USING(tailnum)
LEFT JOIN airlines AS al 
USING(carrier)
GROUP BY pl.tailnum, fl.carrier, al.name  -- Group by all three fields
ORDER BY flight_count DESC;

The aircraft with the most flights are N711MQ and N335AA, with 18 flights
'''

     tailnum carrier                         name  flight_count
616   N335AA      AA       American Airlines Inc.            18
1599  N711MQ      MQ                    Envoy Air            18
315   N183JB      B6              JetBlue Airways            17
629   N339AA      AA       American Airlines Inc.            17
1420  N632JB      B6              JetBlue Airways            17
...      ...     ...                          ...           ...
2257  N960AT      FL  AirTran Airways Corporation             1
2259  N961AT      FL  AirTran Airways Corporation             1
937   N426AA      AA       American Airlines Inc.             1
2264  N963AT      FL  AirTran Airways Corporation             1
2280  N969AT      FL  AirTran Airways Corporation             1

[2328 rows x 4 columns]


'\nOriginal SQL: Find the aircraft with the most flights and its airline\n\nSELECT pl.tailnum, fl.carrier, al.name AS airline_name, COUNT(*) AS flight_count\nFROM planes AS pl\nLEFT JOIN flights AS fl\nUSING(tailnum)\nLEFT JOIN airlines AS al \nUSING(carrier)\nGROUP BY pl.tailnum, fl.carrier, al.name  -- Group by all three fields\nORDER BY flight_count DESC;\n\nThe aircraft with the most flights are N711MQ and N335AA, with 18 flights\n'

In [8]:
# Before splitting into carriers, there were 3322 lines; after splitting, there were 3326 lines.
# The query above shows which aircraft and how many flew for more than one airline

result = (
    flights_df
    .groupby('tailnum')['carrier']
    .nunique()                                    # Equivalent to COUNT(DISTINCT carrier)
    .reset_index(name='carriers_count')           # Move 'tailnum' from index to a column
    .query("carriers_count > 1")                  # Equivalent to HAVING
    .sort_values('carriers_count', ascending=False)
)
print(result.head())

'''
SELECT tailnum, COUNT(DISTINCT carrier) as carriers_count
FROM flights
GROUP BY tailnum
HAVING COUNT(DISTINCT carrier) > 1
ORDER BY carriers_count DESC;
'''

     tailnum  carriers_count
414   N228PQ               2
423   N232PQ               2
2770  N977AT               2
2789  N990AT               2


'\nSELECT tailnum, COUNT(DISTINCT carrier) as carriers_count\nFROM flights\nGROUP BY tailnum\nHAVING COUNT(DISTINCT carrier) > 1\nORDER BY carriers_count DESC;\n'

In [None]:
#A simple query for FULL JOIN

full_join = airlines_df.merge(flights_df, on='carrier', how='outer')

result = (
    full_join
    .loc[full_join['carrier'].isin(['UA', 'AA']),
    ['carrier', 'name', 'month', 'minute', 'dep_time']]
)
print(result)
# .isin checks if a value in a column is present in the provided list.
'''
Original SQL: A simple query for FULL JOIN

SELECT carrier, al.name, fl.month, fl.minute, fl.dep_time
FROM airlines as al
FULL JOIN flights as fl
USING(carrier)
WHERE carrier='UA' OR carrier='AA';
'''

     carrier                    name  month  minute  dep_time
521       AA  American Airlines Inc.      5      55    1951.0
522       AA  American Airlines Inc.      7      45    1336.0
523       AA  American Airlines Inc.      4       0    1458.0
524       AA  American Airlines Inc.      3      30     847.0
525       AA  American Airlines Inc.      6      15    1311.0
...      ...                     ...    ...     ...       ...
8847      UA   United Air Lines Inc.      9      30     628.0
8848      UA   United Air Lines Inc.      1       0    1949.0
8849      UA   United Air Lines Inc.      4      45     841.0
8850      UA   United Air Lines Inc.     12       5    1502.0
8851      UA   United Air Lines Inc.     12      56    1803.0

[2699 rows x 5 columns]


"\n-- Простой запрос на FULL JOIN\nSELECT carrier, al.name, fl.month, fl.minute, fl.dep_time\nFROM airlines as al\nFULL JOIN flights as fl\nUSING(carrier)\nWHERE carrier='UA' OR carrier='AA';\n"

In [None]:
# A simple query for CROSS JOIN

cross_join = flights_df.merge(airlines_df, how='cross', suffixes=("_flights", "_airlines"))

result = (
    cross_join
    .loc[(cross_join['origin']=='JFK') & (cross_join['carrier_airlines']=='HA'),
         ['dest', 'origin', 'name']]
)
print(result)


# Each row from flights_df is joined with every row from airlines_df.
# If carrier_flights == "HA", it would ONLY select rows where the carrier in FLIGHTS_DF is Hawaiian Airlines.
# If carrier_airlines == "HA", it selects ALL flights but only keeps rows where the AIRLINES_DF lookup points to HAWAIIAN.

'''
Original SQL: A simple query for CROSS JOIN

SELECT fl.dest, fl.origin, al.name
FROM flights AS fl
CROSS JOIN airlines AS al
WHERE fl.origin IN ('JFK') AND al.carrier IN ('HA');
'''

       dest origin                    name
72      TPA    JFK  Hawaiian Airlines Inc.
88      BOS    JFK  Hawaiian Airlines Inc.
104     SFO    JFK  Hawaiian Airlines Inc.
120     IAD    JFK  Hawaiian Airlines Inc.
168     CHS    JFK  Hawaiian Airlines Inc.
...     ...    ...                     ...
159880  IND    JFK  Hawaiian Airlines Inc.
159896  SAN    JFK  Hawaiian Airlines Inc.
159912  SEA    JFK  Hawaiian Airlines Inc.
159928  LAX    JFK  Hawaiian Airlines Inc.
159960  PBI    JFK  Hawaiian Airlines Inc.

[3276 rows x 3 columns]


"\n-- Простой запрос на CROSS JOIN\nSELECT fl.dest, fl.origin, al.name\nFROM flights AS fl\nCROSS JOIN airlines AS al\nWHERE fl.origin IN ('JFK') AND al.carrier IN ('HA');\n"

In [11]:
# A simple query for CROSS JOIN

self_join = flights_df.merge(flights_df, on=(['month','day']))

result = (
    flights_df
    .merge(flights_df, on="month", suffixes=("_fl1", "_fl2"))
    .query("day_fl1 != day_fl2") # Equivalent to WHERE
    [["day_fl1", "day_fl2", "month"]]
)
print(result.head(40))

'''
Original SQL: SELF JOIN (comparing a column with different values within the same table).

-- This query compares days within the same month from the flights table.
SELECT fl1.day AS day1, fl2.day AS day2, fl1.month
FROM flights AS fl1
INNER JOIN flights AS fl2
ON fl1.month=fl2.month
AND fl1.day<>fl2.day
LIMIT 40;
/* The query logic (simplified):
It first takes "January 1st with all other days in January",
then "January 2nd with all other days", and so on.
*/;
'''

    day_fl1  day_fl2  month
1        27        9      6
2        27        4      6
3        27       17      6
4        27       14      6
5        27       24      6
6        27        7      6
7        27       28      6
8        27       26      6
9        27       19      6
10       27       22      6
11       27        2      6
12       27       21      6
13       27       30      6
14       27       13      6
15       27       20      6
16       27       16      6
17       27       18      6
18       27       17      6
19       27        3      6
20       27       12      6
21       27       23      6
22       27       23      6
23       27       22      6
24       27       28      6
26       27        3      6
27       27       24      6
28       27        1      6
29       27        3      6
30       27        1      6
31       27       20      6
32       27       28      6
33       27        7      6
34       27       30      6
35       27       20      6
36       27        4

'\nOriginal SQL: SELF JOIN (comparing a column with different values within the same table).\n\n-- This query compares days within the same month from the flights table.\nSELECT fl1.day AS day1, fl2.day AS day2, fl1.month\nFROM flights AS fl1\nINNER JOIN flights AS fl2\nON fl1.month=fl2.month\nAND fl1.day<>fl2.day\nLIMIT 40;\n/* The query logic (simplified):\nIt first takes "January 1st with all other days in January",\nthen "January 2nd with all other days", and so on.\n*/;\n'

In [21]:
# This SQL query finds pairs of flights with the same destination (dest)
# that arrive on the same day with an interval of no more than 5 minutes

same_time = flights_df.merge(flights_df, on=["dest", "day", "month"], suffixes=("_fl1", "_fl2"))

result = (
    same_time
    .loc[
         (same_time["sched_arr_time_fl1"] > same_time["sched_arr_time_fl2"]) &
         ((same_time["sched_arr_time_fl1"] - same_time["sched_arr_time_fl2"]) <= 3),
         ["flight_fl1", "flight_fl2", "origin_fl1", "origin_fl2", "dest", "sched_arr_time_fl1", "sched_arr_time_fl2"]
    ]
    .assign(difference=lambda df: df["sched_arr_time_fl1"] - df["sched_arr_time_fl2"])
    .sort_values("difference", ascending=True)
)
print(result)
print(result.shape)
'''
Original SQL:

SELECT fl1.flight AS first_flight, fl2.flight AS second_flight, fl1.origin AS first_origin, fl2.origin AS second_origin, fl1.dest, fl2.sched_arr_time AS second_arr_time, fl1.sched_arr_time AS first_arr_time, fl1.sched_arr_time - fl2.sched_arr_time AS difference
FROM flights AS fl1
INNER JOIN flights AS fl2
ON fl1.dest = fl2.dest
AND fl1.day = fl2.day
AND fl1.month = fl2.month
AND fl1.sched_arr_time > fl2.sched_arr_time
AND (fl1.sched_arr_time - fl2.sched_arr_time) <= 5
ORDER BY difference ASC;
'''

       flight_fl1  flight_fl2 origin_fl1 origin_fl2 dest  sched_arr_time_fl1  \
1051          127         695        EWR        JFK  MCO                1350   
1313          409         219        EWR        JFK  CLT                1023   
1767         4440           8        EWR        JFK  BUF                1556   
3419          763         399        JFK        JFK  LAX                1011   
4958         1047        2357        EWR        JFK  SAN                2158   
5122          745        3676        EWR        LGA  CLT                 826   
11314         177        1495        JFK        EWR  SFO                2210   
2701          935        2247        EWR        LGA  ATL                1631   
12652        5207        2095        LGA        EWR  CLT                1507   
4583          245         561        EWR        LGA  DEN                1024   
11539         501         371        JFK        LGA  FLL                 854   
12405         348         781        LGA

'\nOriginal SQL:\n\nSELECT fl1.flight AS first_flight, fl2.flight AS second_flight, fl1.origin AS first_origin, fl2.origin AS second_origin, fl1.dest, fl2.sched_arr_time AS second_arr_time, fl1.sched_arr_time AS first_arr_time, fl1.sched_arr_time - fl2.sched_arr_time AS difference\nFROM flights AS fl1\nINNER JOIN flights AS fl2\nON fl1.dest = fl2.dest\nAND fl1.day = fl2.day\nAND fl1.month = fl2.month\nAND fl1.sched_arr_time > fl2.sched_arr_time\nAND (fl1.sched_arr_time - fl2.sched_arr_time) <= 5\nORDER BY difference ASC;\n'