In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# Path.cwd() is the current working directory of the kernel process, not
# necessarily the notebook's own location. This assumes the kernel was started
# in the "code" folder - TODO confirm.
basic_directory = Path.cwd()
data_directory = basic_directory.parent / "data"        # go one level up and choose folder "data"

# It's a common convention to add a _df suffix to a variable name to indicate it's a DataFrame.
flights_df  = pd.read_csv(data_directory / "flights.csv")
airlines_df = pd.read_csv(data_directory / "airlines.csv")
airports_df = pd.read_csv(data_directory / "airports.csv")
planes_df   = pd.read_csv(data_directory / "planes.csv")
weather_df  = pd.read_csv(data_directory / "weather.csv")

# 'time_hour' is loaded from CSV as dtype 'object' in both frames. Convert it
# to a datetime dtype, because a merge on date/time values was not possible
# otherwise.
flights_df['time_hour'] = pd.to_datetime(flights_df['time_hour'])
weather_df['time_hour'] = pd.to_datetime(weather_df['time_hour'])

# After the conversion flights_df['time_hour'] was datetime64[ns] while
# weather_df['time_hour'] was datetime64[ns, UTC]; drop the timezone so both
# columns share the same dtype.
# NOTE(review): this only strips the tz label; confirm that both columns
# describe the same clock before merging on them.
weather_df['time_hour'] = weather_df['time_hour'].dt.tz_localize(None)

"\nConvert the 'time_hour' column in flights_df and weather_df to datetime64[ns] type, as it was 'object'.\nThis was necessary because a merge on date/time types was not possible otherwise.\n\nThe dtype for flights_df was datetime64[ns], while for weather_df it was datetime64[ns, UTC]. Removed the UTC timezone.\n"

In [3]:
# WHERE clause
# Flights that were operated on Boeing 737 aircraft.
#
# SQL equivalent:
#   SELECT *
#   FROM flights
#   WHERE tailnum IN (
#       SELECT tailnum
#       FROM planes
#       WHERE model LIKE '737%'
#   );

# na=False: a plane with a missing model is treated as "not a 737". Without it
# the mask would contain NaN, which .loc refuses to use as a boolean indexer.
planes = planes_df.loc[planes_df['model'].str.startswith('737', na=False), 'tailnum']
flights = flights_df[flights_df['tailnum'].isin(planes)]

# To select only specific columns, pass them to the same .loc call:
# flights = flights_df.loc[
#     flights_df['tailnum'].isin(planes),
#     ["year", "month", "day", "carrier", "flight", "tailnum", "origin", "dest"]
# ]
print(flights)

      year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0     2013      6   27    1335.0            1214       81.0    1426.0   
2     2013      7   31    2007.0            2007        0.0    2358.0   
11    2013      4   16    1827.0            1829       -2.0    2129.0   
12    2013      1   24    1123.0            1125       -2.0    1607.0   
17    2013      7   26    2042.0            2004       38.0    2327.0   
...    ...    ...  ...       ...             ...        ...       ...   
9968  2013      1   29    1949.0            1800      109.0    2258.0   
9971  2013      8   13    1815.0            1700       75.0    2056.0   
9972  2013      4   14     841.0             845       -4.0    1159.0   
9991  2013      6    6     744.0             745       -1.0     905.0   
9993  2013     12   21    1845.0            1836        9.0    2143.0   

      sched_arr_time  arr_delay carrier  flight tailnum origin dest  air_time  \
0               1335       51.0      UA   

"\nSELECT *\nFROM flights\nWHERE tailnum IN (\n    SELECT tailnum\n    FROM planes\n    WHERE model LIKE '737%'\n);\n"

In [4]:
# Same as the first query, but with the 'model' column from planes_df attached to the flights.

# na=False: a plane with a missing model is treated as "not a 737". Without it
# the mask would contain NaN, which .loc refuses to use as a boolean indexer.
planes_737 = planes_df.loc[planes_df['model'].str.startswith('737', na=False), ['tailnum', 'model']]

# merge() defaults to an inner join, so only flights whose tailnum appears in
# planes_737 remain; the result has ALL columns of flights_df and of planes_737.
merged = flights_df.merge(planes_737, on='tailnum')

# Keep only the columns of interest.
result = merged[["year", "month", "day", "carrier", "flight", "tailnum", "origin", "dest", "model"]]
print(result)

      year  month  day carrier  flight tailnum origin dest      model
0     2013      6   27      UA    1037  N34222    EWR  BOS    737-824
1     2013      7   31      UA    1244  N39418    EWR  SJU  737-924ER
2     2013      4   16      UA    1219  N28457    EWR  SEA  737-924ER
3     2013      1   24      DL     315  N3764D    JFK  SJU    737-832
4     2013      7   26      UA    1695  N13716    EWR  IAH    737-724
...    ...    ...  ...     ...     ...     ...    ...  ...        ...
1514  2013      1   29      UA    1139  N16709    EWR  DFW    737-724
1515  2013      8   13      WN    3928  N797MX    EWR  HOU    737-7H4
1516  2013      4   14      UA    1549  N13750    LGA  IAH    737-724
1517  2013      6    6      WN    3841  N916WN    EWR  MDW    737-7H4
1518  2013     12   21      DL    2357  N3749D    JFK  SAN    737-832

[1519 rows x 9 columns]


In [5]:
# Flights to airports on the east coast.
#
# SQL equivalent:
#   SELECT year, month, day, carrier, flight, tailnum, origin, dest
#   FROM flights
#   WHERE dest IN (
#       SELECT faa
#       FROM airports
#       WHERE lon > -80  -- east of 80 degrees west longitude
#   );

# FAA codes of the airports whose longitude is greater than -80.
airports = airports_df.loc[airports_df['lon'] > -80, 'faa']

# Note: this reassigns `flights`, replacing the Boeing 737 selection made earlier.
flights = flights_df.loc[
    flights_df['dest'].isin(airports),
    ["year", "month", "day", "carrier", "flight", "tailnum", "origin", "dest"]
]

# Alternatively, keep all columns (the SELECT * variant):
# flights = flights_df[flights_df['dest'].isin(airports)]
print(flights)

      year  month  day carrier  flight tailnum origin dest
0     2013      6   27      UA    1037  N34222    EWR  BOS
5     2013      5   31      AA    1762  N3AWAA    JFK  BOS
7     2013      4    1      B6    1307  N348JB    JFK  IAD
9     2013      3   29      US    2126  N956UW    LGA  BOS
26    2013      7   31      US    2144  N956UW    LGA  BOS
...    ...    ...  ...     ...     ...     ...    ...  ...
9975  2013      5   14      AA     854  N3DXAA    JFK  BOS
9978  2013      8   28      B6     118  N355JB    JFK  BOS
9979  2013      2    8      B6      42  N192JB    JFK  SYR
9988  2013      1    8      9E    4220  N824AY    JFK  RDU
9999  2013     11   18      EV    4695  N11548    EWR  RDU

[1868 rows x 8 columns]


'\nSELECT *\nFROM flights\nWHERE dest IN (\n    SELECT faa\n    FROM airports\n    WHERE lon > -80  -- восточнее 80° западной долготы\n);\n'

In [6]:
# SELECT clause
# For every origin airport, show the number of flights departing from it.
#
# SQL equivalent (correlated subquery in the SELECT list):
#   SELECT DISTINCT origin,
#       (SELECT COUNT(*)
#       FROM flights AS f2
#       WHERE f1.origin = f2.origin
#       ) AS number
#   FROM flights AS f1;

# Number of flights per origin - plays the role of the subquery.
counts = flights_df.groupby('origin').size().reset_index(name='number')

result = (
    flights_df[['origin']]
    .drop_duplicates()              # analogue of SELECT DISTINCT origin
    .merge(counts, on='origin')     # attach the flight count from counts
)

print(result)

  origin  number
0    EWR    3645
1    LGA    3079
2    JFK    3276


'\nSELECT DISTINCT origin, \n    (SELECT COUNT(*)\n    FROM flights AS f2\n    WHERE f1.origin = f2.origin\n    ) AS number\nFROM flights AS f1;\n'

In [7]:
# Alternative to the subquery in SELECT: a plain GROUP BY.
#
# SQL equivalent:
#   SELECT origin, COUNT(*) AS total_flights
#   FROM flights
#   GROUP BY origin;

query = (
    flights_df
    .groupby('origin')
    .size()                                  # COUNT(*) per origin
    .reset_index(name='total_flights')
)
print(query)

  origin  total_flights
0    EWR           3645
1    JFK           3276
2    LGA           3079


'\nSELECT origin, COUNT(*) AS total_flights\nFROM flights\nGROUP BY origin;\n'

In [8]:
# For every plane, show its total number of flights.
#
# SQL equivalent (the pandas column is named "number" rather than flight_count):
#   SELECT DISTINCT tailnum,
#          (SELECT COUNT(*)
#           FROM flights f2
#           WHERE f2.tailnum = f1.tailnum) AS flight_count
#   FROM flights f1
#   WHERE tailnum IS NOT NULL
#   LIMIT 10;

# WHERE tailnum IS NOT NULL
notnull = flights_df[flights_df['tailnum'].notna()]

# Number of flights per tailnum - plays the role of the subquery.
counts = notnull.groupby('tailnum').size().reset_index(name="number")

# Take the distinct tail numbers from the filtered frame so that the NULL
# exclusion is explicit instead of relying on the inner merge to drop them.
result = (
    notnull[['tailnum']]
    .drop_duplicates()              # SELECT DISTINCT tailnum
    .merge(counts, on='tailnum')
    .head(10)                       # LIMIT 10
)
print(result)

  tailnum  number
0  N34222       3
1  N926LR       3
2  N39418       4
3  N960DL       4
4  N615JB      16
5  N3AWAA       2
6  N557UA       9
7  N348JB      13
8  N17159       8
9  N956UW       5


'\nSELECT DISTINCT tailnum,\n       (SELECT COUNT(*)\n        FROM flights f2\n        WHERE f2.tailnum = f1.tailnum) AS flight_count\nFROM flights f1\nWHERE tailnum IS NOT NULL\nLIMIT 10;\n'

In [9]:
# FROM clause
# Average flight distance per plane, joined with the plane's manufacturer and model.
#
# SQL equivalent (derived table in FROM):
#   SELECT pl.tailnum, pl.manufacturer, pl.model,
#          route_stats.avg_distance
#   FROM planes AS pl,
#       (SELECT tailnum, ROUND(AVG(distance), 2) AS avg_distance
#       FROM flights AS fl
#       GROUP BY tailnum) AS route_stats
#   WHERE pl.tailnum = route_stats.tailnum
#   ORDER BY avg_distance DESC;

# Derived table: mean distance per tailnum, rounded to 2 decimals.
route_stats = (
    flights_df.groupby("tailnum")["distance"]
    .mean().round(2)
    .reset_index(name="avg_distance")
)

# merge() defaults to an inner join, matching WHERE pl.tailnum = route_stats.tailnum.
result = (
    planes_df.merge(route_stats, on="tailnum")
    [["tailnum", "manufacturer", "model", "avg_distance"]]
    .sort_values("avg_distance", ascending=False)
)

print(result)

     tailnum    manufacturer            model  avg_distance
851   N388HA          AIRBUS         A330-243       4983.00
862   N392HA          AIRBUS         A330-243       4983.00
859   N391HA          AIRBUS         A330-243       4983.00
853   N389HA          AIRBUS         A330-243       4983.00
844   N384HA          AIRBUS         A330-243       4983.00
...      ...             ...              ...           ...
2273  N967UW         EMBRAER  ERJ 190-100 IGW        154.67
2239  N955UW         EMBRAER  ERJ 190-100 IGW        148.80
2037  N8968E  BOMBARDIER INC      CL-600-2B19         94.00
2022  N8928A  BOMBARDIER INC      CL-600-2B19         94.00
1874  N829AY  BOMBARDIER INC      CL-600-2B19         94.00

[2324 rows x 4 columns]


'\nSELECT pl.tailnum, pl.manufacturer, pl.model,\n       route_stats.avg_distance\nFROM planes AS pl,\n    (SELECT tailnum, ROUND(AVG(distance), 2) AS avg_distance\n    FROM flights AS fl\n    GROUP BY tailnum) AS route_stats\nWHERE pl.tailnum = route_stats.tailnum\nORDER BY avg_distance DESC;\n'

In [10]:
# Carrier/month combinations whose average departure delay exceeds 15,
# together with the airline name and the number of flights counted.
#
# SQL equivalent (derived table in FROM):
#   SELECT a.name, a.carrier,
#          delay_stats.month,
#          delay_stats.avg_delay,
#          delay_stats.flight_count
#   FROM airlines AS a,
#        (SELECT carrier, month,
#                ROUND(AVG(dep_delay), 2) AS avg_delay,
#                COUNT(*) AS flight_count
#         FROM flights
#         WHERE dep_delay IS NOT NULL
#         GROUP BY carrier, month) AS delay_stats
#   WHERE a.carrier = delay_stats.carrier
#     AND delay_stats.avg_delay > 15
#   ORDER BY delay_stats.avg_delay DESC;

# Derived table: per (carrier, month) mean delay and row count, computed only
# over flights that have a dep_delay value.
delay_stats = (
    flights_df[flights_df['dep_delay'].notna()]
    .groupby(['carrier', 'month'])['dep_delay']
    .agg(avg_delay='mean', flight_count='size')
    .reset_index()
)

# Round before filtering, as the SQL compares the rounded value against 15.
delay_stats['avg_delay'] = delay_stats['avg_delay'].round(2)

result = (
    airlines_df.merge(delay_stats, on='carrier')      # inner join on carrier
    .loc[
        lambda joined: joined['avg_delay'] > 15,
        ['name', 'carrier', 'month', 'avg_delay', 'flight_count']
    ]
    .sort_values('avg_delay', ascending=False)
)

print(result.head(5))
print(result.shape)

                            name carrier  month  avg_delay  flight_count
85   AirTran Airways Corporation      FL      6     140.00             2
74        Frontier Airlines Inc.      F9      7      88.00             1
161           Mesa Airlines Inc.      YV      1      78.00             1
1              Endeavor Air Inc.      9E      2      50.13            31
156       Southwest Airlines Co.      WN      8      45.39            28
(49, 5)


'\nSELECT a.name, a.carrier,\n       delay_stats.month,\n       delay_stats.avg_delay,\n       delay_stats.flight_count\nFROM airlines AS a,\n     (SELECT carrier, month,\n             ROUND(AVG(dep_delay), 2) AS avg_delay,\n             COUNT(*) AS flight_count\n      FROM flights\n      WHERE dep_delay IS NOT NULL\n      GROUP BY carrier, month) AS delay_stats\nWHERE a.carrier = delay_stats.carrier\n  AND delay_stats.avg_delay > 15\nORDER BY delay_stats.avg_delay DESC;\n'