In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Path.cwd() is the kernel's current working directory, not necessarily the
# location of this notebook. The relative layout below assumes the kernel was
# started inside the "code" folder, which sits next to the "data" folder.
basic_directory = Path.cwd()
data_directory = basic_directory.parent / "data"        # go one level up and choose folder "data"

# It's a common convention to add a _df suffix to a variable name to indicate it's a DataFrame.
flights_df  = pd.read_csv(data_directory / "flights.csv")
airlines_df = pd.read_csv(data_directory / "airlines.csv")
airports_df = pd.read_csv(data_directory / "airports.csv")
planes_df   = pd.read_csv(data_directory / "planes.csv")
weather_df  = pd.read_csv(data_directory / "weather.csv")

# 'time_hour' comes out of the CSV files as dtype 'object', and a merge on
# date/time values was not possible in that state, so convert the column to
# datetime in both frames.
flights_df['time_hour'] = pd.to_datetime(flights_df['time_hour'])
weather_df['time_hour'] = pd.to_datetime(weather_df['time_hour'])

# After the conversion flights_df was datetime64[ns] while weather_df was
# datetime64[ns, UTC]; remove the UTC timezone so both columns share one dtype.
# (These notes are '#' comments on purpose: a bare string literal as the last
# expression of a cell is echoed back by the kernel as the cell's output.)
weather_df['time_hour'] = weather_df['time_hour'].dt.tz_localize(None)

In [3]:
# Join each flight to its aircraft record (merge on the shared 'tailnum'
# column), then list the flights flown by aircraft N14228, showing only the
# flight, carrier, route and aircraft manufacturer/model columns.

flights_planes = pd.merge(flights_df, planes_df, on='tailnum')

print(
    flights_planes
    .loc[flights_planes['tailnum'].eq('N14228')]
    .loc[:, ['flight', 'carrier', 'origin', 'dest', 'manufacturer', 'model']]
)

      flight carrier origin dest manufacturer    model
398     1416      UA    EWR  AUS       BOEING  737-824
1704    1071      UA    EWR  BQN       BOEING  737-824
3738    1207      UA    EWR  BOS       BOEING  737-824
8185    1707      UA    EWR  TPA       BOEING  737-824


In [4]:
# Attach airport details to every flight by matching the flight's 'dest'
# column against the airports' 'faa' column, then show the flights whose
# destination airport is named 'Los Angeles Intl'.

flights_airports = pd.merge(flights_df, airports_df, left_on='dest', right_on='faa')

print(
    flights_airports
    .loc[flights_airports['name'].eq('Los Angeles Intl')]
    .loc[:, ['flight', 'carrier', 'origin', 'dest', 'name', 'lat', 'lon']]
)

      flight carrier origin dest              name        lat         lon
85       399      VX    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
92         1      AA    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
127      415      VX    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
165     1163      DL    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
194      133      AA    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
...      ...     ...    ...  ...               ...        ...         ...
9742     165      VX    EWR  LAX  Los Angeles Intl  33.942536 -118.408075
9749     763      DL    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
9751     185      AA    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
9753     535      UA    JFK  LAX  Los Angeles Intl  33.942536 -118.408075
9764     763      DL    JFK  LAX  Los Angeles Intl  33.942536 -118.408075

[486 rows x 7 columns]


In [None]:
# Stack the first five 'model' values from planes_df on top of the first five
# 'name' values from airlines_df into one Series with a fresh 0-based index.
# Repeated values are kept as-is; for UNION-style behaviour, call
# .drop_duplicates() on the result to remove duplicate records.

planes_and_companies = pd.concat(
    [planes_df['model'].iloc[:5], airlines_df['name'].iloc[:5]],
    ignore_index=True,
)
print(planes_and_companies)

0                 EMB-145XR
1                  A320-214
2                  A320-214
3                  A320-214
4                 EMB-145LR
5         Endeavor Air Inc.
6    American Airlines Inc.
7      Alaska Airlines Inc.
8           JetBlue Airways
9      Delta Air Lines Inc.
dtype: object
