Data wrangling for the weather-airports project.

Part 2: Merging the data. Requires data from Parts 1a and 1b to run.

In [1]:
import numpy as np
import pandas as pd
import os
import sys
sys.path.append('D:/Springboard_DataSci/Assignments/Lib')
import TimeTracker
from varname import nameof

In [2]:
save_data = True; save_data = False
START_TIME_LOCAL = 'StartTimeLocal'
END_TIME_LOCAL = 'EndTimeLocal'
START_OR_END = 'StartOrEnd'

pd.options.mode.chained_assignment = None #turns off warnings for data replacements

In [3]:
'''Data collection'''
stopwatch = TimeTracker.TimeTracker()
# Get the weather data.
path = r'D:\Springboard_DataSci\Assignments\Capstone_2--Airport_weather\data'
os.chdir(path)

events_cold = pd.read_csv('events_cold.csv')
events_hail = pd.read_csv('events_hail.csv')
events_sleet = pd.read_csv('events_sleet.csv')
events_wind = pd.read_csv('events_wind.csv')
events_fog = pd.read_csv('events_fog.csv')
events_snow = pd.read_csv('events_snow.csv')
events_rain = pd.read_csv('events_rain.csv')

In [4]:
# Verifies that StartOrEnd codes alternate Start, End, Start, End,...
def verifyStartEnd(df):
    return all(df[START_OR_END][::2] == START_TIME_LOCAL)\
        and all(df[START_OR_END][1::2] == END_TIME_LOCAL)

for event in [events_cold, events_hail, events_sleet, events_wind, events_fog, events_snow, events_rain]:
    print('Events are balanced?', verifyStartEnd(event))

Events are balanced? True
Events are balanced? True
Events are balanced? True
Events are balanced? True
Events are balanced? True
Events are balanced? False
Events are balanced? True


In [5]:
# All codes but the snow are balanced.
events_snow['Balance'] = False
events_snow.Balance[::2] = (events_snow[START_OR_END][::2] == START_TIME_LOCAL)
events_snow.Balance[1::2] = (events_snow[START_OR_END][1::2] == END_TIME_LOCAL)
unbalanced_rows = events_snow[events_snow.Balance == False].index
print(events_snow.iloc[unbalanced_rows]) #Exact same time and place. Just drop them.
events_snow.drop(unbalanced_rows, inplace=True)
print('Snow codes now balanced?', verifyStartEnd(events_snow)) #Balanced now. Let's proceed.
events_snow = events_snow.drop('Balance', axis=1).reset_index(drop=True)

   Airport  SnowCode      StartOrEnd                       Time  Balance
66     JFK       1.0    EndTimeLocal  2016-03-21 09:51:00-04:00    False
67     JFK       1.0  StartTimeLocal  2016-03-21 09:51:01-04:00    False
Snow codes now balanced? True


In [6]:
# Get each event's starting times and ending times back on the same row.
def pivot_on_times(df):
    codename = df.columns[1]
    df = pd.concat([pd.Series(np.repeat(np.arange(len(df)//2), 2), name='NewIndex'), df], axis=1)
    df = df.pivot(index='NewIndex', columns=START_OR_END)
    df.columns = df.columns.droplevel()
    df = df.iloc[:, [0,2,5,4]]
    df.columns = ['Airport', codename, START_TIME_LOCAL, END_TIME_LOCAL]
    return df

In [7]:
events_cold, events_hail, events_sleet, events_wind, events_fog, events_snow, events_rain = (
    pivot_on_times(df) for df in (
        events_cold, events_hail, events_sleet, events_wind, events_fog, events_snow, events_rain))

In [8]:
# Get the flight data.
flights = pd.read_csv('flights.csv')

Assign weather codes to each flight, both for departure and arrival. Codes for departure
match to weather at the departure city, and codes for arrival match to weather at the arrival
city.

In [9]:
# This order must correspond to the "for event_type" list below.
flights['DepCold'] = flights['DepHail'] = flights['DepSleet'] = flights['DepWind'] = flights['DepFog']\
    = flights['DepSnow'] = flights['DepRain'] = flights['ArrCold'] = flights['ArrHail'] = flights['ArrSleet']\
    = flights['ArrWind'] = flights['ArrFog'] = flights['ArrSnow'] = flights['ArrRain'] = 0

Loop through the entire list of flights and assign weather codes. We do this first for the departures,
then the arrivals. Begin by sorting by departure airport and time (but not arrival airport--we are just looking
at the departure side for now).

In [10]:
airports = list(pd.unique(flights.ORIGIN))
event_types = [events_cold, events_hail, events_sleet, events_wind, events_fog, events_snow, events_rain]
event_type_names = [name for name in nameof(
    events_cold, events_hail, events_sleet, events_wind, events_fog, events_snow, events_rain)]
assert (set(airports) == set(pd.unique(flights.DEST))), 'Departure and arrival airports are not all the same'

In [11]:
flights.sort_values(by=['ORIGIN','DepartureTime'], inplace=True, ignore_index=True)
codes_start_at = flights.columns.get_indexer_for(['DepCold'])[0] # 9
for i, event_type in enumerate(event_types):
    print('Entering departure data for', event_type_names[i]) # Rain codes will take awhile.
    for airport in airports:
        events_at_this_airport = event_type[event_type.Airport == airport].reset_index(drop=True)
        for j in events_at_this_airport.index:
            individual_event = pd.Series(events_at_this_airport.iloc[j])
            weather_indices = flights[(flights.DepartureTime >= individual_event.StartTimeLocal) & (
                flights.DepartureTime <= individual_event.EndTimeLocal) & (flights.ORIGIN == airport)].index
            flights.iloc[weather_indices.to_numpy(), i+codes_start_at] = individual_event.iloc[1]

Entering departure data for events_cold
Entering departure data for events_hail
Entering departure data for events_sleet
Entering departure data for events_wind
Entering departure data for events_fog
Entering departure data for events_snow
Entering departure data for events_rain


In [12]:
# Repeat with the arrivals.
flights.sort_values(by=['DEST','ArrivalTime'], inplace=True, ignore_index=True)
codes_start_at = flights.columns.get_indexer_for(['ArrCold'])[0] # 16
for i, event_type in enumerate(event_types):
    print('Entering arrival data for', event_type_names[i]) # Rain codes will take awhile.
    for airport in airports:
        events_at_this_airport = event_type[event_type.Airport == airport].reset_index(drop=True)
        for j in events_at_this_airport.index:
            individual_event = pd.Series(events_at_this_airport.iloc[j])      
            weather_indices = flights[(flights.ArrivalTime >= individual_event.StartTimeLocal) & (
                flights.ArrivalTime <= individual_event.EndTimeLocal) & (flights.DEST == airport)].index
            flights.iloc[weather_indices.to_numpy(), i+codes_start_at] = individual_event.iloc[1]

Entering arrival data for events_cold
Entering arrival data for events_hail
Entering arrival data for events_sleet
Entering arrival data for events_wind
Entering arrival data for events_fog
Entering arrival data for events_snow
Entering arrival data for events_rain


In [14]:
flights.rename(columns={"WEATHER_DELAY": "WeatherDelayLength"}, inplace=True)
if save_data:
    print('Saving data')
    flights.to_csv('flights_with_weather.csv', index=False)

print('Total runtime:', stopwatch.getElapsedTime())

Total runtime: --- 9.39 minutes ---
