In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict


class Ohare(object):
	"""
	Delay summaries for flights into and out of O'Hare (airport code 'ORD').

	The constructor reads a flights CSV, splits it into outbound rows
	(Origin == 'ORD') and inbound rows (everything else), and caches the
	airport-wide baselines used by the per-airport delay tables.

	Usage:

	faa = Ohare()

	faa.get_ob_delays()

	faa.get_ib_delays()

	faa.get_baselines()

	faa.get_mini_ob_flights()
	"""
	
	# File read by the constructor when no explicit path is supplied.
	DEFAULT_CSV_PATH = '../data/ohare/01_2018_ORD_AIR_TFFX.csv'
	
	# Airports reported in the per-airport delay tables, grouped under a region label.
	# Only the airport codes are used by the methods below; the labels are informational.
	regions = {'SE': ['ATL', 'DFW'],
	           'SW': ['LAX', 'LAS'],
	           'NW': ['SFO', 'DEN'],
	           'NE': ['JFK', 'EWR']
	           }
	
	def __init__(self, csv_path=None):
		"""
		Load the flight records and precompute the baselines.

		:param csv_path: path of the CSV to read; defaults to ``DEFAULT_CSV_PATH``.
		"""
		if csv_path is None:
			csv_path = self.DEFAULT_CSV_PATH
		
		self.ord_flights = pd.read_csv(csv_path, low_memory=False)
		
		# Split the data between inbound and outbound flights.
		# NOTE(review): "inbound" is defined as Origin != 'ORD', which presumably relies on
		# the file containing only flights that touch ORD - confirm against the data source.
		mask_from_ord = self.ord_flights.Origin == 'ORD'
		mask_to_ord = self.ord_flights.Origin != 'ORD'
		self.flights_from_ohare = self.ord_flights[mask_from_ord]
		self.flights_to_ohare = self.ord_flights[mask_to_ord]
		
		self.baselines = self.get_baselines()
	
	def get_baselines(self):
		"""
		Airport-wide reference values over ALL O'Hare traffic.

		:return: dict with the mean DepDelayMinutes ('ob_del', 'ib_del') and the
		         maximum TaxiOut ('ob_taxi_tm', 'ib_taxi_tm') of outbound / inbound flights.
		"""
		ob_del = self.flights_from_ohare.DepDelayMinutes.mean()
		ib_del = self.flights_to_ohare.DepDelayMinutes.mean()
		
		# NOTE(review): the taxi figures are maxima while the delay figures are means - confirm intended.
		ob_taxi_del = self.flights_from_ohare.TaxiOut.max()
		ib_taxi_del = self.flights_to_ohare.TaxiOut.max()
		
		return {
			'ob_del':     ob_del,
			'ib_del':     ib_del,
			'ob_taxi_tm': ob_taxi_del,
			'ib_taxi_tm': ib_taxi_del
		}
	
	def get_ports(self, port, column_name, df):
		"""Filters a DF on a given airport. Convenience Method"""
		mask = (df[column_name] == port)
		return df[mask]
	
	def _regional_delays(self, flights, column_name, baseline_key):
		"""Mean DepDelayMinutes per airport in ``regions`` plus the matching baseline as 'ORD_AVG'."""
		delays = defaultdict(float)
		
		for airports in self.regions.values():
			for port in airports:
				# An airport with no matching rows yields NaN (mean of an empty column).
				delays[port] = self.get_ports(port, column_name, flights).DepDelayMinutes.mean()
		
		delays['ORD_AVG'] = self.baselines[baseline_key]
		return delays
	
	def get_ib_delays(self, as_df=True):
		"""
		For flights that are arriving to O'hare, how long are the delays?

		Rows are matched on their 'Origin' airport and averaged over DepDelayMinutes.

		:param as_df: when True return a one-row DataFrame, otherwise the underlying mapping.
		:return: per-airport means plus 'ORD_AVG' (the inbound baseline).
		"""
		ohare_ib_delays = self._regional_delays(self.flights_to_ohare, 'Origin', 'ib_del')
		
		if not as_df:
			return ohare_ib_delays
		return pd.DataFrame(ohare_ib_delays, index=[0])
	
	def get_ob_delays(self, as_df=True):
		"""
		For flights that are leaving from O'hare, how long are the delays?

		Rows are matched on their 'Dest' airport and averaged over DepDelayMinutes.

		:param as_df: when True return a one-row DataFrame, otherwise the underlying mapping.
		:return: per-airport means plus 'ORD_AVG' (the outbound baseline).
		"""
		ohare_outb_delays = self._regional_delays(self.flights_from_ohare, 'Dest', 'ob_del')
		
		if not as_df:
			return ohare_outb_delays
		return pd.DataFrame(ohare_outb_delays, index=[0])
	
	def get_mini_ob_flights(self):
		"""Outbound flights reduced to the six columns used for modelling."""
		return self.flights_from_ohare[['DepDelay', 'Reporting_Airline', 'Dest', 'ArrDelay', 'Quarter', 'DayOfWeek']]


In [4]:
# Build the O'Hare helper; its constructor reads the flights CSV and precomputes the baselines.
faa = Ohare()

In [6]:
# Preview the first rows of the reduced outbound-flight frame.
faa.get_mini_ob_flights().head()

Unnamed: 0,DepDelay,Reporting_Airline,Dest,ArrDelay,Quarter,DayOfWeek
10,-3.0,UA,SFO,-5.0,1,6
13,-2.0,UA,BWI,-12.0,1,6
15,-6.0,UA,CVG,-16.0,1,6
19,5.0,UA,DEN,-12.0,1,6
23,-2.0,UA,TPA,-12.0,1,6


In [8]:
# Outbound flights with a fresh 0..n-1 index, so the dummy frames built below
# line up with the source rows when merged on the index.
ob_flights = faa.get_mini_ob_flights().reset_index(drop=True)

# Append one indicator column per reporting airline.
ob_flights_dum_al = pd.merge(
    ob_flights,
    pd.get_dummies(ob_flights['Reporting_Airline']),
    left_index=True,
    right_index=True,
)

# Append one indicator column per destination airport.
ob_flts_all_dms = pd.merge(
    ob_flights_dum_al,
    pd.get_dummies(ob_flights_dum_al['Dest']),
    left_index=True,
    right_index=True,
)

In [22]:
# Drop every row that has a missing value in any column.
# The name is rebound, so the frame from before the dropna is no longer reachable afterwards.
ob_flts_all_dms = ob_flts_all_dms.dropna()

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Target: the 'DepDelay' column of the encoded outbound-flight frame.
y = ob_flts_all_dms['DepDelay']

In [26]:
# Features: every column from 'ArrDelay' through the last column (label slice, start inclusive).
# NOTE(review): 'ArrDelay' itself stays in the feature set - presumably it is not known
# at departure time; confirm that using it to predict 'DepDelay' is intended.
X = ob_flts_all_dms.loc[:,'ArrDelay':]

In [28]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=51)

In [31]:
# Overall mean of the training target, stored for later use and shown as the cell output.
average_delay_trn = np.mean(y_train)
average_delay_trn

13.145062962222667

In [40]:
# Mean training target restricted to rows where DayOfWeek == 1.
np.mean(y_train[X_train.DayOfWeek == 1])

23.54456951627624

In [42]:
# Report the mean training target for each DayOfWeek value 1 through 7.
for day in range(1, 8):
    day_mean = np.mean(y_train[X_train.DayOfWeek == day])
    print(f'The average delay for day of the week: {day}, is {day_mean}.')


The average delay for day of the week: 1, is 23.54456951627624.
The average delay for day of the week: 2, is 12.936670687575392.
The average delay for day of the week: 3, is 10.183932346723044.
The average delay for day of the week: 4, is 6.304249717939075.
The average delay for day of the week: 5, is 13.513365067740754.
The average delay for day of the week: 6, is 7.469704086425552.
The average delay for day of the week: 7, is 15.30825901512214.
