# Preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
flights_sample = pd.read_csv("../../data/processed/flights_sample.csv", index_col=None)

### Some Feature Engineering:

# Change the column dtypes to the correct type for the date columns
flights_sample['Scheduled Departure Time (local time)'] = pd.to_datetime(flights_sample['Scheduled Departure Time (local time)'])
flights_sample['Actual Departure Time (local time)'] = pd.to_datetime(flights_sample['Actual Departure Time (local time)'])
flights_sample['Wheels Off (local time)'] = pd.to_datetime(flights_sample['Wheels Off (local time)'])
flights_sample['Wheels On (local time)'] = pd.to_datetime(flights_sample['Wheels On (local time)'])
flights_sample['Scheduled Arrival Time (local time)'] = pd.to_datetime(flights_sample['Scheduled Arrival Time (local time)'])
flights_sample['Actual Arrival Time (local time)'] = pd.to_datetime(flights_sample['Actual Arrival Time (local time)'])

#Create a new column for the hour of the day for actual departure time and for wheels on time
flights_sample['Actual Departure Hour'] = flights_sample['Actual Departure Time (local time)'].dt.hour  #I don't like that they are FLOATS.. would prefer int but having an error code because of NANs
flights_sample['Wheels On Hour'] = flights_sample['Wheels On (local time)'].dt.hour #I don't like that they are FLOATS.. would prefer int but having an error code because of NANs

# Create a new columns that calculates the difference between the departure delay and arrival delay
flights_sample['Difference in Delay (Dep - Arr [minutes])'] = flights_sample['Departure Delay (minutes)'] - flights_sample['Arrival Delay (minutes)']

#Create departure and arrival state column
flights_sample['Departure State'] = flights_sample['Origin Airport (City, State)'].str[-2:]
flights_sample['Arrival State'] = flights_sample['Destination Airport (City, State)'].str[-2:]

In [3]:
flights_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199350 entries, 0 to 199349
Data columns (total 42 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   Flight Year                                 199350 non-null  int64         
 1   Flight Month                                199350 non-null  int64         
 2   Flight Day                                  199350 non-null  int64         
 3   Flight Weekday                              199350 non-null  int64         
 4   Marketer - Unique Carrier Code              199350 non-null  object        
 5   Operator - Unique Carrier Code              199350 non-null  object        
 6   Different Marketer & Operator Carrier Code  199350 non-null  int64         
 7   Tail Number                                 199350 non-null  object        
 8   Flight Number                               199350 non-null  int64        

# Orientation

In [17]:
# Get a list of all Airport codes
departure_airports = flights_sample['Origin Airport (IATA Code)'].unique()
arrival_airports = flights_sample['Destination Airport (IATA Code)'].unique()
airport_codes = np.concatenate((departure_airports, arrival_airports))
#airport_codes.size
airport_codes

array(['LAS', 'SYR', 'ELP', 'DEN', 'ATL', 'TRI', 'SMF', 'EWR', 'CLT',
       'EUG', 'AUS', 'DCA', 'GRR', 'STL', 'FAI', 'SEA', 'SJC', 'ORD',
       'BWI', 'LFT', 'HOU', 'BOS', 'LAX', 'SFB', 'DTW', 'JLN', 'JFK',
       'PHL', 'MEM', 'DFW', 'PHX', 'IAD', 'SDF', 'KOA', 'MCO', 'BTV',
       'ORF', 'ABQ', 'MKE', 'LGA', 'FLL', 'MDT', 'MIA', 'CLE', 'PWM',
       'SNA', 'HNL', 'HSV', 'IND', 'IAH', 'RNO', 'SLC', 'ROC', 'RSW',
       'SAT', 'CMI', 'MSP', 'LRD', 'PDX', 'SFO', 'RIC', 'MSY', 'MDW',
       'OMA', 'RDU', 'EWN', 'ECP', 'ACY', 'CAK', 'CMH', 'OAK', 'DRO',
       'CVG', 'GEG', 'LIT', 'BNA', 'CHA', 'BUF', 'ANC', 'JAX', 'COS',
       'SAN', 'HPN', 'XNA', 'ONT', 'SRQ', 'AGS', 'ROA', 'SAV', 'RDD',
       'BTR', 'TPA', 'PVD', 'PSP', 'MYR', 'BDL', 'MSN', 'BZN', 'JAN',
       'SAF', 'LWS', 'GRB', 'CAE', 'DAL', 'PBG', 'BUR', 'VPS', 'CHS',
       'PAH', 'FSD', 'LIH', 'ALW', 'OKC', 'OGG', 'GSP', 'COU', 'BOI',
       'SPS', 'BHM', 'GSO', 'MFE', 'ALB', 'ERI', 'IPT', 'LGB', 'GFK',
       'MSO', 'MGM',

Okay, so we need to look at 745x different airports we would need to get the weather from, on a daily basis for 2 years and 7x days

In [16]:
(745 * ((2 * 12) + 1)) / 500

37.25

If we're going by day. that's a total of ~550,000 API calls.. and this is just for the sample.. we could have more than 745 airports to look at.. 

World Weather API is only allowing 500 request a day, so if we can pull 2x yars per API call, we should be able to pull all airports in 2x days
- The local History API can only pull a month at a time.. that's a bummer

https://home.openweathermap.org/history_bulks/new

Allows to do complete history pulls for 10USD a pull.. wow.. 

https://rapidapi.com/iddogino/api/global-weather-history/pricing

This guy allows 10,000 pull a month

In [29]:
#World Weather API can pull a month at a time.. 500x calls a day.. 
500 / 25

20.0

If we can break down in roughly 20 values we could be good.. what about per states?

In [36]:
# Consolidate the States into a single list
departure_states = flights_sample['Departure State'].unique()
arrival_states = flights_sample['Arrival State'].unique()
states = np.concatenate((departure_states, arrival_states))
# remove the duplicates
states = np.unique(states)
states.shape

(53,)

Found this article that is scrapping Weather Underground Data using BeautifulSoup.
https://flowingdata.com/2007/07/09/grabbing-weather-underground-data-with-beautifulsoup/

Will use it as a foundation for ours

In [25]:
!pip install BeautifulSoup4



In [26]:
# Let's use BeautifulSoup to parse Weather Underground's HTML

def getWeather(airport):
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    # set a the value that will store the daily weather
    daily_weather = pd.DataFrame()

    #Iterate through each days between 1 Jan 2018 and 31 Jan 2020
    for year in range(2018,2020):
        for m in range(1,13):
            for d in range(1,32):

                #Check if leap year
                if year%4 == 0:
                    leap = True
                elif year%100 == 0:
                    leap = False
                elif year%400 == 0:
                    leap = True
                else:
                    leap = False

                #Check if already already gone through the month
                if (m == 2 and leap and d>29):
                    continue
                elif (m == 2 and not leap and d>28):
                    continue
                elif (m in [4,6,9,11] and d>30): #original had 10 instead of 11, but 11 is correct (november has 30 days, not october)
                    continue

                # Open the URL and read the HTML
                url = "https://www.wunderground.com/history/daily/us/ny/new-york-city/K{}/date/{}-{}-{}".format(airport,year,m,d)
                page = urlopen(url).read()

                # Get temperature
                soup = BeautifulSoup(page)
                dayTemp = soup.body.nobr.b.string

                # Add to dataframe
                daily_weather = daily_weather.append({'Date': "{}-{}-{}".format(year,m,d), 'Temp': dayTemp, 'Airport': airport}, ignore_index=True)

In [27]:
getWeather('RDU')

AttributeError: 'NoneType' object has no attribute 'b'