In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Loading flights data per month from csv into dataframe
# Data from https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236
# Data provided in per month blocks, selected month blocks were filtered by 
# 'Washington' i.e. all flights that arrived or departed from airports that
# were in the database that were in Washington. Nov. 2018 is the latest flight
# data released.

dec_2017_df = pd.read_csv('data-raw/dec_2017_T_ONTIME_REPORTING.csv', index_col=False)
jan_2018_df = pd.read_csv('data-raw/jan_2018_T_ONTIME_REPORTING.csv', index_col=False)
feb_2018_df = pd.read_csv('data-raw/feb_2018_T_ONTIME_REPORTING.csv', index_col=False)
mar_2018_df = pd.read_csv('data-raw/mar_2018_T_ONTIME_REPORTING.csv', index_col=False)
apr_2018_df = pd.read_csv('data-raw/apr_2018_T_ONTIME_REPORTING.csv', index_col=False)
may_2018_df = pd.read_csv('data-raw/may_2018_T_ONTIME_REPORTING.csv', index_col=False)
jun_2018_df = pd.read_csv('data-raw/jun_2018_T_ONTIME_REPORTING.csv', index_col=False)
jul_2018_df = pd.read_csv('data-raw/jul_2018_T_ONTIME_REPORTING.csv', index_col=False)
aug_2018_df = pd.read_csv('data-raw/aug_2018_T_ONTIME_REPORTING.csv', index_col=False)
sep_2018_df = pd.read_csv('data-raw/sep_2018_T_ONTIME_REPORTING.csv', index_col=False)
oct_2018_df = pd.read_csv('data-raw/oct_2018_T_ONTIME_REPORTING.csv', index_col=False)
nov_2018_df = pd.read_csv('data-raw/nov_2018_T_ONTIME_REPORTING.csv', index_col=False)

In [13]:
# Combining separate month dataframes into one year long data frame
flights = pd.concat([dec_2017_df, jan_2018_df, feb_2018_df, mar_2018_df, 
                     apr_2018_df, may_2018_df, jun_2018_df, jul_2018_df,
                     aug_2018_df, sep_2018_df, oct_2018_df, nov_2018_df])

In [31]:
# Load in weather data from https://www.ncdc.noaa.gov/cdo-web/datatools/lcd
weather_2018 = pd.read_csv('data-raw/weather_2018.csv', index_col=False) # data for all of 2018
weather_dec_2017 = pd.read_csv('data-raw/weather_dec_2017.csv', index_col=False) # data for December 2017

# Combine into one dataframe for weather for Dec 2017 - November 2018
weather = pd.concat([weather_dec_2017, weather_2018])

In [24]:
# Check column data types
print(flights.dtypes)

# I notice a column that wasn't described as in the dataset on the BTS website so I drop it
flights = flights.drop('Unnamed: 31', axis = 1)

# Double check it's dropped
print(flights.columns)

YEAR                       int64
QUARTER                    int64
MONTH                      int64
DAY_OF_MONTH               int64
DAY_OF_WEEK                int64
FL_DATE                   object
OP_UNIQUE_CARRIER         object
ORIGIN_AIRPORT_ID          int64
ORIGIN_AIRPORT_SEQ_ID      int64
ORIGIN_CITY_MARKET_ID      int64
ORIGIN                    object
ORIGIN_CITY_NAME          object
DEST_AIRPORT_ID            int64
DEST_AIRPORT_SEQ_ID        int64
DEST_CITY_MARKET_ID        int64
DEST                      object
DEST_CITY_NAME            object
CRS_DEP_TIME               int64
DEP_TIME                 float64
DEP_DELAY                float64
DEP_DELAY_NEW            float64
DEP_DEL15                float64
CANCELLED                float64
DIVERTED                 float64
AIR_TIME                 float64
DISTANCE                 float64
CARRIER_DELAY            float64
WEATHER_DELAY            float64
NAS_DELAY                float64
SECURITY_DELAY           float64
LATE_AIRCR

In [26]:
flights.DEP_TIME

0        2116.0
1        2154.0
2        2043.0
3        2053.0
4        2052.0
5        2103.0
6        2051.0
7        2043.0
8        2102.0
9        2036.0
10       2044.0
11       2101.0
12       2054.0
13       2056.0
14       2117.0
15       2224.0
16       2219.0
17       2224.0
18       2225.0
19       2239.0
20       2229.0
21       2232.0
22       2221.0
23       2226.0
24       2225.0
25       2228.0
26       2228.0
27       2227.0
28       2227.0
29       2225.0
          ...  
23469    1701.0
23470     725.0
23471    1906.0
23472     636.0
23473    1644.0
23474     657.0
23475     934.0
23476     820.0
23477     839.0
23478    1244.0
23479    2240.0
23480    1726.0
23481    1724.0
23482     746.0
23483     730.0
23484    1419.0
23485    1037.0
23486    2021.0
23487    1252.0
23488    1258.0
23489    1917.0
23490    1108.0
23491    1939.0
23492    1741.0
23493    1917.0
23494    1822.0
23495    1838.0
23496    2356.0
23497    2108.0
23498     902.0
Name: DEP_TIME, Length: 