In [1]:
# Data source: https://www.kaggle.com/datasets/usdot/flight-delays?resource=download

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

# 0. Setup

### Airline Data

In [3]:
# Load the airline lookup table (IATA carrier code -> full airline name).
airlines_csv = '../archive-2/airlines.csv'
airline_df = pd.read_csv(airlines_csv)
airline_df

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [4]:
# Summarize the airline lookup table: column names, non-null counts and dtypes.
airline_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   IATA_CODE  14 non-null     object
 1   AIRLINE    14 non-null     object
dtypes: object(2)
memory usage: 352.0+ bytes


### Airport Data

In [6]:
# Load the airport lookup table (IATA code, airport name, city, state, country, coordinates).
airports_csv = '../archive-2/airports.csv'
airport_df = pd.read_csv(airports_csv)
airport_df.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [7]:
# Summarize the airport lookup table: column names, non-null counts and dtypes.
airport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   IATA_CODE  322 non-null    object 
 1   AIRPORT    322 non-null    object 
 2   CITY       322 non-null    object 
 3   STATE      322 non-null    object 
 4   COUNTRY    322 non-null    object 
 5   LATITUDE   319 non-null    float64
 6   LONGITUDE  319 non-null    float64
dtypes: float64(2), object(5)
memory usage: 17.7+ KB


### Flight Data

In [8]:
# Load the raw flight records.
# low_memory=False makes pandas parse each column in one pass instead of in
# chunks, so a column cannot end up with mixed inferred types.
flights_csv = '../archive-2/flights.csv'
flights_df = pd.read_csv(flights_csv, low_memory=False)
flights_df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [9]:
# Print the row count, column list and dtypes of the raw flights table.
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5819079 entries, 0 to 5819078
Data columns (total 31 columns):
 #   Column               Dtype  
---  ------               -----  
 0   YEAR                 int64  
 1   MONTH                int64  
 2   DAY                  int64  
 3   DAY_OF_WEEK          int64  
 4   AIRLINE              object 
 5   FLIGHT_NUMBER        int64  
 6   TAIL_NUMBER          object 
 7   ORIGIN_AIRPORT       object 
 8   DESTINATION_AIRPORT  object 
 9   SCHEDULED_DEPARTURE  int64  
 10  DEPARTURE_TIME       float64
 11  DEPARTURE_DELAY      float64
 12  TAXI_OUT             float64
 13  WHEELS_OFF           float64
 14  SCHEDULED_TIME       float64
 15  ELAPSED_TIME         float64
 16  AIR_TIME             float64
 17  DISTANCE             int64  
 18  WHEELS_ON            float64
 19  TAXI_IN              float64
 20  SCHEDULED_ARRIVAL    int64  
 21  ARRIVAL_TIME         float64
 22  ARRIVAL_DELAY        float64
 23  DIVERTED             int64  
 24

In [10]:
# Count the missing values in each column of the raw flights table.
flights_df.isna().sum()

YEAR                         0
MONTH                        0
DAY                          0
DAY_OF_WEEK                  0
AIRLINE                      0
FLIGHT_NUMBER                0
TAIL_NUMBER              14721
ORIGIN_AIRPORT               0
DESTINATION_AIRPORT          0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME           86153
DEPARTURE_DELAY          86153
TAXI_OUT                 89047
WHEELS_OFF               89047
SCHEDULED_TIME               6
ELAPSED_TIME            105071
AIR_TIME                105071
DISTANCE                     0
WHEELS_ON                92513
TAXI_IN                  92513
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME             92513
ARRIVAL_DELAY           105071
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON    5729195
AIR_SYSTEM_DELAY       4755640
SECURITY_DELAY         4755640
AIRLINE_DELAY          4755640
LATE_AIRCRAFT_DELAY    4755640
WEATHER_DELAY          4755640
dtype: int64

#### Remove Rows with Missing Scheduled Times

In [11]:
# Keep only the flights that have a SCHEDULED_TIME value; the raw frame is left untouched.
flights_df_clean = flights_df.dropna(subset=['SCHEDULED_TIME'])

# 1. Clean Tables

### 1.1 Remove Unwanted Columns

In [12]:
# Columns carried forward into the cleaned table; every other column of
# flights_df_clean is left out.
kept_columns = [
    'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
    'DEPARTURE_DELAY', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'DISTANCE',
    'SCHEDULED_ARRIVAL', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED',
    'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY',
    'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
]
# Take an explicit copy so that columns added later are written to an
# independent frame rather than to a slice of flights_df_clean.
flights_df_2 = flights_df_clean[kept_columns].copy()
flights_df_2.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,...,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,ANC,SEA,5,-11.0,...,430,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,LAX,PBI,10,-8.0,...,750,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,SFO,CLT,20,-2.0,...,806,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,LAX,MIA,20,-5.0,...,805,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,SEA,ANC,25,-1.0,...,320,-21.0,0,0,,,,,,


### 1.2 Clean Date

In [13]:
# source: https://cmdlinetips.com/2021/02/combine-year-month-and-day-columns-to-single-date-in-pandas/
# Build a single datetime DATE column from the integer YEAR / MONTH / DAY columns.
# pd.to_datetime can assemble dates directly from a frame whose columns are named
# year / month / day. That is vectorized, whereas joining the parts into a string
# with DataFrame.apply(axis="columns") runs a Python lambda once per row.
cols = ["YEAR", "MONTH", "DAY"]
date_parts = flights_df_2[cols].rename(columns=str.lower)
flights_df_2['DATE'] = pd.to_datetime(date_parts)
flights_df_2.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,...,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE
0,2015,1,1,4,AS,98,ANC,SEA,5,-11.0,...,-22.0,0,0,,,,,,,2015-01-01
1,2015,1,1,4,AA,2336,LAX,PBI,10,-8.0,...,-9.0,0,0,,,,,,,2015-01-01
2,2015,1,1,4,US,840,SFO,CLT,20,-2.0,...,5.0,0,0,,,,,,,2015-01-01
3,2015,1,1,4,AA,258,LAX,MIA,20,-5.0,...,-9.0,0,0,,,,,,,2015-01-01
4,2015,1,1,4,AS,135,SEA,ANC,25,-1.0,...,-21.0,0,0,,,,,,,2015-01-01


In [14]:
# YEAR / MONTH / DAY are dropped here; the DATE column remains in the frame.
date_part_columns = ['YEAR', 'MONTH', 'DAY']
flights_df_3 = flights_df_2.drop(columns=date_part_columns)
flights_df_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5819073 entries, 0 to 5819078
Data columns (total 21 columns):
 #   Column               Dtype         
---  ------               -----         
 0   DAY_OF_WEEK          int64         
 1   AIRLINE              object        
 2   FLIGHT_NUMBER        int64         
 3   ORIGIN_AIRPORT       object        
 4   DESTINATION_AIRPORT  object        
 5   SCHEDULED_DEPARTURE  int64         
 6   DEPARTURE_DELAY      float64       
 7   SCHEDULED_TIME       float64       
 8   ELAPSED_TIME         float64       
 9   DISTANCE             int64         
 10  SCHEDULED_ARRIVAL    int64         
 11  ARRIVAL_DELAY        float64       
 12  DIVERTED             int64         
 13  CANCELLED            int64         
 14  CANCELLATION_REASON  object        
 15  AIR_SYSTEM_DELAY     float64       
 16  SECURITY_DELAY       float64       
 17  AIRLINE_DELAY        float64       
 18  LATE_AIRCRAFT_DELAY  float64       
 19  WEATHER_DELAY        

### 1.3 Convert Day of Week to Names

In [15]:
# List the distinct DAY_OF_WEEK values currently present in the table.
flights_df_3['DAY_OF_WEEK'].unique()

array([4, 5, 6, 7, 1, 2, 3])

In [16]:
# Replace the numeric DAY_OF_WEEK codes with weekday names (1 = Monday ... 7 = Sunday).
# A dict lookup replaces the seven nested np.where calls.
day_names = {
    1: 'Monday',
    2: 'Tuesday',
    3: 'Wednesday',
    4: 'Thursday',
    5: 'Friday',
    6: 'Saturday',
    7: 'Sunday',
}
day_codes = flights_df_3['DAY_OF_WEEK']
# Codes outside 1-7 (and missing codes) fall back to 'Missing'.
mapped_names = day_codes.map(day_names).astype(object).fillna('Missing')
# This cell overwrites its own input column. Values that are already weekday
# names are kept as they are, so running the cell a second time is a no-op
# instead of turning every row into 'Missing'.
already_named = day_codes.isin(list(day_names.values()))
flights_df_3['DAY_OF_WEEK'] = np.where(already_named, day_codes, mapped_names)
flights_df_3.head()

Unnamed: 0,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,ELAPSED_TIME,DISTANCE,...,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE
0,Thursday,AS,98,ANC,SEA,5,-11.0,205.0,194.0,1448,...,-22.0,0,0,,,,,,,2015-01-01
1,Thursday,AA,2336,LAX,PBI,10,-8.0,280.0,279.0,2330,...,-9.0,0,0,,,,,,,2015-01-01
2,Thursday,US,840,SFO,CLT,20,-2.0,286.0,293.0,2296,...,5.0,0,0,,,,,,,2015-01-01
3,Thursday,AA,258,LAX,MIA,20,-5.0,285.0,281.0,2342,...,-9.0,0,0,,,,,,,2015-01-01
4,Thursday,AS,135,SEA,ANC,25,-1.0,235.0,215.0,1448,...,-21.0,0,0,,,,,,,2015-01-01


### 1.4 Calculate Actual Delay Time

In [17]:
# DELAYED_TIME = scheduled duration minus actual elapsed duration.
# Sign convention: negative means the flight took longer than scheduled,
# positive means it took less time than scheduled.
# NOTE(review): this captures only the overrun of the flight's duration. It does not
# include DEPARTURE_DELAY, and the table also carries an ARRIVAL_DELAY column --
# confirm that duration overrun is the intended definition of "delay".
scheduled_duration = flights_df_3['SCHEDULED_TIME']
actual_duration = flights_df_3['ELAPSED_TIME']
flights_df_3['DELAYED_TIME'] = scheduled_duration.sub(actual_duration)
flights_df_3.head()

Unnamed: 0,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,ELAPSED_TIME,DISTANCE,...,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,DELAYED_TIME
0,Thursday,AS,98,ANC,SEA,5,-11.0,205.0,194.0,1448,...,0,0,,,,,,,2015-01-01,11.0
1,Thursday,AA,2336,LAX,PBI,10,-8.0,280.0,279.0,2330,...,0,0,,,,,,,2015-01-01,1.0
2,Thursday,US,840,SFO,CLT,20,-2.0,286.0,293.0,2296,...,0,0,,,,,,,2015-01-01,-7.0
3,Thursday,AA,258,LAX,MIA,20,-5.0,285.0,281.0,2342,...,0,0,,,,,,,2015-01-01,4.0
4,Thursday,AS,135,SEA,ANC,25,-1.0,235.0,215.0,1448,...,0,0,,,,,,,2015-01-01,20.0


### 1.5 Binary Delay Variable

In [18]:
# A flight is flagged as delayed when its elapsed time exceeds its scheduled time by more than 30 mins (i.e. DELAYED_TIME < -30)

In [19]:
# Binary flag: 1 when DELAYED_TIME is below -30.0, otherwise 0.
# Rows whose DELAYED_TIME is NaN compare as False and therefore also get 0.
overran_schedule = flights_df_3['DELAYED_TIME'].lt(-30.0)
flights_df_3['DELAYED'] = overran_schedule.astype(int)
flights_df_3.head()

Unnamed: 0,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,ELAPSED_TIME,DISTANCE,...,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,DELAYED_TIME,DELAYED
0,Thursday,AS,98,ANC,SEA,5,-11.0,205.0,194.0,1448,...,0,,,,,,,2015-01-01,11.0,0
1,Thursday,AA,2336,LAX,PBI,10,-8.0,280.0,279.0,2330,...,0,,,,,,,2015-01-01,1.0,0
2,Thursday,US,840,SFO,CLT,20,-2.0,286.0,293.0,2296,...,0,,,,,,,2015-01-01,-7.0,0
3,Thursday,AA,258,LAX,MIA,20,-5.0,285.0,281.0,2342,...,0,,,,,,,2015-01-01,4.0,0
4,Thursday,AS,135,SEA,ANC,25,-1.0,235.0,215.0,1448,...,0,,,,,,,2015-01-01,20.0,0


In [20]:
# Tally how many flights carry each DELAYED value (1 = flagged as delayed, 0 = not flagged).
flights_df_3['DELAYED'].value_counts()

0    5734441
1      84632
Name: DELAYED, dtype: int64

### 1.6 Convert to Military Time

#### Convert Times to Integers Rather than floats

In [21]:
# Cast both scheduled clock-time columns to int in one astype call; errors='raise'
# makes any value that cannot be converted fail loudly.
# NOTE(review): the earlier info() output already lists both columns as int64,
# so this cast looks like a safeguard rather than a real conversion -- confirm.
clock_time_dtypes = {'SCHEDULED_DEPARTURE': int, 'SCHEDULED_ARRIVAL': int}
flights_df_3 = flights_df_3.astype(clock_time_dtypes, errors='raise')
flights_df_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5819073 entries, 0 to 5819078
Data columns (total 23 columns):
 #   Column               Dtype         
---  ------               -----         
 0   DAY_OF_WEEK          object        
 1   AIRLINE              object        
 2   FLIGHT_NUMBER        int64         
 3   ORIGIN_AIRPORT       object        
 4   DESTINATION_AIRPORT  object        
 5   SCHEDULED_DEPARTURE  int64         
 6   DEPARTURE_DELAY      float64       
 7   SCHEDULED_TIME       float64       
 8   ELAPSED_TIME         float64       
 9   DISTANCE             int64         
 10  SCHEDULED_ARRIVAL    int64         
 11  ARRIVAL_DELAY        float64       
 12  DIVERTED             int64         
 13  CANCELLED            int64         
 14  CANCELLATION_REASON  object        
 15  AIR_SYSTEM_DELAY     float64       
 16  SECURITY_DELAY       float64       
 17  AIRLINE_DELAY        float64       
 18  LATE_AIRCRAFT_DELAY  float64       
 19  WEATHER_DELAY        

In [22]:
# Largest SCHEDULED_DEPARTURE value in the table.
flights_df_3['SCHEDULED_DEPARTURE'].max()

2359

In [23]:
# Smallest SCHEDULED_DEPARTURE value in the table.
flights_df_3['SCHEDULED_DEPARTURE'].min()

1

#### Convert to String and add leading 0's where missing

In [24]:
# Turn the integer clock times into 4-character strings, left-padding with
# zeros where the integer has fewer than four digits (e.g. 5 -> '0005').
for time_col in ('SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL'):
    flights_df_3[time_col] = flights_df_3[time_col].astype(str).str.zfill(4)
flights_df_3.head()

Unnamed: 0,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_TIME,ELAPSED_TIME,DISTANCE,...,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,DELAYED_TIME,DELAYED
0,Thursday,AS,98,ANC,SEA,5,-11.0,205.0,194.0,1448,...,0,,,,,,,2015-01-01,11.0,0
1,Thursday,AA,2336,LAX,PBI,10,-8.0,280.0,279.0,2330,...,0,,,,,,,2015-01-01,1.0,0
2,Thursday,US,840,SFO,CLT,20,-2.0,286.0,293.0,2296,...,0,,,,,,,2015-01-01,-7.0,0
3,Thursday,AA,258,LAX,MIA,20,-5.0,285.0,281.0,2342,...,0,,,,,,,2015-01-01,4.0,0
4,Thursday,AS,135,SEA,ANC,25,-1.0,235.0,215.0,1448,...,0,,,,,,,2015-01-01,20.0,0


In [25]:
# Print (rows, columns) of the cleaned flights table.
print(flights_df_3.shape)

(5819073, 23)


# 2. Add Airline and Airport Data

### 2.1 Add Airline Data

In [26]:
# Attach the full airline name by matching the flight's AIRLINE code against
# IATA_CODE in the airline lookup table.
# Both frames have an AIRLINE column, so pandas suffixes them AIRLINE_x (code,
# from the flights table) and AIRLINE_y (name, from the lookup table).
# This is an inner join: a flight whose code is absent from airline_df would be
# dropped, so report the number of such flights instead of losing them silently.
airline_unmatched = ~flights_df_3['AIRLINE'].isin(airline_df['IATA_CODE'])
print(f"Flights with an AIRLINE code not found in airline_df (dropped by the join): "
      f"{airline_unmatched.sum():,} of {len(flights_df_3):,}")
flights_df_full = flights_df_3.merge(
    airline_df,
    how='inner',
    left_on='AIRLINE',
    right_on='IATA_CODE',
    # Raises MergeError if an IATA_CODE appears more than once in airline_df,
    # which would otherwise duplicate flight rows without any warning.
    validate='many_to_one',
)
flights_df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5819073 entries, 0 to 5819072
Data columns (total 25 columns):
 #   Column               Dtype         
---  ------               -----         
 0   DAY_OF_WEEK          object        
 1   AIRLINE_x            object        
 2   FLIGHT_NUMBER        int64         
 3   ORIGIN_AIRPORT       object        
 4   DESTINATION_AIRPORT  object        
 5   SCHEDULED_DEPARTURE  object        
 6   DEPARTURE_DELAY      float64       
 7   SCHEDULED_TIME       float64       
 8   ELAPSED_TIME         float64       
 9   DISTANCE             int64         
 10  SCHEDULED_ARRIVAL    object        
 11  ARRIVAL_DELAY        float64       
 12  DIVERTED             int64         
 13  CANCELLED            int64         
 14  CANCELLATION_REASON  object        
 15  AIR_SYSTEM_DELAY     float64       
 16  SECURITY_DELAY       float64       
 17  AIRLINE_DELAY        float64       
 18  LATE_AIRCRAFT_DELAY  float64       
 19  WEATHER_DELAY        

In [27]:
# Drop the two code columns (AIRLINE_x from the flights table, IATA_CODE from the lookup table).
airline_code_columns = ['AIRLINE_x', 'IATA_CODE']
flights_df_full2 = flights_df_full.drop(columns=airline_code_columns)

In [28]:
# Give the merged-in airline name column (AIRLINE_y) the plain name AIRLINE.
airline_name_rename = {"AIRLINE_y": "AIRLINE"}
flights_df_full2 = flights_df_full2.rename(columns=airline_name_rename)

In [29]:
# Print (rows, columns) after the airline merge and column cleanup.
print(flights_df_full2.shape)

(5819073, 23)


### 2.2 Add Airport Data

#### Origin Airport

In [30]:
# Attach origin-airport attributes by matching ORIGIN_AIRPORT against IATA_CODE
# in the airport lookup table.
# This is an inner join, so every flight whose ORIGIN_AIRPORT value has no match
# in airport_df is dropped. Count those flights first so the data loss is visible
# in the notebook output rather than only as a smaller row count.
# NOTE(review): the unmatched values are presumably airport identifiers recorded
# in a different coding scheme than IATA -- inspect them and confirm whether
# they should be mapped instead of dropped.
origin_unmatched = ~flights_df_full2['ORIGIN_AIRPORT'].isin(airport_df['IATA_CODE'])
print(f"Flights with an ORIGIN_AIRPORT not found in airport_df (dropped by the join): "
      f"{origin_unmatched.sum():,} of {len(flights_df_full2):,}")
flights_df_full3 = flights_df_full2.merge(
    airport_df,
    how='inner',
    left_on='ORIGIN_AIRPORT',
    right_on='IATA_CODE',
    # Raises MergeError if an IATA_CODE appears more than once in airport_df,
    # which would otherwise duplicate flight rows without any warning.
    validate='many_to_one',
)
flights_df_full3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5332908 entries, 0 to 5332907
Data columns (total 30 columns):
 #   Column               Dtype         
---  ------               -----         
 0   DAY_OF_WEEK          object        
 1   FLIGHT_NUMBER        int64         
 2   ORIGIN_AIRPORT       object        
 3   DESTINATION_AIRPORT  object        
 4   SCHEDULED_DEPARTURE  object        
 5   DEPARTURE_DELAY      float64       
 6   SCHEDULED_TIME       float64       
 7   ELAPSED_TIME         float64       
 8   DISTANCE             int64         
 9   SCHEDULED_ARRIVAL    object        
 10  ARRIVAL_DELAY        float64       
 11  DIVERTED             int64         
 12  CANCELLED            int64         
 13  CANCELLATION_REASON  object        
 14  AIR_SYSTEM_DELAY     float64       
 15  SECURITY_DELAY       float64       
 16  AIRLINE_DELAY        float64       
 17  LATE_AIRCRAFT_DELAY  float64       
 18  WEATHER_DELAY        float64       
 19  DATE                 

In [31]:
# Drop the origin code column and the lookup columns that are not kept;
# AIRPORT and STATE from the lookup table stay in the frame.
origin_columns_to_drop = ['IATA_CODE', 'CITY', 'COUNTRY', 'LATITUDE', 'LONGITUDE',
                          'ORIGIN_AIRPORT']
flights_df_full4 = flights_df_full3.drop(columns=origin_columns_to_drop)

In [32]:
# Label the merged-in airport name and state as belonging to the origin airport.
origin_rename = {"AIRPORT": "ORIGIN_AIRPORT", "STATE": "ORIGIN_STATE"}
flights_df_full4 = flights_df_full4.rename(columns=origin_rename)

In [33]:
# Print (rows, columns) after the origin-airport merge and column cleanup.
print(flights_df_full4.shape)

(5332908, 24)


#### Destination Airport

In [34]:
# Attach destination-airport attributes by matching DESTINATION_AIRPORT against
# IATA_CODE in the airport lookup table.
# This is an inner join, so a flight whose DESTINATION_AIRPORT value has no match
# in airport_df is dropped. Report the number of such flights so any data loss is
# visible in the notebook output.
destination_unmatched = ~flights_df_full4['DESTINATION_AIRPORT'].isin(airport_df['IATA_CODE'])
print(f"Flights with a DESTINATION_AIRPORT not found in airport_df (dropped by the join): "
      f"{destination_unmatched.sum():,} of {len(flights_df_full4):,}")
flights_df_full5 = flights_df_full4.merge(
    airport_df,
    how='inner',
    left_on='DESTINATION_AIRPORT',
    right_on='IATA_CODE',
    # Raises MergeError if an IATA_CODE appears more than once in airport_df,
    # which would otherwise duplicate flight rows without any warning.
    validate='many_to_one',
)
flights_df_full5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5332908 entries, 0 to 5332907
Data columns (total 31 columns):
 #   Column               Dtype         
---  ------               -----         
 0   DAY_OF_WEEK          object        
 1   FLIGHT_NUMBER        int64         
 2   DESTINATION_AIRPORT  object        
 3   SCHEDULED_DEPARTURE  object        
 4   DEPARTURE_DELAY      float64       
 5   SCHEDULED_TIME       float64       
 6   ELAPSED_TIME         float64       
 7   DISTANCE             int64         
 8   SCHEDULED_ARRIVAL    object        
 9   ARRIVAL_DELAY        float64       
 10  DIVERTED             int64         
 11  CANCELLED            int64         
 12  CANCELLATION_REASON  object        
 13  AIR_SYSTEM_DELAY     float64       
 14  SECURITY_DELAY       float64       
 15  AIRLINE_DELAY        float64       
 16  LATE_AIRCRAFT_DELAY  float64       
 17  WEATHER_DELAY        float64       
 18  DATE                 datetime64[ns]
 19  DELAYED_TIME         

In [35]:
# Drop the destination code column and the lookup columns that are not kept;
# AIRPORT and STATE from the lookup table stay in the frame.
destination_columns_to_drop = ['IATA_CODE', 'CITY', 'COUNTRY', 'LATITUDE', 'LONGITUDE',
                               'DESTINATION_AIRPORT']
flights_df_full6 = flights_df_full5.drop(columns=destination_columns_to_drop)

In [36]:
# Label the merged-in airport name and state as belonging to the destination airport.
destination_rename = {"AIRPORT": "DESTINATION_AIRPORT", "STATE": "DESTINATION_STATE"}
flights_df_full6 = flights_df_full6.rename(columns=destination_rename)

In [37]:
# Print (rows, columns) after the destination-airport merge and column cleanup.
print(flights_df_full6.shape)

(5332908, 25)


In [38]:
# Show the schema of the fully merged flights table.
flights_df_full6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5332908 entries, 0 to 5332907
Data columns (total 25 columns):
 #   Column               Dtype         
---  ------               -----         
 0   DAY_OF_WEEK          object        
 1   FLIGHT_NUMBER        int64         
 2   SCHEDULED_DEPARTURE  object        
 3   DEPARTURE_DELAY      float64       
 4   SCHEDULED_TIME       float64       
 5   ELAPSED_TIME         float64       
 6   DISTANCE             int64         
 7   SCHEDULED_ARRIVAL    object        
 8   ARRIVAL_DELAY        float64       
 9   DIVERTED             int64         
 10  CANCELLED            int64         
 11  CANCELLATION_REASON  object        
 12  AIR_SYSTEM_DELAY     float64       
 13  SECURITY_DELAY       float64       
 14  AIRLINE_DELAY        float64       
 15  LATE_AIRCRAFT_DELAY  float64       
 16  WEATHER_DELAY        float64       
 17  DATE                 datetime64[ns]
 18  DELAYED_TIME         float64       
 19  DELAYED              

# 3. Export Data to CSV

### 3.1 Full

In [39]:
# Write the fully merged table to disk as CSV. to_csv is called with its
# defaults, so the DataFrame index is written as the first column.
full_export_path = 'Resources/flight_data_clean.csv'
flights_df_full6.to_csv(full_export_path)

### 3.2 Delayed/Cancelled

In [40]:
# Keep the flights that are either flagged DELAYED or marked CANCELLED, then
# write that subset to its own CSV (index included, as with the full export).
is_delayed = flights_df_full6['DELAYED'].eq(1)
is_cancelled = flights_df_full6['CANCELLED'].eq(1)
flights_df_full7 = flights_df_full6[is_delayed | is_cancelled]
flights_df_full7.to_csv('Resources/delayed_cancelled_flight_data_clean.csv')

In [41]:
# Show the schema and non-null counts of the delayed/cancelled subset.
flights_df_full7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167986 entries, 2 to 5332894
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   DAY_OF_WEEK          167986 non-null  object        
 1   FLIGHT_NUMBER        167986 non-null  int64         
 2   SCHEDULED_DEPARTURE  167986 non-null  object        
 3   DEPARTURE_DELAY      84172 non-null   float64       
 4   SCHEDULED_TIME       167986 non-null  float64       
 5   ELAPSED_TIME         80561 non-null   float64       
 6   DISTANCE             167986 non-null  int64         
 7   SCHEDULED_ARRIVAL    167986 non-null  object        
 8   ARRIVAL_DELAY        80561 non-null   float64       
 9   DIVERTED             167986 non-null  int64         
 10  CANCELLED            167986 non-null  int64         
 11  CANCELLATION_REASON  87425 non-null   object        
 12  AIR_SYSTEM_DELAY     80503 non-null   float64       
 13  SECURITY_DELA