In [1]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# convert scientific notation to decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style('whitegrid')

________________________
# Load Data

### Cleaned, Merged dataset can be downloaded from here: https://www.kaggle.com/arwasheraky/cleaned-flight-delays-2015

In [2]:
df_flights = pd.read_csv('../../Data/flightsmerged.csv', low_memory=False)

In [3]:
df_flights.head()

Unnamed: 0,MONTH,DAY,FLIGHT_NUMBER,TAIL_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,...,ORIGIN_AC,ORIGIN_NAME,ORIGIN_STATE,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_AC,DESTINATION_NAME,DEST_STATE,DEST_LATITUDE,DEST_LONGITUDE
0,1,1,98,N407AS,00:05:00,23:54:00,-11.0,205.0,1448,04:30:00,...,ANC,Ted Stevens Anchorage International Airport,AK,61.17,-150.0,SEA,Seattle-Tacoma International Airport,WA,47.45,-122.31
1,1,1,2336,N3KUAA,00:10:00,00:02:00,-8.0,280.0,2330,07:50:00,...,LAX,Los Angeles International Airport,CA,33.94,-118.41,PBI,Palm Beach International Airport,FL,26.68,-80.1
2,1,1,840,N171US,00:20:00,00:18:00,-2.0,286.0,2296,08:06:00,...,SFO,San Francisco International Airport,CA,37.62,-122.37,CLT,Charlotte Douglas International Airport,NC,35.21,-80.94
3,1,1,258,N3HYAA,00:20:00,00:15:00,-5.0,285.0,2342,08:05:00,...,LAX,Los Angeles International Airport,CA,33.94,-118.41,MIA,Miami International Airport,FL,25.79,-80.29
4,1,1,135,N527AS,00:25:00,00:24:00,-1.0,235.0,1448,03:20:00,...,SEA,Seattle-Tacoma International Airport,WA,47.45,-122.31,ANC,Ted Stevens Anchorage International Airport,AK,61.17,-150.0


In [4]:
# load complementary data
# Source: https://data.world/mattwinter225/2015-usa-weather-avg-max-min

df_weather = pd.read_csv('../../Data/2015_USA_Weather_Data.csv', sep=";")
df_weather.head()

Unnamed: 0,STATION,STATION_NAME,LATITUDE,LONGITUDE,LATLONG,AvgTemp,MaxTemp,MinTemp,StateName,Zip,State,Date
0,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",55.0,58.0,47.0,Massachusetts,1602.0,MA,10/1/15 12:00 AM
1,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",47.0,49.0,44.0,Massachusetts,1602.0,MA,10/2/15 12:00 AM
2,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",45.0,49.0,42.0,Massachusetts,1602.0,MA,10/3/15 12:00 AM
3,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",47.0,53.0,41.0,Massachusetts,1602.0,MA,10/4/15 12:00 AM
4,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",49.0,59.0,44.0,Massachusetts,1602.0,MA,10/5/15 12:00 AM


____________
# Preparing Weather Data

In [6]:
df_weather.isnull().sum()

STATION            0
STATION_NAME       0
LATITUDE           0
LONGITUDE          0
LATLONG            0
AvgTemp            0
MaxTemp            0
MinTemp            0
StateName          0
Zip                0
State           1283
Date               0
dtype: int64

In [7]:
df_weather = df_weather.dropna()

## Remove Columns

In [8]:
df_weather = df_weather[['State','Date','AvgTemp','MaxTemp','MinTemp']]
df_weather.columns

Index(['State', 'Date', 'AvgTemp', 'MaxTemp', 'MinTemp'], dtype='object')

## Add Columns

In [9]:
# Extract the month from date

def extract_month(str_date):
    
    if type(str_date) != str:
        return str_date
    
    date_month = datetime.strptime(str_date, '%m/%d/%y %I:%M %p')
    date_month = date_month.month
    
    return date_month

In [10]:
df_weather['Month'] = df_weather['Date'].apply(extract_month)
df_weather['Month'].value_counts()

5     71397
7     71282
1     71009
3     71001
8     70957
10    70721
12    70380
4     68999
9     68654
11    68337
2     64141
6     52968
Name: Month, dtype: int64

In [11]:
# Group by State and month, to set a unified value

state_avg_temp = df_weather.groupby(by =['State','Month'])['AvgTemp'].mean()
state_min_temp = df_weather.groupby(by =['State','Month'])['MinTemp'].mean()
state_max_temp = df_weather.groupby(by =['State','Month'])['MaxTemp'].mean()

In [12]:
# Default value of new columns

df_weather['State_Avg_Temp'], df_weather['State_Min_Temp'], df_weather['State_Max_Temp'] = 0, 0, 0

In [13]:
unique_states = set(df_weather['State'])
len(unique_states)

50

In [14]:
for state in unique_states:
    
    for idx in df_weather.index[df_weather['State'] == state]:
        this_month = df_weather.loc[idx,'Month']
        
        df_weather.at[idx,'State_Avg_Temp'] = state_avg_temp.loc[(state,this_month)]
        df_weather.at[idx,'State_Min_Temp'] = state_min_temp.loc[(state,this_month)]
        df_weather.at[idx,'State_Max_Temp'] = state_max_temp.loc[(state,this_month)]


In [15]:
df_weather = df_weather[['State','Month','State_Avg_Temp','State_Min_Temp','State_Max_Temp']]
df_weather.head()

Unnamed: 0,State,Month,State_Avg_Temp,State_Min_Temp,State_Max_Temp
0,MA,10,52,45,60
1,MA,10,52,45,60
2,MA,10,52,45,60
3,MA,10,52,45,60
4,MA,10,52,45,60


## Remove Duplicated States

In [16]:
print("Length of DF = ", len(df_weather))
print("Length of Unique States = ", len(set(df_weather['State'])))

Length of DF =  819846
Length of Unique States =  50


In [17]:
# Remove duplicated state,month tupples.

new_df_weather = df_weather.drop_duplicates(subset = ["State",'Month'])
len(new_df_weather)

597

____________________
# Merging

In [18]:
# Keep just the delayed and on_time flights, remove the cancelled and early flights.

new_df = df_flights[(df_flights['CLASS'] == 'On_Time') | (df_flights['CLASS'] == 'Delayed')]
new_df = new_df.drop(columns = ['CANCELLATION_REASON','CANCELLED','DIVERTED'])
print("Original dataset : ",df_flights.shape)
print("Now : ",new_df.shape)

Original dataset :  (5697506, 34)
Now :  (5608236, 31)


In [19]:
df = pd.merge(new_df, new_df_weather, left_on=['ORIGIN_STATE','MONTH'], right_on=['State','Month'], how='left')
df.drop(['State','Month'], axis=1, inplace=True)
df.rename(columns={'State_Avg_Temp':'ORIGIN_AVG_TEMP',
                       'State_Min_Temp':'ORIGIN_MIN_TEMP',
                       'State_Max_Temp':'ORIGIN_MAX_TEMP'} , inplace=True)

df = pd.merge(df, new_df_weather, left_on=['DEST_STATE','MONTH'], right_on=['State','Month'], how='left')
df.drop(['State','Month'], axis=1, inplace=True)
df.rename(columns={'State_Avg_Temp':'DEST_AVG_TEMP',
                       'State_Min_Temp':'DEST_MIN_TEMP',
                       'State_Max_Temp':'DEST_MAX_TEMP'} , inplace=True)

In [20]:
df.columns[-6:]

Index(['ORIGIN_AVG_TEMP', 'ORIGIN_MIN_TEMP', 'ORIGIN_MAX_TEMP',
       'DEST_AVG_TEMP', 'DEST_MIN_TEMP', 'DEST_MAX_TEMP'],
      dtype='object')

_____
# Save file

In [21]:
df.to_csv('../../Data/flightsmerged_final.csv', index=False)