# Data Cleaning

In [None]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

# Show floats with two fixed decimals instead of scientific notation.
def _two_decimal_text(value):
    """Return `value` formatted as fixed-point text with two decimals."""
    return '%.2f' % value


pd.set_option('display.float_format', _two_decimal_text)

## 1. Load Datasets

In [None]:
# Source file: https://www.kaggle.com/usdot/flight-delays#flights.csv

# Main dataset (flights).
# low_memory=False makes pandas parse each file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
df_delayed_flights = pd.read_csv(
    'data/flights.csv',
    low_memory=False,
)

# Complementary datasets (airline and airport tables).
df_airlines = pd.read_csv('data/airlines.csv', low_memory=False)
df_airports = pd.read_csv('data/airports.csv', low_memory=False)

## 2. Summarize the data

In [None]:
# Print the same overview (shape, columns, first rows, summary statistics)
# for every loaded frame, each preceded by its banner.
summary_targets = [
    ('------- Main Dataset, Flights -------', df_delayed_flights),
    ('\n ------- Airports -------', df_airports),
    ('\n ------- Airlines -------', df_airlines),
]

for banner, frame in summary_targets:
    print(banner)
    print(frame.shape)
    print(frame.columns)
    print(frame.head())
    print(frame.describe())

## 3. Data Cleaning

### 3.1. Missing values

In [None]:
# Count the missing values per column for each of the three frames.
missing_value_reports = (
    ("Delayed Flights Missing Values:\n", df_delayed_flights),
    ("Airlines Missing Values:\n", df_airlines),
    ("Airports Missing Values:\n", df_airports),
)

for label, frame in missing_value_reports:
    null_counts = frame.isnull().sum()
    print(label, null_counts)

### Missing Values: CANCELLATION_REASON

In [None]:
print("Total number of flights: ", len(df_delayed_flights))

# The number of missing values in CANCELLATION_REASON is large because no value
# was recorded when the flight was not cancelled.
# Fill the missing values with a newly defined value `NC` = Not Cancelled.
# Re-running this cell is safe: fillna leaves already-filled values untouched.
df_delayed_flights['CANCELLATION_REASON'] = df_delayed_flights['CANCELLATION_REASON'].fillna('NC')

# Verify the fill instead of computing a null count whose result is discarded.
assert df_delayed_flights['CANCELLATION_REASON'].notnull().all(), \
    "CANCELLATION_REASON still contains missing values"

# Flights that carry a recorded cancellation reason (anything other than `NC`).
cancelled_reasons = df_delayed_flights.loc[
    df_delayed_flights['CANCELLATION_REASON'] != 'NC',
    'CANCELLATION_REASON',
]
print("Cancelled flights: ", len(cancelled_reasons))
print(cancelled_reasons)


### Missing Values: ARRIVAL_DELAY

In [None]:
# Treat zero arrival delays as missing so they do not count towards the average.
# This works on df_delayed_flights because df_merge is only created later, in
# the merging section; referencing it here fails on a fresh kernel.
# The match uses the number 0, not the text '0.00': '0.00' is only how the
# float display option renders the value.
# NOTE(review): assumes ARRIVAL_DELAY is a numeric column - confirm via dtypes.
# The result is kept in a separate Series so the source frame is not mutated.
arrival_delay_nonzero = df_delayed_flights['ARRIVAL_DELAY'].replace(0, np.nan)

# Series.mean() skips NaN, so both zero and missing delays are excluded here.
avg_delay = arrival_delay_nonzero.mean()
print("Avg Arrival Delay =", avg_delay)


### 3.2. Delete unneeded Columns

In [None]:
# Keep only the airport code and airport name; the other columns are dropped.
needed_airport_columns = ['IATA_CODE', 'AIRPORT']
df_airports = df_airports[needed_airport_columns]
print(df_airports.columns)


## 4. Removing Outliers

In [None]:
# Box plots of the two delay columns, drawn side by side to expose outliers.
delay_columns = ["DEPARTURE_DELAY", "ARRIVAL_DELAY"]
df_delayed_flights[delay_columns].plot(kind='box')
plt.show()

In [None]:
# Excessive Arrival Delays
# Drop missing delays explicitly so the histogram does not depend on how the
# installed matplotlib/numpy versions treat NaN when choosing the bin range.
arrival_delays = df_delayed_flights['ARRIVAL_DELAY'].dropna()

# Explicit figure/axes so the plot carries its own title and axis labels.
fig, ax = plt.subplots()
ax.hist(arrival_delays, bins=100)
ax.set(
    title='Distribution of ARRIVAL_DELAY',
    xlabel='ARRIVAL_DELAY',
    ylabel='Number of flights',
)
plt.show()

# TODO (TCE): report the mean and standard deviation of ARRIVAL_DELAY with the
# 0.00 values removed first.

## 5. Merging datasets

In [None]:

#df_delayed_flights["AIRLINE_NAME"]=df_delayed_flights.apply(lambda x: df_airlines.loc[df_airlines['IATA_CODE'] == x["AIRLINE"],"AIRLINE"].values[0],axis=1)
#df_delayed_flights["ORIGIN_AIRPORT_NAME"]=df_delayed_flights.apply(lambda x: df_airports.loc[df_airports['IATA_CODE'] == x["ORIGIN_AIRPORT"],"AIRPORT"].values[0],axis=1)

In [None]:
# Attach readable airline and origin-airport names to every flight.
#
# Both lookup tables are keyed by `IATA_CODE`, while the flights frame stores
# the airline code in `AIRLINE` and the origin airport code in `ORIGIN_AIRPORT`.
# Merging the airlines table on `AIRLINE` directly would compare the flights'
# airline code with the lookup's airline *name*, and the airports table has no
# key in common with the flights frame. Each lookup is therefore renamed so its
# code column lines up with the matching flights column first.
# NOTE(review): column roles are taken from the lookups used elsewhere in this
# notebook - confirm against the CSV headers.

# Airline code -> airline name.
airline_lookup = df_airlines[['IATA_CODE', 'AIRLINE']].rename(
    columns={'IATA_CODE': 'AIRLINE', 'AIRLINE': 'AIRLINE_NAME'}
)

# Origin airport code -> airport name.
origin_airport_lookup = df_airports[['IATA_CODE', 'AIRPORT']].rename(
    columns={'IATA_CODE': 'ORIGIN_AIRPORT', 'AIRPORT': 'ORIGIN_AIRPORT_NAME'}
)

# Left joins keep every flight even when a code has no match in a lookup.
# validate='many_to_one' raises if a lookup contains duplicate codes, which
# would otherwise silently duplicate flight rows.
# merge() returns a new frame, so df_delayed_flights itself is left untouched.
df_merge = df_delayed_flights.merge(
    airline_lookup, on='AIRLINE', how='left', validate='many_to_one'
)
df_merge = df_merge.merge(
    origin_airport_lookup, on='ORIGIN_AIRPORT', how='left', validate='many_to_one'
)

In [None]:
# Show the merged frame: first its column index, then the frame itself
# (pandas truncates long frames when printing).
merged_columns = df_merge.columns
print("Merged Dataframe Columns: \n", merged_columns)
print(df_merge)

## Save file

In [None]:
# Write the merged frame to CSV; index=False leaves the row index out of the file.
merged_output_path = 'data/flightsmerged.csv'
df_merge.to_csv(merged_output_path, index=False)