# Exploratory Data Analysis

In [None]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from scipy.stats import kurtosis
from scipy.stats import skew

# Plot styling: use seaborn's 'whitegrid' style for the figures below.
sns.set_style('whitegrid')

# Show floats in pandas output with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

## Data Perspective
- One variable
    - Numeric variables:
        - continuous: average income
        - discrete: population
    - Categorical variables:
        - ordinal: grade
        - nominal: house, apartment, townhouse, etc.
- Multiple variables:
    - Numeric x Numeric
    - Categorical x Numeric
    - Categorical x Categorical

## 1. Load Cleaned Data

In [None]:
# Read the merged flights dataset (path is relative to the working directory).
# low_memory=False parses the file in one pass rather than in chunks.
flights_csv = 'data/flightsmerged.csv'
df = pd.read_csv(flights_csv, low_memory=False)

# List the available columns, then preview the first rows.
print(df.columns)
df.head()

## 2. One Variable

### 2.1. Numeric

In [None]:
# Analyze the arrival and departure delay distributions.

def summarize_delay(delays, label, bins=100):
    """Print summary statistics and draw a histogram for one delay column.

    Parameters
    ----------
    delays : pandas.Series
        Delay values; missing entries are ignored.
    label : str
        Heading printed above the statistics and used as the plot title.
    bins : int, default 100
        Number of histogram bins.
    """
    # Drop missing values once so the histogram and every statistic are
    # computed on exactly the same observations (some matplotlib/NumPy
    # versions also reject NaN when inferring the histogram range).
    observed = delays.dropna()

    print(label)
    print(delays.describe())

    fig, ax = plt.subplots()
    ax.hist(observed, bins=bins)
    ax.set_title(label)
    ax.set_xlabel('Delay (minutes)')
    ax.set_ylabel('# Flights')
    plt.show()

    print("mean : ", observed.mean())
    # ddof=0 is the population variance, i.e. what np.var reports by default.
    # Note that describe() above reports the sample std (ddof=1).
    print("variance  : ", observed.var(ddof=0))
    print("skewness : ", observed.skew())
    print("kurtosis : ", observed.kurtosis())


summarize_delay(df['ARRIVAL_DELAY'], 'Arrival delays (minutes)')
print('')
summarize_delay(df['DEPARTURE_DELAY'], 'Departure delays (minutes)')

##### Conclusion: Both the arrival and departure delay distributions are highly positively skewed (right-skewed), with heavy outliers.

### 2.2. Categorical

In [None]:
# Count flights per cancellation reason once and reuse the result for both
# the printed table and the chart (value_counts scans the whole column).
reason_counts = df['CANCELLATION_REASON'].value_counts()
print(reason_counts)

# Bar chart
plt.figure(figsize=(12,6))
plt.title('# Flight Cancelled')
plt.xlabel('Cancellation Reasons')
plt.ylabel('# Flights')

labels = reason_counts.index
values = reason_counts.values

# One bar per reason, placed at integer positions along the x axis.
x_pos = range(len(labels))
plt.bar(x_pos, values, align='center', alpha=0.5, color=['red','purple','navy','gray'])
plt.xticks(x_pos, labels)

plt.show()

#### Conclusion: 'Weather' is the main reason for flight cancellations.

## 3. Multiple Variables

### 3.1. Numeric X Numeric

In [None]:
# Correlation between the numeric columns.
#
# Select numeric/boolean columns explicitly before calling corr(): since
# pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently
# and raises on string columns (e.g. AIRLINE_NAME). select_dtypes works the
# same way on older and newer pandas versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(16, 9))
heatmap_ax = sns.heatmap(correlation_matrix, square=True)
heatmap_ax.set_title('Correlation between numeric columns')
plt.show()


### 3.2. Numeric X Categorical

In [None]:
# Line charts of arrival delay by month: mean on the left, total on the right.
# Selecting with a list keeps the result a DataFrame, so each plot gets a
# legend entry named after the column.
arrival_by_month = df.groupby('MONTH')[['ARRIVAL_DELAY']]

fig, (mean_ax, total_ax) = plt.subplots(1, 2, figsize=(12, 6))

arrival_by_month.mean().plot(ax=mean_ax)
mean_ax.set_title('Average delay per month')

arrival_by_month.sum().plot(ax=total_ax, color=['purple'])
total_ax.set_title('Delay (minutes) per month')

plt.show()

###### Conclusion: April has the lowest average arrival delay and June has the highest.

## 4. Business Perspective

### 4.1. What is the average delay for each airline?

In [None]:

# Mean delay per airline, rounded to whole minutes and sorted ascending.
by_airline = df.groupby('AIRLINE_NAME')
airline_arr_group = by_airline['ARRIVAL_DELAY'].mean().round().sort_values()
airline_dep_group = by_airline['DEPARTURE_DELAY'].mean().round().sort_values()

# Place both series side by side (aligned on airline name) and draw grouped bars.
mean_delays = pd.concat(
    [
        airline_arr_group.rename('Mean Arrival Delay'),
        airline_dep_group.rename('Mean Departure Delay'),
    ],
    axis=1,
    sort=False,
)
mean_delays.plot.bar(figsize=(16, 8), title='Mean Delays for Airlines')


In [None]:
# Alternative view: one horizontal bar chart per delay type.
#
# Only the x axis (minutes) is shared so the two panels use the same scale.
# The y axis must NOT be shared: the two series are sorted independently, so
# their airline order differs, and a shared categorical axis would make both
# panels display the tick labels of whichever panel was drawn last.
fig = plt.figure(figsize=(12, 16))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2, sharex=ax1)

panels = [
    (ax1, airline_arr_group, 'Average Arrival Delays for Airlines'),
    (ax2, airline_dep_group, 'Average Departure Delays for Airlines'),
]

for axis, mean_delay, panel_title in panels:
    sns.barplot(x=mean_delay.values, y=mean_delay.index, orient='h', ax=axis)
    # Set the labels after plotting: depending on the seaborn version,
    # barplot may overwrite an existing axis label with the index name.
    axis.set_title(panel_title)
    axis.set_xlabel('Average delay (minutes)')
    axis.set_ylabel('Airline')

plt.show()


### 4.2. What are the average arrival and departure delays for each airport?

In [None]:

# Mean delay per airport, rounded to whole minutes and sorted ascending:
# departure delay keyed by origin airport, arrival delay keyed by destination.
departure_by_origin = df.groupby('ORIGIN_AIRPORT')['DEPARTURE_DELAY']
arrival_by_destination = df.groupby('DESTINATION_AIRPORT')['ARRIVAL_DELAY']

origin_airport_group = departure_by_origin.mean().round().sort_values()
dest_airport_group = arrival_by_destination.mean().round().sort_values()

print("Average Departure delays:\n", origin_airport_group)
print("_______________________\n")
print("Average Arrival delays:\n", dest_airport_group)


##### Conclusion:
1. Flights departing from Gustavus Airport have the highest average delays.
2. Flights arriving at St. Cloud Regional Airport have the highest average delays.
3. Flights tend to depart early from Valdez Airport, and flights arriving there also tend to arrive early.

### 4.3. What is the impact of the weather on flights?

In [None]:
# As discussed above, weather is the most common reason for cancellation.
# For more illustration, look at how cancellations are spread over the year.
# NOTE(review): this sums CANCELLED over all cancellation reasons, not only
# weather, and assumes CANCELLED is a 0/1 flag -- TODO confirm.

month_cancel_group = df.groupby('MONTH')['CANCELLED'].sum().sort_values()
print(month_cancel_group)

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(x=month_cancel_group.values, y=month_cancel_group.index, orient='h', ax=ax)

# Set the labels after plotting: depending on the seaborn version, barplot
# may replace an existing y label with the index name ('MONTH').
ax.set_title('Cancelled flights per month')
ax.set_xlabel('# Cancelled flights')
ax.set_ylabel('Month')

plt.show()

#### Conclusion: February has the highest number of cancelled flights, followed by January and March, i.e. the winter months.