# REGRESSION

In [17]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# sklearn :: evaluation
from sklearn.metrics import mean_squared_error

# Display setup: seaborn figures use the white-grid theme, and pandas prints
# floats with two fixed decimals instead of scientific notation.
sns.set_style('whitegrid')
pd.set_option('display.float_format', lambda value: '%.2f' % value)

# Problem definition

Predict the average expected delay of a flight, defined here as the mean of its departure delay and its arrival delay.

________________________
# Load Data

### The cleaned, merged dataset can be downloaded from: https://www.kaggle.com/arwasheraky/cleaned-flight-delays-2015

In [3]:
# Load the cleaned, merged flights table from the local Data directory.
# low_memory=False makes pandas parse the file in one pass, which avoids
# mixed-type inference across chunks.
df_flights = pd.read_csv('../../Data/flightsmerged.csv', low_memory=False)

In [4]:
# Preview the first five rows of the flights table.
df_flights.head()

Unnamed: 0,MONTH,DAY,FLIGHT_NUMBER,TAIL_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,...,ORIGIN_AC,ORIGIN_NAME,ORIGIN_STATE,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_AC,DESTINATION_NAME,DEST_STATE,DEST_LATITUDE,DEST_LONGITUDE
0,1,1,98,N407AS,00:05:00,23:54:00,-11.0,205.0,1448,04:30:00,...,ANC,Ted Stevens Anchorage International Airport,AK,61.17,-150.0,SEA,Seattle-Tacoma International Airport,WA,47.45,-122.31
1,1,1,2336,N3KUAA,00:10:00,00:02:00,-8.0,280.0,2330,07:50:00,...,LAX,Los Angeles International Airport,CA,33.94,-118.41,PBI,Palm Beach International Airport,FL,26.68,-80.1
2,1,1,840,N171US,00:20:00,00:18:00,-2.0,286.0,2296,08:06:00,...,SFO,San Francisco International Airport,CA,37.62,-122.37,CLT,Charlotte Douglas International Airport,NC,35.21,-80.94
3,1,1,258,N3HYAA,00:20:00,00:15:00,-5.0,285.0,2342,08:05:00,...,LAX,Los Angeles International Airport,CA,33.94,-118.41,MIA,Miami International Airport,FL,25.79,-80.29
4,1,1,135,N527AS,00:25:00,00:24:00,-1.0,235.0,1448,03:20:00,...,SEA,Seattle-Tacoma International Airport,WA,47.45,-122.31,ANC,Ted Stevens Anchorage International Airport,AK,61.17,-150.0


In [5]:
# List every column available in the flights table.
df_flights.columns

Index(['MONTH', 'DAY', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'SCHEDULED_DEPARTURE',
       'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'SCHEDULED_TIME', 'DISTANCE',
       'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DIVERTED',
       'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY',
       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WEATHER_DELAY', 'DATE', 'CLASS', 'AIRLINE_CODE', 'AIRLINE_NAME',
       'ORIGIN_AC', 'ORIGIN_NAME', 'ORIGIN_STATE', 'ORIGIN_LATITUDE',
       'ORIGIN_LONGITUDE', 'DESTINATION_AC', 'DESTINATION_NAME', 'DEST_STATE',
       'DEST_LATITUDE', 'DEST_LONGITUDE'],
      dtype='object')

In [16]:
# Source: https://data.world/mattwinter225/2015-usa-weather-avg-max-min
# Per-station temperature readings; the file is semicolon-delimited.
df_weather = pd.read_csv('../../Data/2015_USA_Weather_Data.csv', sep=";")
# Preview only the first rows rather than rendering the whole frame.
df_weather.head(10)

Unnamed: 0,STATION,STATION_NAME,LATITUDE,LONGITUDE,LATLONG,AvgTemp,MaxTemp,MinTemp,StateName,Zip,State,Date
0,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",55.00,58.00,47.00,Massachusetts,1602.00,MA,10/1/15 12:00 AM
1,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",47.00,49.00,44.00,Massachusetts,1602.00,MA,10/2/15 12:00 AM
2,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",45.00,49.00,42.00,Massachusetts,1602.00,MA,10/3/15 12:00 AM
3,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",47.00,53.00,41.00,Massachusetts,1602.00,MA,10/4/15 12:00 AM
4,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",49.00,59.00,44.00,Massachusetts,1602.00,MA,10/5/15 12:00 AM
5,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",55.00,66.00,45.00,Massachusetts,1602.00,MA,10/6/15 12:00 AM
6,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",56.00,66.00,47.00,Massachusetts,1602.00,MA,10/7/15 12:00 AM
7,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",55.00,62.00,48.00,Massachusetts,1602.00,MA,10/8/15 12:00 AM
8,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",56.00,65.00,48.00,Massachusetts,1602.00,MA,10/9/15 12:00 AM
9,GHCND:USW00094746,WORCESTER MA US,42.27,-71.87,"42.2706, -71.8731",51.00,58.00,40.00,Massachusetts,1602.00,MA,10/10/15 12:00 AM


____________
# Preparing Data

In [9]:
# Number of missing values in each weather column.
df_weather.isna().sum()

STATION            0
STATION_NAME       0
LATITUDE           0
LONGITUDE          0
LATLONG            0
AvgTemp            0
MaxTemp            0
MinTemp            0
StateName          0
Zip                0
State           1283
Date               0
dtype: int64

In [51]:
def extract_month(str_date):
    """Return the month number (1-12) parsed from a weather date string.

    Strings must follow the ``'%m/%d/%y %I:%M %p'`` layout, for example
    ``'10/1/15 12:00 AM'``. Any value whose type is not exactly ``str`` is
    handed back unchanged.
    """
    if type(str_date) == str:
        return datetime.strptime(str_date, '%m/%d/%y %I:%M %p').month

    return str_date

In [52]:
# Derive a Month column from each Date value, then count the rows per month.
df_weather['Month'] = df_weather['Date'].map(extract_month)
df_weather['Month'].value_counts()

5     71523
7     71378
3     71136
1     71127
8     71050
10    70817
12    70481
4     69126
9     68750
11    68428
2     64274
6     53039
Name: Month, dtype: int64

In [54]:
# Mean AvgTemp for every (Month, State) pair, returned as a MultiIndex Series.
# Rows whose State is missing fall out of the result, because groupby drops
# NaN keys by default.
state_weather = (
    df_weather
    .groupby(['Month', 'State'])['AvgTemp']
    .mean()
)
state_weather

Month  State
1      AK      10.68
       AL      43.97
       AR      37.92
       AZ      40.33
       CA      47.11
       CO      26.26
       CT      26.18
       FL      59.72
       GA      44.38
       HI      65.74
       IA      23.75
       ID      28.31
       IL      27.61
       IN      27.26
       KS      32.86
       KY      33.09
       LA      46.83
       MA      26.08
       MD      30.90
       ME      18.58
       MI      16.54
       MN      12.22
       MO      32.60
       MS      45.02
       MT      26.76
       NC      39.68
       ND      16.79
       NE      27.13
       NH      18.24
       NJ      30.39
                ... 
12     ME      35.81
       MI      35.18
       MN      24.71
       MO      43.17
       MS      57.50
       MT      23.55
       NC      55.22
       ND      21.56
       NE      29.35
       NH      36.37
       NJ      50.48
       NM      27.78
       NV      27.42
       NY      42.30
       OH      43.63
       OK      45.43


____

# Feature Engineering

In [None]:
# Keep only the flights whose CLASS is 'On_Time' or 'Delayed' (rows with any
# other CLASS value are discarded), then drop the two cancellation columns.

df = (
    df_flights[df_flights['CLASS'].isin(['On_Time', 'Delayed'])]
    .drop(columns=['CANCELLATION_REASON', 'CANCELLED'])
)
print("Original dataset : ",df_flights.shape)
print("Now : ",df.shape)

## Add a column

In [None]:
# Average delay column --> predicted (target) column.
# AVG_DELAY is the mean of the departure delay and the arrival delay; it is
# NaN whenever either of the two delays is missing.

df['AVG_DELAY'] = (df['DEPARTURE_DELAY'] + df['ARRIVAL_DELAY']) / 2.0
df['AVG_DELAY'].head(10)

In [None]:
# Distribution of the target (AVG_DELAY) as a 50-bin histogram.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['AVG_DELAY'], bins=50, color='purple')
ax.set_xlabel("Average Delay")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# sns.regplot(x = df['DEPARTURE_DELAY'] , y = df['ARRIVAL_DELAY'], color = "r")
# plt.title("DEPARTURE_DELAY vs ARRIVAL_DELAY")
# plt.xlabel("DEPARTURE_DELAY")
# plt.ylabel("ARRIVAL_DELAY")
# plt.show()

## One-hot encoding (`get_dummies`)

In [None]:
# One-hot encode the categorical features with get_dummies.

# MONTH is cast to string so that get_dummies treats it as a category;
# numeric columns would be passed through un-encoded.
# FLIGHT_NUMBER is not part of the encoded set and is left as-is.
df['MONTH'] = df['MONTH'].astype(str)

categorical = ['AIRLINE_NAME','MONTH','DEST_STATE','ORIGIN_STATE','DESTINATION_AC','ORIGIN_AC']

df_dummies = pd.get_dummies(df[categorical])

# Remove dummy columns left behind by an earlier run of this cell before
# concatenating, so re-running the cell never appends duplicate columns.
df = pd.concat([df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies], axis=1)


In [None]:
# Inspect the column index of df at this point in the notebook.
df.columns

## Change Time to Int

In [None]:
# Convert the time to seconds

def time_to_num (time_str):
    """Convert an ``'HH:MM:SS'`` string to the number of seconds since midnight.

    Non-string values are returned unchanged. This covers values that were
    already converted (Python or NumPy integers) as well as missing values
    such as NaN, so the function can be re-applied to a converted column and
    does not fail on missing times.
    """
    if not isinstance(time_str, str):
        return time_str

    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [None]:
# Call the Function:

# Replace both scheduled-time columns with the values returned by time_to_num.
for time_col in ('SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL'):
    df[time_col] = df[time_col].apply(time_to_num)

# Show a few converted rows.
df[['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL']].head()

## Change Date to Int

In [None]:
from datetime import datetime

def date_to_int(time_str):
    """Convert an ISO-format date string to a POSIX timestamp (float seconds).

    Values whose type is exactly ``float`` are returned untouched. Anything
    else is parsed with ``datetime.fromisoformat``; a string without a UTC
    offset yields a naive datetime, which ``timestamp()`` interprets in the
    machine's local time zone.
    """
    if type(time_str) is not float:
        return datetime.fromisoformat(time_str).timestamp()

    return time_str

In [None]:
# Overwrite DATE with the value date_to_int returns for each entry, then
# preview the result.

df['DATE'] = df['DATE'].map(date_to_int)
df['DATE'].head()

## Select Model Columns

In [None]:
# Features: the three time/date columns followed by every one-hot dummy column.
# After a first run, the selection can instead be limited to the features
# whose importance is above 0.0001:
#X_columns = features_imp_0001[0].values

X_columns = ['SCHEDULED_ARRIVAL', 'SCHEDULED_DEPARTURE', 'DATE', *df_dummies.columns]

# Target column (kept as a one-element list, so df[y_column] is a DataFrame).
y_column = ['AVG_DELAY']

len(X_columns)

In [None]:
# Split the data: `threshold` is the training fraction, the rest is held out
# for testing. Missing feature values are filled with 0.0 first.

threshold = 0.7

X = df[X_columns].fillna(0.0)
y = df[y_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1.0-threshold, shuffle=True, random_state=50
)

for label, part in (('X_train', X_train), ('y_train', y_train),
                    ('X_test', X_test), ('y_test', y_test)):
    print(label, part.shape)

In [None]:
# corr_matrix = df.corr()
# corr_matrix.sort_values(by='DEPARTURE_DELAY')

_________
# Training and Testing The Models

In [None]:
# Work on a subset of the data to avoid long waits caused by its enormous
# size: at most 200,000 training rows and 50,000 test rows.

X_train, y_train = X_train.iloc[:200000], y_train.iloc[:200000]
X_test, y_test = X_test.iloc[:50000], y_test.iloc[:50000]

In [None]:
# Random forest with 20 trees. random_state fixes the bootstrap and feature
# sampling so the fitted model and its RMSE are reproducible between runs
# (same seed as the train/test split).
rf_model = RandomForestRegressor(n_estimators=20, random_state=50)
rf_model.fit(X_train, y_train.values.ravel())
rf_pred = rf_model.predict(X_test)

In [None]:
# Gradient boosting with 20 stages. random_state makes the fit reproducible,
# which matters because this model is the one persisted at the end of the
# notebook (same seed as the train/test split).
gb_model = GradientBoostingRegressor(n_estimators=20, random_state=50)
gb_model.fit(X_train, y_train.values.ravel())
gb_pred = gb_model.predict(X_test)

In [None]:
# k-nearest-neighbours regressor with k = 10. The target is flattened to 1-D,
# as for the random forest and gradient boosting models, so that predict()
# returns a 1-D array instead of an (n, 1) column.
# NOTE(review): the features are not scaled, so large-valued columns such as
# DATE dominate the distance metric -- consider standardising before KNN.
knn_model = KNeighborsRegressor(n_neighbors=10)
knn_model.fit(X_train, y_train.values.ravel())
knn_pred = knn_model.predict(X_test)

In [None]:
# Single decision tree with default depth settings. random_state makes the
# split tie-breaking reproducible (same seed as the train/test split); the
# target is flattened to 1-D to match the other models.
tree_model = DecisionTreeRegressor(random_state=50)
tree_model.fit(X_train, y_train.values.ravel())
tree_pred = tree_model.predict(X_test)

____________
# Models Evaluation

In [None]:
# KNN: RMSE on the held-out rows, then predicted vs. true values.
rmse = np.sqrt(mean_squared_error(y_test, knn_pred))
print('RMSE', rmse)

plt.figure(figsize=(12,7))
# Plot the KNN predictions themselves. x/y are passed by keyword and flattened
# to 1-D: y_test is a single-column DataFrame and knn_pred may be an (n, 1)
# array.
sns.regplot(x=np.ravel(y_test), y=np.ravel(knn_pred), color='navy', scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
plt.title('KNN Model')
plt.xlabel('True Value')
plt.ylabel('Predict Value')
plt.show()

In [None]:
# Decision tree: RMSE on the held-out rows, then predicted vs. true values.
rmse = np.sqrt(mean_squared_error(y_test, tree_pred))
print('RMSE', rmse)

plt.figure(figsize=(12,7))
# Plot the decision-tree predictions themselves. x/y are passed by keyword and
# flattened to 1-D because y_test is a single-column DataFrame.
sns.regplot(x=np.ravel(y_test), y=np.ravel(tree_pred), color='navy', scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
plt.title('Decision Tree Regressor Model')
plt.xlabel('True Value')
plt.ylabel('Predict Value')
plt.show()

In [None]:
# Random forest: RMSE on the held-out rows, then predicted vs. true values.
rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print('RMSE', rmse)

plt.figure(figsize=(12,7))
# x/y are passed by keyword (recent seaborn releases reject positional data
# arguments) and flattened to 1-D because y_test is a single-column DataFrame.
sns.regplot(x=np.ravel(y_test), y=np.ravel(rf_pred), color='navy', scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
plt.title('Random Forest Regression')
plt.xlabel('True Value')
plt.ylabel('Predict Value')
plt.show()

In [None]:
# Gradient boosting: RMSE on the held-out rows, then predicted vs. true values.
rmse = np.sqrt(mean_squared_error(y_test, gb_pred))
print('RMSE', rmse)

plt.figure(figsize=(12,7))
# x/y are passed by keyword (recent seaborn releases reject positional data
# arguments) and flattened to 1-D because y_test is a single-column DataFrame.
sns.regplot(x=np.ravel(y_test), y=np.ravel(gb_pred), color = 'green', scatter_kws={'alpha':0.3}, line_kws={'color': 'red'})
plt.title('Gradient Boosting Regression')
plt.xlabel('True Value')
plt.ylabel('Predict Value')
plt.show()

In [None]:
# Identify the feature importance: pair every feature name with the importance
# the gradient boosting model assigned to it, sorted from most to least
# important (column 0 = feature name, column 1 = importance).

model = gb_model
importance = [[name, score] for name, score in zip(X_columns, model.feature_importances_)]

features_imp = pd.DataFrame(importance).sort_values(by=1, ascending=False)
features_imp

In [None]:
# Keep only the features whose importance is above 0.0001.
features_imp_0001 = features_imp.loc[features_imp[1] > 0.0001]
features_imp_0001

# Prepare submission

In [None]:
from joblib import dump, load

In [None]:
# Persist the estimator currently bound to `model` with joblib so it can be
# reloaded later without retraining.
dump(model, '../../Model/gb_model2.pkl') 