# Flight Delays Christmas - Machine Learning Project

## Importing libraries and data

In [None]:
import pandas as pd
import math
import sklearn
import random
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme so the matplotlib plots below pick up its styling
sns.set()

In [None]:
# Load the raw flight records from flights.csv (path is relative to the working directory)
flights = pd.read_csv('flights.csv')
# Preview the first rows to confirm the file was parsed as expected
flights.head()

## Data preparation

In [None]:
# Display the rows at positions 32-42 (the iloc end bound 43 is exclusive).
# The same slice is displayed again after later steps to check the data cleaning.

flights.iloc [32:43]

In [None]:
# Count the missing values in each column of the raw data

flights.isnull().sum()

In [None]:
# Reduce the DataFrame to the columns that are used in the following steps

df = flights[["MONTH", "DAY", "DAY_OF_WEEK", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "SCHEDULED_DEPARTURE", "ARRIVAL_DELAY","CANCELLED"]]
# Re-count the missing values per column for the reduced frame
df.isnull().sum()

In [None]:
# Assign 10 to every missing ARRIVAL_DELAY, as missing values are taken to represent cancelled flights
# (NOTE(review): confirm that cancellation is the only cause of a missing arrival delay).
# 10 is the limit used below: every flight with a delay ≥ 10 is treated as late,
# so the filled rows end up in the "late" class.

df = df.fillna({"ARRIVAL_DELAY": 10})

In [None]:
# Derive the target column EFFECTIVE_DELAY as a string dummy:
# '1' when the arrival delay is ≥ 10 min, '0' when it is below 10 min

df.loc[df["ARRIVAL_DELAY"].ge(10), "EFFECTIVE_DELAY"] = "1"
df.loc[df["ARRIVAL_DELAY"].lt(10), "EFFECTIVE_DELAY"] = "0"

In [None]:
# Display positions 32-42 of df to check the filled delays and the new EFFECTIVE_DELAY column

df.iloc [32:43]

In [None]:
# Remove the columns that are not used as model features;
# ARRIVAL_DELAY is already encoded in EFFECTIVE_DELAY

df = df.drop(
    ["ARRIVAL_DELAY", "MONTH", "DAY", "DAY_OF_WEEK", "CANCELLED"],
    axis="columns",
)

In [None]:
# Reduce the dataframe to flights whose destination AND origin are both among
# the 10 busiest airports in the United States.
# isin() performs the membership test that was previously spelled out as ten
# == comparisons joined with | for each column.

busiest_airports = ["ATL", "ORD", "LAX", "DFW", "JFK", "DEN", "SFO", "LAS", "PHX", "IAH"]

df2 = df[df["DESTINATION_AIRPORT"].isin(busiest_airports)]
# .copy() makes df3 an independent frame, so the later in-place edits of df3
# do not raise SettingWithCopyWarning for writing into a slice of df2
df3 = df2[df2["ORIGIN_AIRPORT"].isin(busiest_airports)].copy()

In [None]:
# Quantization of departure times to 24 values (hours) to improve accuracy in machine learning.
# Assumes SCHEDULED_DEPARTURE holds non-negative HHMM-style integers (e.g. 1745 -> 17) — TODO confirm.
# Floor division handles the whole column in one vectorised step instead of
# writing every row separately through iterrows()/.loc, and assign() returns a
# new frame rather than mutating df3 while it is being iterated.

df3 = df3.assign(SCHEDULED_DEPARTURE=df3["SCHEDULED_DEPARTURE"] // 100)
    

In [None]:
# Create dummy variables for the airports: each airport code becomes its own
# ORIGIN_AIRPORT_<code> / DESTINATION_AIRPORT_<code> indicator column

df3 = pd.get_dummies (df3, columns= ["ORIGIN_AIRPORT", "DESTINATION_AIRPORT"])

In [None]:
# Display positions 32-42 of the filtered, encoded frame to check the hour values and dummy columns

df3.iloc [32:43]

## Build machine learning model

In [None]:
# Split the dataframe 70%/30% into training and test data.
# Features are all columns except EFFECTIVE_DELAY, which is the target;
# the fixed random_state makes the split reproducible.

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(
    df3.drop(columns='EFFECTIVE_DELAY'),
    df3['EFFECTIVE_DELAY'],
    random_state=10,
    test_size=0.3,
)

In [None]:
# Shape (rows, columns) of the DataFrame containing the feature columns used for training

train_x.shape

In [None]:
# Shape (rows, columns) of the DataFrame containing the feature columns used for testing

test_x.shape

In [None]:
# Create a RandomForestClassifier object and train it on the training split

from sklearn.ensemble import RandomForestClassifier

# Fixed random_state so repeated runs build the same forest
model = RandomForestClassifier(random_state=10)
model.fit(train_x, train_y)

## Accuracy Measurements

In [None]:
# Measure the mean accuracy of the model on the test split

# Class predictions for the test rows; reused by the confusion matrix cell
predicted = model.predict(test_x)
model.score(test_x,test_y)

In [None]:
# Measure the quality of the classification model with the ROC AUC score on the test split

from sklearn.metrics import roc_auc_score
# One probability column per class; reused by the ROC curve cell
probabilities = model.predict_proba(test_x)
# Column 1 is presumably the probability of label '1' (columns follow model.classes_) — verify
roc_auc_score(test_y, probabilities [:,1])

In [None]:
# Generate an error (confusion) matrix for the test split to count correctly and incorrectly classified flights

from sklearn.metrics import confusion_matrix
confusion_matrix(test_y, predicted)

In [None]:
# Measure the precision of the model for the "late" label ('1') on the held-out test split.
# Scoring the training rows instead would rate the forest on data it has already
# seen and overstate its precision; the other metrics in this section use the test split too.

from sklearn.metrics import precision_score
# Training-set predictions are still computed so that later cells reading train_predictions keep working
train_predictions = model.predict(train_x)
precision_score(test_y, predicted, average="binary", pos_label="1")

In [None]:
# Recall score for the "late" label ('1'): the share of truly late test flights the classifier finds.
# Measured on the held-out test split (test_y against the test predictions), because
# recall on the training rows would overstate how well the model generalises.

from sklearn.metrics import recall_score
recall_score(test_y, predicted, average="binary", pos_label="1")

## Visualize model output

In [None]:
# Render the ROC curve for the test split to visualize the quality of the classification model

from sklearn.metrics import roc_curve

# Thresholds (third return value) are not needed for the plot
fpr, tpr, _ = roc_curve(test_y, probabilities[:, 1], pos_label='1')
plt.plot(fpr, tpr)
# Dashed diagonal as a visual reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='black', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

## Predict probability of on-time-arrival 

In [None]:
#Function to predict the likelihood of a flight arriving on time 

def predict_delay(scheduled_departure, origin_airport, destination_airport):
    """Return the predicted probability that a flight arrives on time.

    Args:
        scheduled_departure: Departure time as an "H:MM"/"HH:MM" string
            (24-hour clock); only the hour is used as a feature.
        origin_airport: Airport code of the origin, e.g. "ATL"
            (case-insensitive).
        destination_airport: Airport code of the destination
            (case-insensitive).

    Returns:
        The first column of ``model.predict_proba`` for the flight, i.e. the
        probability of the first label in ``model.classes_`` (the '0' /
        "not late" label of the training data), or an error message string
        when ``scheduled_departure`` cannot be parsed.
    """
    from datetime import datetime

    try:
        scheduled_departure_parsed = datetime.strptime(scheduled_departure, "%H:%M")
    except ValueError as e:
        return 'Error parsing date/time - {}'.format(e)

    # Start from exactly the columns the model was trained on, in training
    # order and all set to 0. Hand-written keys previously differed from the
    # get_dummies names ("DESTINATION_AIRPORT__ORD") and from the column order,
    # so the model received wrong or misplaced features.
    features = {column: 0 for column in train_x.columns}
    features['SCHEDULED_DEPARTURE'] = scheduled_departure_parsed.hour

    for prefix, airport in (('ORIGIN_AIRPORT', origin_airport),
                            ('DESTINATION_AIRPORT', destination_airport)):
        column = '{}_{}'.format(prefix, str(airport).strip().upper())
        # Airports the model was not trained on keep an all-zero encoding
        if column in features:
            features[column] = 1

    return model.predict_proba(pd.DataFrame([features]))[0][0]

## Conversation bot for user input 

In [None]:
# Lists for conversation bot

# Lower-case words that count as a greeting in the user's input
greetings = ["hello", "hi", "hey", "greetings", "ciao", "salut", "hallo"]
# Openers the bot chooses from at random when it answers a greeting
greeting_responses = ["\nHello,", "\nHi,", "\nHey,", "\nGreetings,", "\nNice to see you here,"]
# Airport codes the bot accepts as departure and arrival airports
airport_list = ["ATL", "ORD", "LAX", "DFW", "JFK", "DEN", "SFO", "LAS", "PHX", "IAH"]
# Variants of the question that offers to show the available airports
AIRPORT_QUESTIONS = ["do you want to see the available airports?", "do you want to take a look at the available airports?", "do you want to see our list of available airports?"]

In [None]:
#Function to return input

def ask_input(txt):
  """Prompt the user with txt and return the reply as a list of whitespace-separated words."""
  reply = input(txt)
  words = reply.split()
  return words

In [None]:
#Function for Yes or No 

def yes_or_no():
  """Ask the user for a yes/no answer and return it as a boolean.

  Keeps prompting until the reply is "yes" or "no"; the comparison ignores
  case and surrounding whitespace.

  Returns:
    True for "yes", False for "no".
  """
  answer_yes_or_no = input("Please answer by yes or no: ")
  while True:
    # Normalise once so " Yes " and "NO" are accepted as well
    normalized = answer_yes_or_no.strip().lower()
    if normalized == "yes":
      return True
    if normalized == "no":
      return False
    answer_yes_or_no = input("Oops, something went wrong here. Please only answer by yes or no! You may try again now: ")

In [None]:
#Function for bot greeting and call airport_possibilities 

def random_greeting(txt):
    """Wait for the user to greet the bot, answer, and start the airport dialogue.

    Prompts with ``txt`` and keeps asking until at least one word of the reply
    is a known greeting (case-insensitive, surrounding punctuation ignored).
    It then prints a random welcome message and calls
    ``airport_possibilities()``.

    Args:
        txt: Prompt shown for the first attempt.
    """
    prompt = txt
    # Loop rather than recurse, so repeated invalid replies cannot exhaust the call stack
    while True:
        sentence = ask_input(prompt)
        # Strip punctuation so that "Hello!" or "hi," still counts as a greeting
        if any(word.strip(".,!?;:").lower() in greetings for word in sentence):
            break
        print("\nOops, our bot doesn't work unless greeted")
        prompt = "To find out which plane to take to be home on time for Christmas, please try again with a greeting: "

    greeting_answer = random.choice(greeting_responses)
    greeting_answer += " welcome to our flight delay calculator. Thanks to us you'll be home on time for Christmas! "
    print(greeting_answer)
    airport_possibilities()

In [None]:
#Function to display airport_list and call airport_departure

def airport_possibilities():
  """Offer to list the available airports, then ask for the departure airport.

  Prints the airport list only when the user answers yes; in both cases the
  dialogue continues with airport_departure().
  """
  print("\nBefore we can calculate your delay, " + random.choice(AIRPORT_QUESTIONS))
  wants_list = yes_or_no()
  if not wants_list:
    print("\nOkay")
  else:
    print("\nThe available airports are: ", ", ".join(airport_list))
  # Both answers lead to the same next question
  airport_departure("Please enter the airport you are departing from : ")

In [None]:
#Function to check whether airport_departure is in list, store input as global variable and call airport_arrival  

def airport_departure(chosen_airport_departure):
    """Ask for the departure airport, store it, and continue with the arrival airport.

    Keeps prompting until the reply contains a code from ``airport_list``
    (case-insensitive, surrounding punctuation ignored). The first matching
    code is stored upper-cased in the global ``str_chosen_airport_departure``,
    then ``airport_arrival`` is called.

    Args:
        chosen_airport_departure: Prompt shown for the first attempt.
    """
    global str_chosen_airport_departure
    prompt = chosen_airport_departure
    # Loop rather than recurse, so repeated invalid replies cannot exhaust the call stack
    while True:
        candidates = [word.strip(".,!?;:").upper() for word in ask_input(prompt)]
        matches = [code for code in candidates if code in airport_list]
        if matches:
            break
        print("\nSorry, this airport is not available for our machine learning. Make sure you write the name correctly!")
        prompt = "Try to enter the right name this time: "

    # Store only the recognised code, not the whole sentence: a reply such as
    # "from atl" would otherwise be handed on as the airport name
    str_chosen_airport_departure = matches[0]
    airport_arrival("\nPlease enter the airport you are going to :")

In [None]:
#Function to check whether airport_arrival is in list and store input as global variable

def airport_arrival(chosen_airport_arrival):
    """Ask for the arrival airport and store it.

    Keeps prompting until the reply contains a code from ``airport_list``
    (case-insensitive, surrounding punctuation ignored). The first matching
    code is stored upper-cased in the global ``str_chosen_airport_arrival``.

    Args:
        chosen_airport_arrival: Prompt shown for the first attempt.
    """
    global str_chosen_airport_arrival
    prompt = chosen_airport_arrival
    # Loop rather than recurse, so repeated invalid replies cannot exhaust the call stack
    while True:
        candidates = [word.strip(".,!?;:").upper() for word in ask_input(prompt)]
        matches = [code for code in candidates if code in airport_list]
        if matches:
            break
        print("\nSorry, this airport is not available for our machine learning. Make sure you write the name correctly!")
        prompt = "Try to enter the right name this time: "

    # Store only the recognised code, not the whole sentence: a reply such as
    # "to lax" would otherwise be handed on as the airport name
    str_chosen_airport_arrival = matches[0]

## Plot on-time-arrival for user input

In [None]:
# Start the conversation bot, then plot the predicted on-time probability for every hour of the day

random_greeting("Welcome to our chatbot. Please greet him: ")

# Upper-case once so the message and the model input use the same airport codes;
# passing the raw input meant a lower-case reply matched no airport column
departure_code = str_chosen_airport_departure.upper()
arrival_code = str_chosen_airport_arrival.upper()

print("\nGreat, you have chosen to go from %s to %s. Since Christmas is a busy time at the airport we predicted the probability of on-time-arrival for your trip for every hour. Choose wisely and have a safe trip. Merry Christmas!" %(departure_code, arrival_code))

# One label and one prediction per full hour. Deriving both from the same
# sequence keeps bar and label in step; the hand-written list predicted
# "22:00" twice and never "23:00".
labels = tuple('{}:00'.format(hour) for hour in range(24))
values = tuple(predict_delay(label, departure_code, arrival_code) for label in labels)
alabels = np.arange(len(labels))

f, ax = plt.subplots(figsize=(18,5))
plt.bar(alabels, values, align='center', alpha=0.5)
plt.xticks(alabels, labels)
plt.ylabel('Probability of On-Time Arrival')
# Probabilities lie in [0, 1]; a fixed axis keeps plots for different routes comparable
plt.ylim((0.0, 1.0))