# Flight Delays Machine Learning Project

## Importing libraries and data

In [None]:
import pandas as pd
import math
import sklearn
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [None]:
#Load the raw flight records from the CSV file and preview the first five rows
flights = pd.read_csv("flights.csv")
flights.head(n=5)

## Data preparation

In [None]:
#Rows at positions 32-42 (the slice end 43 is exclusive) serve as a fixed sample
#that is re-inspected after the cleaning steps below

flights.iloc[32:43]

In [None]:
#Count the missing values in every column of the raw data

flights.isna().sum()

In [None]:
#Keep only the columns that are relevant for the model, then count what is still missing

df = flights[[
    "MONTH",
    "DAY",
    "DAY_OF_WEEK",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
    "SCHEDULED_DEPARTURE",
    "ARRIVAL_DELAY",
    "CANCELLED",
]]
df.isna().sum()

In [None]:
#Assign 10 to every missing ARRIVAL_DELAY, as missing values are taken to represent cancelled flights
#10 is also the lateness limit: every flight with a delay ≥ 10 is treated as late,
#so the filled rows end up in the "late" class

df = df.fillna(value={"ARRIVAL_DELAY": 10})

In [None]:
#Add the target column "EFFECTIVE_DELAY": '1' when the arrival delay is ≥ 10 min, '0' otherwise
#(the labels are strings, not numbers)

late_mask = df["ARRIVAL_DELAY"] >= 10
on_time_mask = df["ARRIVAL_DELAY"] < 10
df.loc[late_mask, "EFFECTIVE_DELAY"] = "1"
df.loc[on_time_mask, "EFFECTIVE_DELAY"] = "0"

In [None]:
#Re-inspect the sample rows to check that the fill value and the new label column look right

df.iloc[32:43]

In [None]:
#Reduce the DataFrame to Christmas Day (December 25th)

is_christmas = (df["MONTH"] == 12) & (df["DAY"] == 25)
df = df[is_christmas]

In [None]:
#Drop the columns that are redundant now that the date filter and the label column exist

redundant_columns = ["ARRIVAL_DELAY", "MONTH", "DAY", "DAY_OF_WEEK", "CANCELLED"]
df = df.drop(redundant_columns, axis="columns")

In [None]:
#Reduce the dataframe to flights whose destination AND origin are among the 10 selected airports

selected_airports = ["ATL", "ORD", "LAX", "DFW", "JFK", "DEN", "SFO", "LAS", "PHX", "IAH"]

df2 = df[df["DESTINATION_AIRPORT"].isin(selected_airports)]
#.copy() makes df3 an independent frame: it is modified in the following steps, and writing
#into a filtered slice of df2 would raise pandas' SettingWithCopyWarning
df3 = df2[df2["ORIGIN_AIRPORT"].isin(selected_airports)].copy()

In [None]:
#Quantization of the departure time to one of 24 values (the hour) to improve accuracy in machine learning
#SCHEDULED_DEPARTURE is divided by 100 and floored, keeping only the leading (hour) digits.
#The division is vectorized over the whole column instead of looping row by row, and
#assign() returns a new frame, so nothing is written into a slice of another DataFrame.

df3 = df3.assign(SCHEDULED_DEPARTURE=df3["SCHEDULED_DEPARTURE"] // 100)
    

In [None]:
#One-hot encode the two airport columns (one indicator column per airport code)

airport_columns = ["ORIGIN_AIRPORT", "DESTINATION_AIRPORT"]
df3 = pd.get_dummies(data=df3, columns=airport_columns)

In [None]:
#Inspect rows 32-42 of the encoded frame to check the hour values and the dummy columns

df3.iloc[32:43]

## Build machine learning model

In [None]:
#Split the dataframe into 70% training and 30% test data (fixed seed for reproducibility)

from sklearn.model_selection import train_test_split

features = df3.drop(columns='EFFECTIVE_DELAY')
target = df3['EFFECTIVE_DELAY']
train_x, test_x, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=10,
)

In [None]:
#DataFrame containing the feature columns used for training
#.shape shows (number of rows, number of columns) of the training features

train_x.shape

In [None]:
#DataFrame containing the feature columns used for testing
#.shape shows (number of rows, number of columns) of the test features

test_x.shape

In [None]:
#Build a random forest classifier with a fixed seed and fit it on the training split

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=10)
model.fit(X=train_x, y=train_y)

## Accuracy Measurements

In [None]:
#Predict the labels of the test set (reused by later cells) and report the mean accuracy on it

predicted = model.predict(test_x)
model.score(X=test_x, y=test_y)

In [None]:
#Measure the quality of the classification model with the ROC AUC score on the test set

from sklearn.metrics import roc_auc_score

probabilities = model.predict_proba(test_x)
#second column of predict_proba is used as the score of the '1' (delayed) class
roc_auc_score(y_true=test_y, y_score=probabilities[:, 1])

In [None]:
#Generate the confusion (error) matrix: counts of correctly and incorrectly classified test flights

from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=test_y, y_pred=predicted)

In [None]:
#Measure the precision of the model on the held-out test set
#(scoring the data the model was fitted on would overstate how well it generalizes)

from sklearn.metrics import precision_score
train_predictions = model.predict(train_x)  #still defined so cells that reference train_predictions keep working
precision_score(test_y, model.predict(test_x), average="binary", pos_label="1")

In [None]:
#Recall score to measure the ability of the classifier to find all the positive ('1' = delayed) samples
#Evaluated on the held-out test set rather than on the data the model was fitted on

from sklearn.metrics import recall_score
recall_score(test_y, model.predict(test_x), average="binary", pos_label="1")

## Visualize model output

In [None]:
#Draw the ROC curve of the classifier together with the diagonal of a random guess

from sklearn.metrics import roc_curve

fpr, tpr, _ = roc_curve(test_y, probabilities[:, 1], pos_label='1')

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
#dashed diagonal = reference line where true and false positive rate are equal
ax.plot([0, 1], [0, 1], color='black', lw=1, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')

In [None]:
#Function to predict the likelihood of a flight arriving on time 

def predict_delay(scheduled_departure, origin_airport, destination_airport):
    """Return the model's probability that a flight arrives on time.

    Parameters:
        scheduled_departure: departure time as a string in "%H:%M" format,
            e.g. "12:45". Only the hour is used as a feature.
        origin_airport: airport code of the departure airport, e.g. "LAX".
        destination_airport: airport code of the arrival airport.

    Returns:
        The first entry of model.predict_proba for the flight, i.e. the
        probability of the class listed first in model.classes_ (label '0',
        "not delayed", for the '0'/'1' labels used in training).
        If scheduled_departure cannot be parsed, an error message string
        is returned instead.

    Airport codes are matched case-insensitively and ignoring surrounding
    whitespace. An airport that has no dummy column in the training data
    leaves all of its indicator columns at 0.
    """
    from datetime import datetime

    try:
        scheduled_departure_parsed = datetime.strptime(scheduled_departure, "%H:%M")
    except ValueError as e:
        return 'Error parsing date/time - {}'.format(e)

    origin = str(origin_airport).strip().upper()
    destination = str(destination_airport).strip().upper()

    features = {
        'SCHEDULED_DEPARTURE': scheduled_departure_parsed.hour,
        'ORIGIN_AIRPORT_{}'.format(origin): 1,
        'DESTINATION_AIRPORT_{}'.format(destination): 1,
    }

    # Align the single row with the exact column names AND column order the
    # model was trained on: missing indicator columns are filled with 0 and
    # names unknown to the model are dropped. Hand-typing the column list is
    # error-prone (a misspelt key never matches a training column).
    row = pd.DataFrame([features]).reindex(columns=train_x.columns, fill_value=0)

    return model.predict_proba(row)[0][0]

In [None]:
#Example: on-time probability for a 12:45 flight from Los Angeles (LAX) to San Francisco (SFO)
predict_delay(
    scheduled_departure="12:45",
    origin_airport="LAX",
    destination_airport="SFO",
)

In [None]:
import random #importing random to use in randon selection of list element

#GENERAL FUNCTION DEFINITIONS

def ask_input(txt):
    """Show the prompt txt, read one line from the user and return its whitespace-separated words as a list."""
    reply = input(txt)
    words = reply.split()
    return words

def yesorno():
    """Ask the user for a yes/no answer and keep asking until one is given.

    The answer is compared case-insensitively and ignoring surrounding
    whitespace, so "Yes", " yes " and "YES" are all accepted.

    Returns:
        True for "yes", False for "no".
    """
    answer = input("Answer by yes or no: ")
    while True:
        normalized = answer.strip().lower()
        if normalized == "yes":
            return True
        if normalized == "no":
            return False
        answer = input("I didn't understand. Please only answer by yes or no! Try again: ")

#BEGINNING OF THE MAIN TRUNK OF THE CONVERSATION

#STAGE 1: checks if a salutation word is in the user input. If condition passed, greets back and calls airport_list()
def random_salutation(txt):
    """Ask for a greeting until the user's input contains one, then start the dialogue.

    Parameters:
        txt: prompt shown for the first attempt; every further attempt uses
            the "Try again" prompt.

    A word counts as a greeting when, lower-cased and with surrounding
    punctuation (.,!?) removed, it is in GREETINGS_KEYWORDS. On success a
    random response from GREETINGS_RESPONSES is printed and airport_list()
    is called.
    """
    # Loop instead of recursing so that repeated failed attempts do not
    # pile up nested calls.
    while True:
        sentence = ask_input(txt)
        if any(word.lower().strip(".,!?") in GREETINGS_KEYWORDS for word in sentence):
            print(random.choice(GREETINGS_RESPONSES) + " welcome to our flight delay calculator! ")
            airport_list()
            return
        print("Are you not going to greet our bot?")
        txt = "Try again with salutations: "
        
#STAGE 2: asks the user if they want to see the airport list before calculating. YES --> shows the list; NO --> prints "Okay". Either way airport_departure() is called next
def airport_list():
    """Offer to show the available airports, then continue with the departure question."""
    print("Before we can calculate your delay, " + random.choice(AIRPORT_QUESTIONS))
    wants_list = yesorno()
    if wants_list:
        print("The available airports are: ", ", ".join(AIRPORT_LIST))
    else:
        print("Okay")
    # both answers lead to the same next step
    airport_departure("Please enter the airport you are departing from : ")
    
    
#STAGE 3: asks for the departure airport. If a word of the input is in AIRPORT_LIST --> calls airport_arrival() and stores the matched code in strX; otherwise the user is asked again
def airport_departure(txt1):
    """Ask for the departure airport until a known airport code is entered.

    Parameters:
        txt1: prompt shown for the first attempt; every further attempt uses
            the "Try to enter the right name" prompt.

    Side effects:
        Calls airport_arrival() once a valid code was entered, then sets the
        global strX to the matched airport code in upper case (only the
        code, not the whole input line, so it can be used directly as an
        airport argument for predict_delay).
    """
    global strX
    # Loop instead of recursing so that repeated typos do not pile up nested calls.
    while True:
        words = ask_input(txt1)
        # first word of the input that is a known airport code, or None
        match = next((w.upper() for w in words if w.upper() in AIRPORT_LIST), None)
        if match is not None:
            break
        print("Sorry, this airport is not available for our machine learning. Make sure you wrote the name correctly!")
        txt1 = "Try to enter the right name this time: "

    airport_arrival("Please enter the airport you are going to :")
    # Assigned after the arrival step returns, so this call's match is the
    # final value of strX.
    strX = match
    
        
def airport_arrival(txt):
    """Ask for the arrival airport until a known airport code is entered.

    Parameters:
        txt: prompt shown for the first attempt; every further attempt uses
            the "Try to enter the right name" prompt.

    Side effects:
        Sets the global strS to the matched airport code in upper case
        (only the code, not the whole input line).
    """
    global strS
    # An invalid entry re-asks for the ARRIVAL airport here; it must not
    # restart the departure question.
    while True:
        words = ask_input(txt)
        # first word of the input that is a known airport code, or None
        match = next((w.upper() for w in words if w.upper() in AIRPORT_LIST), None)
        if match is not None:
            strS = match
            return
        print("Sorry, this airport is not available for our machine learning. Make sure you wrote the name correctly!")
        txt = "Try to enter the right name this time: "

        
#Words that count as a greeting in the user's input (compared in lower case)
GREETINGS_KEYWORDS = [
    "hello",
    "hi",
    "hey",
    "greetings",
    "ciao",
    "salut",
    "hallo",
]
#Possible openings of the bot's reply to a greeting
GREETINGS_RESPONSES = [
    "Hello,",
    "Hi,",
    "Hey,",
    "Greetings,",
    "Nice to see you here,",
]
#Airport codes the chatbot accepts
AIRPORT_LIST = [
    "ATL",
    "ORD",
    "LAX",
    "DFW",
    "JFK",
    "DEN",
    "SFO",
    "LAS",
    "PHX",
    "IAH",
]
#Variants of the question whether the airport list should be shown
AIRPORT_QUESTIONS = [
    "do you want to see the airports?",
    "do you want to take a look the airports?",
    "do you want to see our list of airports?",
]

#Start the conversation with the STAGE 1 function; the dialogue sets the globals strX and strS
random_salutation("Welcome to our chatbot. Please greet him: ")

#Predict for the airports collected in the dialogue (the departure time is fixed to 12:45)
predict_delay("12:45", strX, strS)