In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, 
    precision_score, recall_score, 
    roc_curve, auc
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [18]:
#Denne funktion bestemmer hvilket label "DelayLabel" ender på basseret på forsinkelsen 
def label_delay(delay):
    if delay <= 15:
        return 'on-time'
    elif delay <= 120:
        return 'late'
    else:
        return 'very-late'

In [19]:
#Henter vores datasæt og laver det til pandas dataframe
df = pd.read_csv('Combined_Flights_2022.csv', nrows = 90000)

In [20]:
#DelayLabel bliver tilføjet og apply bruger funktionen label_delay på hele rækken
df['DelayLabel'] = df['ArrDelayMinutes'].apply(label_delay)

In [21]:
#Definere de kolonner vi gerne vil træne på
relevant_columns = ['Airline', 'Origin', 'Dest', 
                    'DepTime', 'ArrTime', 'DelayLabel', 
                    'Distance', 'DayOfWeek', 'DayofMonth', 'Quarter']

#Beholder kun de data vi vil træne på
df = df[relevant_columns]

In [22]:
#One-hot encoder vores koloner
df = pd.get_dummies(df, columns=['Airline', 'Origin', 'Dest'], dtype=int)

In [23]:
#skalere vores koloner
scaler = StandardScaler()
columns_to_normalize = ["DepTime", "ArrTime", 'Distance']
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

In [24]:
# fjerner alle rækker med tomme felter
rows_before = len(df)
df.dropna(inplace=True)
rows_after = len(df)
rows_removed = rows_before - rows_after
print(f"Fjernet {rows_removed} rækker.")

Fjernet 1899 rækker.


In [25]:
#fjerne DelayLabel fra df og gemmer dem som label
label = df.pop("DelayLabel")

In [26]:
#Laver et 80/20 split på vores data og labels
train_x, test_x, train_y, test_y = train_test_split(df, label, stratify=label, test_size=0.20, random_state=1)

In [40]:
#definere og fitter vores model
dtc = RandomForestClassifier()
model = dtc.fit(train_x, train_y)
cross_val = cross_val_score(dtc, train_x, train_y, cv=10, scoring="f1_micro")
cross_val

array([0.8084563 , 0.81171964, 0.81796254, 0.80434166, 0.79866629,
       0.80448354, 0.81157775, 0.80590238, 0.81044268, 0.81171964])

In [28]:
#forudsiger labels på vores test sæt
predicted_values = dtc.predict(test_x)

In [29]:
#sammenligner vores test resultater og vores predicted_values
acc = accuracy_score(test_y, predicted_values)
conf_matrix = confusion_matrix(test_y, predicted_values)
prec = precision_score(test_y, predicted_values, average='weighted')
rec = recall_score(test_y, predicted_values, average='weighted')

In [30]:
# Forudsige sandsynligheder for test-sættet
predicted_probabilities = dtc.predict_proba(test_x)

# Udskrive de forudsagte sandsynligheder for de første 5 test-eksempler
print(predicted_probabilities[:5])

[[0.07 0.93 0.  ]
 [0.09 0.9  0.01]
 [0.02 0.98 0.  ]
 [0.18 0.82 0.  ]
 [0.01 0.99 0.  ]]


In [31]:
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")

classes = model.classes_
print("Confusion Matrix:")
print(f"Labels: {classes}")
print(conf_matrix)

Accuracy: 0.8156177288462629
Precision: 0.80456083903111
Recall: 0.8156177288462629
Confusion Matrix:
Labels: ['late' 'on-time' 'very-late']
[[ 2306  2218    20]
 [  626 12004     6]
 [  237   142    62]]
