In [2]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, 
    precision_score, recall_score, 
    roc_curve, auc
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import resample
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
def label_delay(delay):
    """Map an arrival delay in minutes to a categorical delay label.

    Parameters
    ----------
    delay : float or int or None
        Arrival delay in minutes. May be missing (None / NaN / pd.NA).

    Returns
    -------
    str or None
        'on-time'   if delay <= 15,
        'late'      if 15 < delay <= 45,
        'very-late' if delay > 45,
        None        if the delay is missing.
    """
    # A missing delay must not be compared: NaN fails every `<=` test and would
    # otherwise fall through to the final branch and be labelled 'very-late'.
    # Returning None marks the label as missing so a later dropna() removes it.
    if pd.isna(delay):
        return None
    if delay <= 15:
        return 'on-time'
    elif delay <= 45:
        return 'late'
    else:
        return 'very-late'

In [4]:
# Load the dataset into a pandas DataFrame.
# Only the first 1,000,000 rows of the CSV are read (nrows), not a random sample.
df = pd.read_csv(
    'Combined_Flights_2022.csv',
    nrows=1_000_000,
)

In [5]:
# Add the target column: label_delay is called once per value of
# 'ArrDelayMinutes' (element-wise on the column, not on whole rows).
df['DelayLabel'] = df['ArrDelayMinutes'].map(label_delay)

In [6]:
# Columns kept for training: the feature columns plus the target 'DelayLabel'.
# NOTE(review): 'DepTime' and 'ArrTime' look like actual (not scheduled) times;
# if so they would not be known before the flight -- confirm they are intended
# as features.
relevant_columns = [
    'Airline',
    'Origin',
    'Dest',
    'DepTime',
    'ArrTime',
    'DelayLabel',
    'Distance',
    'DayOfWeek',
    'DayofMonth',
    'Quarter',
]

# Discard every column that is not in the list above.
df = df[relevant_columns]

In [7]:
# Remove every row that has at least one missing value and report how many
# rows were dropped.
rows_before = len(df)
# Rebind instead of inplace=True: df was produced by a column selection, and
# mutating such a derived frame in place is discouraged in pandas. Rebinding
# yields the same rows.
df = df.dropna()
rows_after = len(df)
rows_removed = rows_before - rows_after
print(f"Fjernet {rows_removed} rækker.")

Fjernet 31742 rækker.


In [8]:
# One-hot encode the categorical columns. The indicator columns are stored as
# sparse integer columns (sparse=True, dtype=int).
df = pd.get_dummies(
    data=df,
    columns=['Airline', 'Origin', 'Dest'],
    sparse=True,
    dtype=int,
)

In [9]:
# Standardize the numeric feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split further down, so test-set statistics influence the scaling.
# Consider fitting on the training split only.
columns_to_normalize = ["DepTime", "ArrTime", 'Distance']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[columns_to_normalize])
df[columns_to_normalize] = scaled_values

In [10]:
# Split off the target: pop() removes 'DelayLabel' from df and returns it,
# leaving df with the feature columns only.
label = df.pop(item="DelayLabel")

In [11]:
# 80/20 train/test split of features and labels. Stratifying on the label keeps
# the class proportions the same in both parts; random_state fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(
    df,
    label,
    test_size=0.20,
    stratify=label,
    random_state=1,
)

In [12]:
# Show the number of training samples per class (before any resampling).
train_y.value_counts()

DelayLabel
on-time      615796
late          85806
very-late     73004
Name: count, dtype: int64

In [13]:
# Oversample the minority classes of the TRAINING split with SMOTE so that all
# classes end up equally represented; the test split is left untouched.
# NOTE(review): the features include one-hot indicator columns; SMOTE
# interpolates between samples, so synthetic rows may get non-binary values
# there -- confirm this is acceptable.
smote = SMOTE(random_state=1)
balanced_train_x, balanced_train_y = smote.fit_resample(X=train_x, y=train_y)



KeyboardInterrupt: 

In [None]:
# Show the number of training samples per class after SMOTE resampling.
balanced_train_y.value_counts()

DelayLabel
on-time      615796
very-late    615796
late         615796
Name: count, dtype: int64

In [None]:
# Define and fit the model on the balanced training data.
# random_state=1 makes the forest reproducible, matching the seed used for the
# split and for SMOTE; n_jobs=-1 builds the trees on all available cores.
# The variable is called `dtc` although it holds a random forest; the name is
# kept because later cells refer to it.
dtc = RandomForestClassifier(random_state=1, n_jobs=-1)
# fit() returns the estimator itself, so `model` and `dtc` are the same object.
model = dtc.fit(balanced_train_x, balanced_train_y)




In [None]:
# 10-fold cross-validation with SMOTE applied INSIDE each fold.
# Cross-validating on data that was already oversampled lets synthetic points
# derived from a validation fold's samples end up in the training folds, which
# inflates the score. Here the pipeline resamples only the training part of each
# fold and scores on untouched, original-distribution validation data.
# cross_val_score clones the pipeline (and thereby `dtc`) for every fold, so the
# fitted `dtc` is not modified.
cv_pipeline = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=1)),
    ("clf", dtc),
])
cross_val_scores = cross_val_score(cv_pipeline, train_x, train_y, cv=10, scoring="f1_weighted")
print(f"Gennemsnitlig F1-score fra krydsvalidering: {np.mean(cross_val_scores)}")

In [None]:
# Predict a delay label for every row of the held-out test set.
predicted_values = dtc.predict(X=test_x)

In [None]:
# Compare the predictions with the true test labels.
# Precision and recall are averaged over the classes, weighted by class support.
acc = accuracy_score(y_true=test_y, y_pred=predicted_values)
conf_matrix = confusion_matrix(y_true=test_y, y_pred=predicted_values)
prec = precision_score(
    y_true=test_y,
    y_pred=predicted_values,
    average='weighted',
)
rec = recall_score(
    y_true=test_y,
    y_pred=predicted_values,
    average='weighted',
)

In [None]:
# Class labels as stored on the fitted model; printed above the confusion
# matrix as a legend for its rows and columns.
classes = model.classes_

# Headline metrics on the test set.
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")

# Confusion matrix together with the label legend.
print("Confusion Matrix:")
print(f"Labels: {classes}")
print(conf_matrix)

In [None]:
# Persist the fitted classifier to disk.
# NOTE(review): only the classifier is saved; the fitted scaler and the
# one-hot column layout are not, so new raw data cannot be preprocessed the
# same way from this file alone -- confirm whether that is intended.
joblib.dump(value=dtc, filename='min_model.joblib')
