In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import matplotlib.pyplot as plt
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, 
    precision_score, recall_score, 
    roc_curve, auc
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import warnings
from imblearn.over_sampling import SMOTE
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
def label_delay(delay):
    """Map an arrival delay in minutes to its ``DelayLabel`` class.

    Parameters
    ----------
    delay : number or missing value
        Arrival delay in minutes.

    Returns
    -------
    str or None
        ``'on-time'`` for delays up to and including 15 minutes,
        ``'late'`` for delays up to and including 45 minutes,
        ``'very-late'`` for anything longer, and ``None`` when the delay is
        missing (``NaN``/``None``/``pd.NA``).
    """
    # Every comparison with NaN is False, so without this guard a missing
    # delay would fall through to the final branch and be labelled
    # 'very-late'. Returning None keeps the label missing, so a later
    # dropna() discards the row instead of training on a made-up class.
    if pd.isna(delay):
        return None
    if delay <= 15:
        return 'on-time'
    elif delay <= 45:
        return 'late'
    else:
        return 'very-late'

In [3]:
# Load the first million rows of the flight data set into a pandas DataFrame
df = pd.read_csv(
    'Combined_Flights_2022.csv',
    nrows=1_000_000,
)

In [4]:
# Derive the DelayLabel column by running label_delay on every arrival delay
df['DelayLabel'] = df['ArrDelayMinutes'].map(label_delay)

In [5]:
# Columns kept for training (the target column DelayLabel is included)
relevant_columns = [
    'Airline', 'Origin', 'Dest',
    'DepTime', 'ArrTime', 'DelayLabel',
    'Distance', 'DayOfWeek', 'DayofMonth', 'Quarter',
]

# Keep only those columns; everything else in the frame is discarded
df = df.loc[:, relevant_columns]

In [6]:
# Remove every row that has at least one missing value and report how many were dropped
rows_before = df.shape[0]
df = df.dropna(axis=0, how='any')
rows_after = df.shape[0]
rows_removed = rows_before - rows_after
print(f"Fjernet {rows_removed} rækker.")

Fjernet 31742 rækker.


In [7]:
# One-hot encode the categorical columns into sparse integer indicator columns
categorical_columns = ['Airline', 'Origin', 'Dest']
df = pd.get_dummies(
    df,
    columns=categorical_columns,
    dtype=int,
    sparse=True,
)

In [8]:
# Scale the continuous columns with StandardScaler
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split further down, so test rows contribute to the scaling
# statistics. Consider fitting on the training split only.
scaler = StandardScaler()
columns_to_normalize = ["DepTime", "ArrTime", 'Distance']
scaled_values = scaler.fit_transform(df[columns_to_normalize])
df[columns_to_normalize] = scaled_values

In [9]:
# Detach the target column from the feature frame and keep it as `label`
label = df["DelayLabel"]
del df["DelayLabel"]

In [10]:
# 80/20 train/test split of features and labels, stratified on the label
train_x, test_x, train_y, test_y = train_test_split(
    df,
    label,
    test_size=0.20,
    stratify=label,
    random_state=1,
)

In [11]:
# Class distribution of the training labels (before any resampling)
train_y.value_counts()

DelayLabel
on-time      615796
late          85806
very-late     73004
Name: count, dtype: int64

In [12]:
# Set up SMOTE with a fixed seed
smote = SMOTE(random_state=1)

# Over-sample the under-represented classes; only the training split is resampled
resampled = smote.fit_resample(train_x, train_y)
balanced_train_x, balanced_train_y = resampled



In [13]:
# Class distribution of the training labels after SMOTE resampling
balanced_train_y.value_counts()

DelayLabel
on-time      615796
very-late    615796
late         615796
Name: count, dtype: int64

In [14]:
# Define and fit the model on the SMOTE-balanced training data.
# Despite its name, `dtc` is a random forest. It is seeded with the same
# random_state as the train/test split and SMOTE so that a fresh
# "Restart & Run All" reproduces the same forest and the same metrics.
dtc = RandomForestClassifier(random_state=1)
# fit() returns the estimator itself, so `model` and `dtc` refer to the same fitted object
model = dtc.fit(balanced_train_x, balanced_train_y)




In [None]:
# Disabled: 10-fold cross-validation of the classifier on the original
# (non-resampled) training split, scored with micro-averaged F1.
#cross_val = cross_val_score(dtc, train_x, train_y, cv=10, scoring="f1_micro")
#cross_val

In [None]:
# Predict labels for the held-out test set
predicted_values = dtc.predict(test_x)

In [None]:
# Compare the true test labels with the predicted ones
acc = accuracy_score(test_y, predicted_values)
conf_matrix = confusion_matrix(test_y, predicted_values)

# Precision and recall are both averaged over the classes with 'weighted' averaging
prec, rec = (
    scorer(test_y, predicted_values, average='weighted')
    for scorer in (precision_score, recall_score)
)

In [None]:
# Predict class probabilities for the test set
predicted_probabilities = dtc.predict_proba(test_x)

# Show the predicted probabilities of the first 5 test examples
first_rows = predicted_probabilities[:5]
print(first_rows)

In [None]:
# Headline scores, one per line
for line in (f"Accuracy: {acc}", f"Precision: {prec}", f"Recall: {rec}"):
    print(line)

# Confusion matrix, preceded by the fitted model's class labels
classes = model.classes_
print("Confusion Matrix:")
print(f"Labels: {classes}")
print(conf_matrix)

In [None]:
def print_sorted_feature_importance_and_low_values(model, train_x, threshold=0.005):
    """Print all features ranked by importance, then the ones below ``threshold``.

    Parameters
    ----------
    model : object exposing ``feature_importances_``
        Fitted estimator whose importances are reported.
    train_x : object exposing ``columns``
        Supplies the feature names, paired positionally with the importances.
    threshold : float, default 0.005
        Features with an importance strictly below this value are listed
        separately, followed by their count.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Pair each column name with its importance and rank from high to low
    ranked = sorted(
        zip(train_x.columns, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Print the full ranking and collect the low-importance names in the same pass
    print("Feature importance sorteret fra høj til lav:")
    below_threshold = []
    for name, score in ranked:
        print(f"{name}: {score}")
        if score < threshold:
            below_threshold.append(name)

    # List the features under the threshold, then how many there are
    print(f"\nFeatures med en importance-værdi under {threshold}:")
    for name in below_threshold:
        print(name)

    print(len(below_threshold))


# Once the model has been trained, call this function with the model and the training data
print_sorted_feature_importance_and_low_values(model, train_x)
