In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

# Reading data
data = pd.read_csv("merged_data.csv")

In [None]:
# *********************************************************************************************************************
# Predict delay bin including 0 delay
# *********************************************************************************************************************

# Loading Inputs
input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score", "time_of_day", "time_diff"]
X = data[input_features]

# Loading Outputs (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, inf)
y = data["delay_bin"]

# Splitting data into training and testing sets
X_train, X_test = X[:int(len(data)*0.75)], X[int(len(data)*0.75):]
y_train, y_test = y[:int(len(data)*0.75)], y[int(len(data)*0.75):]

# Building the model
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Training the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.9905
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     86275
           1       0.00      0.00      0.00        99
           2       0.00      0.00      0.00        93
           3       0.00      0.00      0.00        87
           4       0.00      0.00      0.00        96
           5       0.00      0.00      0.00        56
           6       0.00      0.00      0.00        36
           7       0.00      0.00      0.00        42
           8       0.00      0.00      0.00        39
           9       0.00      0.00      0.00        37
          10       0.00      0.00      0.00        27
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00        20
          13       0.00      0.00      0.00        13
          14       0.00      0.00      0.00        13
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00        11
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# *********************************************************************************************************************
# Predict delay bin including 0 delay, and using class weights
# *********************************************************************************************************************

# Loading Inputs
input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score", "time_of_day", "time_diff"]
X = data[input_features]

# Loading Outputs (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, inf)
y = data["delay_bin"]

# Splitting data into training and testing sets
X_train, X_test = X[:int(len(data)*0.75)], X[int(len(data)*0.75):]
y_train, y_test = y[:int(len(data)*0.75)], y[int(len(data)*0.75):]

# Building Class weights
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
weight_dict = dict(zip(classes, class_weights))

# Showing weights
print(weight_dict)

# Building the model
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Mapping each training sample to its class weight
sample_weights = y_train.map(weight_dict)

# Training the model
model.fit(X_train, y_train, sample_weight=sample_weights)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(classification_report(y_test, y_pred))

{np.int64(0): np.float64(0.04039060960293692), np.int64(1): np.float64(26.06503740648379), np.int64(2): np.float64(33.93532467532467), np.int64(3): np.float64(42.83639344262295), np.int64(4): np.float64(46.45368888888889), np.int64(5): np.float64(67.43277419354838), np.int64(6): np.float64(69.21907284768211), np.int64(7): np.float64(91.68491228070175), np.int64(8): np.float64(101.47650485436893), np.int64(9): np.float64(129.03802469135803), np.int64(10): np.float64(132.30481012658228), np.int64(11): np.float64(160.80123076923076), np.int64(12): np.float64(186.6442857142857), np.int64(13): np.float64(204.9427450980392), np.int64(14): np.float64(282.4886486486486), np.int64(15): np.float64(337.16387096774196), np.int64(16): np.float64(261.302), np.int64(17): np.float64(418.0832), np.int64(18): np.float64(497.71809523809526), np.int64(19): np.float64(418.0832), np.int64(20): np.float64(475.0945454545455), np.int64(21): np.float64(435.50333333333333), np.int64(22): np.float64(653.255), np.

In [55]:
# *********************************************************************************************************************
# Predict delay bin including 0 delay, and balancing 0 delay data to delay data size
# *********************************************************************************************************************

# Extract 0 delay data, and above 0 delay data
no_delay = data[data["yes_delay"] == False]
yes_delay = data[data["yes_delay"]]

# Sampling 0 delay data (THERE IS POTENTIAL DATA LEAKAGE)
no_delay_sample = no_delay.sample(n=len(yes_delay))

# Building training data sets
yes_delay_train = yes_delay[:int(len(yes_delay)*0.75)]
no_delay_train = no_delay_sample[:int(len(no_delay_sample)*0.75)]
train_data = pd.concat([yes_delay_train, no_delay_train], ignore_index=False) # combining datasets
train_data = train_data.sample(frac=1).reset_index(drop=True) # Shuffling training data

input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score", "time_of_day", "time_diff"]
X_train = train_data[input_features]
y_train = train_data["delay_bin"]

# Building testing data sets
yes_delay_test = yes_delay[int(len(yes_delay)*0.75):]
no_delay_test = no_delay_sample[int(len(no_delay_sample)*0.75):]
test_data = pd.concat([yes_delay_test, no_delay_test], ignore_index=False) # combining datasets
test_data = test_data.sample(frac=1).reset_index(drop=True) # Shuffling training data

input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score", "time_of_day", "time_diff"]
X_test = test_data[input_features]
y_test = test_data["delay_bin"]

# Building the model
model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Training the model
model.fit(X_train, y_train)

# Checking Overfitting
y_train_pred = model.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_acc:.4f}")

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose())

Training Accuracy: 0.6275
Accuracy: 0.4934
              precision    recall  f1-score      support
0              0.536634  0.969011  0.690739   839.000000
1              0.073529  0.050000  0.059524   100.000000
2              0.080000  0.042553  0.055556    94.000000
3              0.222222  0.022727  0.041237    88.000000
4              0.142857  0.020619  0.036036    97.000000
5              0.000000  0.000000  0.000000    57.000000
6              0.000000  0.000000  0.000000    38.000000
7              0.000000  0.000000  0.000000    42.000000
8              0.000000  0.000000  0.000000    40.000000
9              0.000000  0.000000  0.000000    37.000000
10             0.000000  0.000000  0.000000    28.000000
11             0.000000  0.000000  0.000000    23.000000
12             0.000000  0.000000  0.000000    20.000000
13             0.000000  0.000000  0.000000    14.000000
14             0.000000  0.000000  0.000000    13.000000
15             0.000000  0.000000  0.000000  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# *********************************************************************************************************************
# Predict delay bin including 0 delay, using class weights, and balancing 0 delay data to delay data size
# *********************************************************************************************************************

# Extract 0 delay data, and above 0 delay data
no_delay = data[data["yes_delay"] == False]
yes_delay = data[data["yes_delay"]]

# Sampling 0 delay data (THERE IS POTENTIAL DATA LEAKAGE)
no_delay_sample = no_delay.sample(n=len(yes_delay))

# Building training data sets
yes_delay_train = yes_delay[:int(len(yes_delay)*0.75)]
no_delay_train = no_delay_sample[:int(len(no_delay_sample)*0.75)]
train_data = pd.concat([yes_delay_train, no_delay_train], ignore_index=False) # combining datasets
train_data = train_data.sample(frac=1).reset_index(drop=True) # Shuffling training data

input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score", "time_of_day", "time_diff"]
X_train = train_data[input_features]
y_train = train_data["delay_bin"]

# Building testing data sets
yes_delay_test = yes_delay[int(len(yes_delay)*0.75):]
no_delay_test = no_delay_sample[int(len(no_delay_sample)*0.75):]
test_data = pd.concat([yes_delay_test, no_delay_test], ignore_index=False) # combining datasets
test_data = test_data.sample(frac=1).reset_index(drop=True) # Shuffling training data

input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score", "time_of_day", "time_diff"]
X_test = test_data[input_features]
y_test = test_data["delay_bin"]

# Building Class weights
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
weight_dict = dict(zip(classes, class_weights))

# Showing weights
print(weight_dict)

# Building the model
model = XGBClassifier(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Mapping each training sample to its class weight
sample_weights = y_train.map(weight_dict)

# Training the model
model.fit(X_train, y_train, sample_weight=sample_weights)

# Checking Overfitting
y_train_pred = model.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_acc:.4f}")

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Testing Accuracy: {acc:.4f}")
print(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose())

{np.int64(0): np.float64(0.08), np.int64(1): np.float64(0.5028), np.int64(2): np.float64(0.655114006514658), np.int64(3): np.float64(0.8276543209876543), np.int64(4): np.float64(0.8978571428571429), np.int64(5): np.float64(1.305974025974026), np.int64(6): np.float64(1.349798657718121), np.int64(7): np.float64(1.7642105263157895), np.int64(8): np.float64(1.9717647058823529), np.int64(9): np.float64(2.482962962962963), np.int64(10): np.float64(2.5784615384615384), np.int64(11): np.float64(3.1425), np.int64(12): np.float64(3.5914285714285716), np.int64(13): np.float64(4.0224), np.int64(14): np.float64(5.435675675675676), np.int64(15): np.float64(6.487741935483871), np.int64(16): np.float64(5.028), np.int64(17): np.float64(8.0448), np.int64(18): np.float64(9.577142857142857), np.int64(19): np.float64(8.0448), np.int64(20): np.float64(9.141818181818183), np.int64(21): np.float64(8.38), np.int64(22): np.float64(12.57), np.int64(23): np.float64(13.408), np.int64(24): np.float64(0.852203389830

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# *********************************************************************************************************************
# Build one model to predict yes/no delay
# Build another model to predict time delay bin if yes delay
# *********************************************************************************************************************

# REBUILDING MODEL AND DATASETS

# Extract 0 delay data, and above 0 delay data
no_delay = data[data["yes_delay"] == False]
yes_delay = data[data["yes_delay"]]

# Sampling 0 delay data (THERE IS POTENTIAL DATA LEAKAGE)
no_delay_sample = no_delay.sample(n=len(yes_delay))

# Building training data sets
yes_delay_train = yes_delay[:int(len(yes_delay)*0.75)]
no_delay_train = no_delay_sample[:int(len(no_delay_sample)*0.75)]
train_data = pd.concat([yes_delay_train, no_delay_train], ignore_index=False) # combining datasets
train_data = train_data.sample(frac=1).reset_index(drop=True) # Shuffling training data

input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score"]
X_train = train_data[input_features]
y_train = train_data["delay_bin"]

# Building testing data sets
yes_delay_test = yes_delay[int(len(yes_delay)*0.75):]
no_delay_test = no_delay_sample[int(len(no_delay_sample)*0.75):]
test_data = pd.concat([yes_delay_test, no_delay_test], ignore_index=False) # combining datasets
test_data = test_data.sample(frac=1).reset_index(drop=True) # Shuffling training data

input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score"]
X_test = test_data[input_features]
y_test = test_data["delay_bin"]

# Building the model
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Training the model
model.fit(X_train, y_train)

# Checking Overfitting
y_train_pred = model.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_acc:.4f}")

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose())

Training Accuracy: 0.7285
Accuracy: 0.4911
              precision    recall  f1-score      support
0              0.546875  0.959476  0.696668   839.000000
1              0.000000  0.000000  0.000000   100.000000
2              0.101695  0.063830  0.078431    94.000000
3              0.090909  0.022727  0.036364    88.000000
4              0.000000  0.000000  0.000000    97.000000
5              0.000000  0.000000  0.000000    57.000000
6              0.055556  0.026316  0.035714    38.000000
7              0.000000  0.000000  0.000000    42.000000
8              0.000000  0.000000  0.000000    40.000000
9              0.086957  0.054054  0.066667    37.000000
10             0.000000  0.000000  0.000000    28.000000
11             0.000000  0.000000  0.000000    23.000000
12             0.000000  0.000000  0.000000    20.000000
13             0.000000  0.000000  0.000000    14.000000
14             0.000000  0.000000  0.000000    13.000000
15             0.000000  0.000000  0.000000  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
