In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

# Reading data
# Loads "merged_data.csv" (path relative to the notebook's working directory)
# into the `data` DataFrame that the modelling cells below read from.
data = pd.read_csv("merged_data.csv")

In [None]:
# Predict delay bin including 0 delay
# Baseline: multi-class XGBoost trained on the labels as they are, with no
# re-weighting or re-sampling of the classes.

# Loading Inputs
input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score"]
X = data[input_features]

# Loading Outputs (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, inf)
y = data["delay_bin"]

# Splitting data into training and testing sets
# Positional split: the first 75% of rows train the model, the last 25% test it.
# Rows are NOT shuffled, so the split follows whatever order the CSV is in.
split_idx = int(len(data) * 0.75)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Building the model
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores
# it and only prints a "Parameters: { "use_label_encoder" } are not used" warning.
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,               # one class per delay bin listed above
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Training the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
# zero_division=0 reports 0.00 for classes that are never predicted (same values
# as the default) without emitting an UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, zero_division=0))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9905
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     86275
           1       0.00      0.00      0.00        99
           2       0.00      0.00      0.00        93
           3       0.00      0.00      0.00        87
           4       0.00      0.00      0.00        96
           5       0.00      0.00      0.00        56
           6       0.00      0.00      0.00        36
           7       0.00      0.00      0.00        42
           8       0.00      0.00      0.00        39
           9       0.00      0.00      0.00        37
          10       0.00      0.00      0.00        27
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00        20
          13       0.00      0.00      0.00        13
          14       0.00      0.00      0.00        13
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00        11
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Predict delay bin including 0 delay, and using class weights
# Same model as the baseline, but every training row is weighted by the
# "balanced" weight of its class so rare delay bins count more in the loss.

# Loading Inputs
input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score"]
X = data[input_features]

# Loading Outputs (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, inf)
y = data["delay_bin"]

# Splitting data into training and testing sets
# Positional split: the first 75% of rows train the model, the last 25% test it.
# Rows are NOT shuffled, so the split follows whatever order the CSV is in.
split_idx = int(len(data) * 0.75)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Building Class weights
# Weights are derived from the training labels only, so the test set does not
# influence them.
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
weight_dict = dict(zip(classes, class_weights))

# Showing weights
print(weight_dict)

# Building the model
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores
# it and only prints a "Parameters: { "use_label_encoder" } are not used" warning.
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,               # one class per delay bin listed above
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Mapping each training sample to its class weight
sample_weights = y_train.map(weight_dict)

# Training the model
model.fit(X_train, y_train, sample_weight=sample_weights)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
# zero_division=0 reports 0.00 for classes that are never predicted (same values
# as the default) without emitting an UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, zero_division=0))

{np.int64(0): np.float64(0.04039060960293692), np.int64(1): np.float64(26.06503740648379), np.int64(2): np.float64(33.93532467532467), np.int64(3): np.float64(42.83639344262295), np.int64(4): np.float64(46.45368888888889), np.int64(5): np.float64(67.43277419354838), np.int64(6): np.float64(69.21907284768211), np.int64(7): np.float64(91.68491228070175), np.int64(8): np.float64(101.47650485436893), np.int64(9): np.float64(129.03802469135803), np.int64(10): np.float64(132.30481012658228), np.int64(11): np.float64(160.80123076923076), np.int64(12): np.float64(186.6442857142857), np.int64(13): np.float64(204.9427450980392), np.int64(14): np.float64(282.4886486486486), np.int64(15): np.float64(337.16387096774196), np.int64(16): np.float64(261.302), np.int64(17): np.float64(418.0832), np.int64(18): np.float64(497.71809523809526), np.int64(19): np.float64(418.0832), np.int64(20): np.float64(475.0945454545455), np.int64(21): np.float64(435.50333333333333), np.int64(22): np.float64(653.255), np.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6665
              precision    recall  f1-score   support

           0       0.99      0.67      0.80     86275
           1       0.00      0.06      0.00        99
           2       0.00      0.11      0.01        93
           3       0.00      0.10      0.01        87
           4       0.00      0.05      0.00        96
           5       0.00      0.00      0.00        56
           6       0.00      0.03      0.00        36
           7       0.00      0.00      0.00        42
           8       0.00      0.00      0.00        39
           9       0.00      0.05      0.01        37
          10       0.00      0.00      0.00        27
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00        20
          13       0.00      0.00      0.00        13
          14       0.00      0.08      0.01        13
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00        11
          

In [None]:
# Predict delay bin including 0 delay, and balancing 0 delay data to delay data size
# The 0-delay rows of the TRAINING partition are down-sampled to the number of
# delayed rows; the test partition is left untouched so the scores stay
# comparable with the cells that use the same positional 75/25 split.

SEED = 42  # fixed seed so the down-sampling and the shuffle are reproducible

# Loading Inputs
input_features = ["wind_dir", "wind_speed", "ceiling", "visibility", "temp", "dew_pnt", "pressure", "congestion_score"]
X = data[input_features]

# Loading Outputs (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, inf)
y = data["delay_bin"]

# Splitting data into training and testing sets
# The split is done BEFORE balancing: sampling from the whole frame could pull
# rows of the test partition into the training set (data leakage).
split_idx = int(len(data) * 0.75)
train_part = data[:split_idx]
X_test = X[split_idx:]
y_test = y[split_idx:]

# Extract 0 delay data, and above 0 delay data (training rows only)
# A Series must be negated element-wise with `~`; Python's `not` raises
# "The truth value of a Series is ambiguous".
# NOTE(review): assumes "yes_delay" is a boolean / 0-1 flag with no missing values - confirm.
delay_mask = train_part["yes_delay"].astype(bool)
yes_delay = train_part[delay_mask]
no_delay = train_part[~delay_mask]

# Sampling 0 delay data down to the size of the delayed data
# min() keeps the sample valid if there are fewer 0-delay rows than delayed rows.
n_no_delay = min(len(yes_delay), len(no_delay))
no_delay_sample = no_delay.sample(n=n_no_delay, random_state=SEED)

# Building training data sets
train_data = pd.concat([yes_delay, no_delay_sample], ignore_index=True) # combining datasets
train_data = train_data.sample(frac=1, random_state=SEED).reset_index(drop=True) # Shuffling training data

# The model is fitted on the balanced frame built above, not on the raw slice.
X_train = train_data[input_features]
y_train = train_data["delay_bin"]

# Building the model
# `use_label_encoder` is intentionally not passed: recent XGBoost releases ignore
# it and only print a "Parameters ... are not used" warning.
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,              # fraction of samples per tree
    colsample_bytree=0.8,       # fraction of features per tree
    num_class=25,               # one class per delay bin listed above
    eval_metric='mlogloss',
    objective='multi:softmax'
)

# Training the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
# zero_division=0 reports 0.00 for classes that are never predicted (same values
# as the default) without emitting an UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, zero_division=0))