In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier

# (Optional) CatBoost if installed
try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False


In [2]:
df = pd.read_csv(r"D:\Big data analysis\input_data.csv")
target = "ARR_DEL15"

In [3]:
leakage_cols = [
    "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15", "WHEELS_OFF", "WHEELS_ON",
    "ARR_DELAY", "ARR_DELAY_NEW", "ARR_TIME", "ACTUAL_ELAPSED_TIME",
    "AIR_TIME", "TAXI_IN", "TAXI_OUT"
]

In [4]:
df = df.drop(columns=[c for c in leakage_cols if c in df.columns], errors="ignore")

In [5]:
df = df.dropna(subset=[target])

In [6]:
X = df.drop(columns=[target])
y = df[target].astype(int)

In [7]:
for col in X.select_dtypes(include=["object", "category"]).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

In [8]:
for col in X.columns:
    if X[col].dtype in [np.float64, np.int64]:
        X[col] = X[col].fillna(X[col].median())
    else:
        X[col] = X[col].fillna(X[col].mode()[0])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
models = {
    "XGBoost (GPU)": XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="gpu_hist",  # uses GPU if available
        random_state=42
    )
}

if CATBOOST_AVAILABLE:
    models["CatBoost (GPU)"] = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        task_type="GPU",  # GPU mode
        verbose=0,
        random_state=42
    )


In [11]:
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"{name} Results:")
    print(f"  Accuracy : {acc:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall   : {recall:.4f}")
    print(f"  F1-score : {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))



Training XGBoost (GPU)...



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGBoost (GPU) Results:
  Accuracy : 0.9972
  Precision: 0.9989
  Recall   : 0.9882
  F1-score : 0.9935

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    854987
           1       1.00      0.99      0.99    234119

    accuracy                           1.00   1089106
   macro avg       1.00      0.99      1.00   1089106
weighted avg       1.00      1.00      1.00   1089106


Training CatBoost (GPU)...
CatBoost (GPU) Results:
  Accuracy : 0.9972
  Precision: 0.9988
  Recall   : 0.9879
  F1-score : 0.9933

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    854987
           1       1.00      0.99      0.99    234119

    accuracy                           1.00   1089106
   macro avg       1.00      0.99      1.00   1089106
weighted avg       1.00      1.00      1.00   1089106

