In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# -----------------------
# Load dataset
# -----------------------
df = pd.read_csv("dataset_part_1.csv")

# -----------------------
# Create "delay" column (binary target)
# -----------------------
df["delay"] = ((df["DEP_DELAY"] > 15) | (df["ARR_DELAY"] > 15)).astype(int)

# -----------------------
# Drop columns that cause data leakage
# -----------------------
X = df.drop(columns=["delay", "DEP_DELAY", "ARR_DELAY"])
y = df["delay"]

# -----------------------
# Encode categorical features
# -----------------------
categorical_cols = X.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    X[col] = X[col].astype(str)
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# -----------------------
# Handle missing values
# -----------------------
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
imputer = SimpleImputer(strategy="mean")
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

# -----------------------
# Feature scaling
# -----------------------
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# -----------------------
# Train-test split
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------
# Feature Selection (Top 40 by correlation with target)
# -----------------------
correlation = X_train.corrwith(y_train).abs().sort_values(ascending=False)
top_features = correlation.index[:40]
X_train = X_train[top_features]
X_test = X_test[top_features]

# -----------------------
# Define Models (GPU-enabled)
# -----------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", tree_method="gpu_hist", random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, task_type="GPU", random_state=42)
}

# -----------------------
# Train & Evaluate Models
# -----------------------
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

# -----------------------
# Ensemble Stacking (GPU-enabled)
# -----------------------
estimators = [
    ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
    ("xgb", XGBClassifier(eval_metric="logloss", tree_method="gpu_hist", random_state=42)),
    ("cat", CatBoostClassifier(verbose=0, task_type="GPU", random_state=42))
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=500),
    passthrough=True
)

stacking_clf.fit(X_train, y_train)
y_pred_stack = stacking_clf.predict(X_test)

results["Ensemble Stacking"] = {
    "Accuracy": accuracy_score(y_test, y_pred_stack),
    "Precision": precision_score(y_test, y_pred_stack),
    "Recall": recall_score(y_test, y_pred_stack),
    "F1 Score": f1_score(y_test, y_pred_stack)
}

# -----------------------
# Display Results
# -----------------------
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    


Logistic Regression:
  Accuracy: 0.9922
  Precision: 0.9685
  Recall: 0.9997
  F1 Score: 0.9839

Random Forest:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

XGBoost:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

CatBoost:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

Ensemble Stacking:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
