In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# (Optional) CatBoost if installed
try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False

# ===== Load Data =====
df = pd.read_csv(r"D:\Big data analysis\splited dataset\dataset_part_10.csv")
target = "ARR_DEL15"

# Drop leakage columns if present
leakage_cols = [
    "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15", "WHEELS_OFF", "WHEELS_ON",
    "ARR_DELAY", "ARR_DELAY_NEW", "ARR_TIME", "ACTUAL_ELAPSED_TIME",
    "AIR_TIME", "TAXI_IN", "TAXI_OUT"
]
df = df.drop(columns=[c for c in leakage_cols if c in df.columns], errors="ignore")

# Drop rows with missing target
df = df.dropna(subset=[target])

# ===== Split X / y =====
X = df.drop(columns=[target])
y = df[target].astype(int)

# Encode categorical columns
for col in X.select_dtypes(include=["object", "category"]).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# ==== Handle missing values ====
# Fill numeric NaN with median, categorical with mode
for col in X.columns:
    if X[col].dtype in [np.float64, np.int64]:
        X[col] = X[col].fillna(X[col].median())
    else:
        X[col] = X[col].fillna(X[col].mode()[0])

# ===== Train/Test split =====
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ===== Models =====
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost (GPU)": XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="gpu_hist",  # uses GPU if available
        random_state=42
    )
}

if CATBOOST_AVAILABLE:
    models["CatBoost (GPU)"] = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        task_type="GPU",  # GPU mode
        verbose=0,
        random_state=42
    )

# ===== Train & Evaluate =====
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")



Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.8028

Training Random Forest...
Random Forest Accuracy: 0.9966

Training XGBoost (GPU)...



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


XGBoost (GPU) Accuracy: 0.9967

Training CatBoost (GPU)...
CatBoost (GPU) Accuracy: 0.9967
