In [None]:
import pandas as pd
import numpy as np
import subprocess
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Global configs
RANDOM_STATE = 42
N_JOBS = -1

# ----------------------------
# 0) Detect GPU availability
# ----------------------------
def has_gpu():
    try:
        subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        print("✅ GPU detected! Using GPU acceleration for XGBoost & CatBoost.")
        return True
    except Exception:
        print("⚠️ No GPU detected. Running on CPU.")
        return False

GPU_AVAILABLE = has_gpu()

# ----------------------------
# 1) Load dataset
# ----------------------------
df = pd.read_csv(r"D:\flight delay\flight_data_2018_2024.csv\flight_data_2018_2024.csv")
print(f'Loaded data with shape: {df.shape}')

# ----------------------------
# 2) Target selection
# ----------------------------
def choose_target(df):
    for col in ['DepDelay', 'ArrDelay', 'Delay', 'DelayMinutes']:
        if col in df.columns:
            print(f"Using target column: {col}")
            return col
    delay_cols = [c for c in df.columns if 'delay' in c.lower() and pd.api.types.is_numeric_dtype(df[c])]
    if delay_cols:
        print(f"Using detected numeric delay column: {delay_cols[0]}")
        return delay_cols[0]
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise ValueError('No numeric columns found to use as a regression target.')
    print(f"No explicit delay column found. Falling back to first numeric column: {numeric_cols[0]}")
    return numeric_cols[0]

# ----------------------------
# 3) Preprocessing
# ----------------------------
def preprocess(df, target_col, keep_cols=None):
    df = df[~df[target_col].isna()].reset_index(drop=True)
    if keep_cols is None:
        keep_cols = ['CancellationCode', 'Org_Airport', 'Dest_Airport']
    keep_cols = [c for c in keep_cols if c in df.columns]
    X = df.drop(columns=[target_col])
    y = df[target_col].astype(float)
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    for c in keep_cols:
        if c in X.columns and c not in cat_cols:
            cat_cols.append(c)
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    print(f'Categorical columns to label-encode: {cat_cols}')
    print(f'Numeric columns to standardize: {num_cols}')
    return X, y, cat_cols, num_cols

def fit_label_encoders(X_train, X_valid, cat_cols):
    for col in cat_cols:
        le = LabelEncoder()
        X_train[col] = X_train[col].fillna('___MISSING___').astype(str)
        X_valid[col] = X_valid[col].fillna('___MISSING___').astype(str)
        le.fit(X_train[col])
        X_train[col] = le.transform(X_train[col])
        valid_mapped = []
        classes = set(le.classes_)
        unk_code = len(le.classes_)
        for v in X_valid[col]:
            if v in classes:
                valid_mapped.append(int(np.where(le.classes_ == v)[0][0]))
            else:
                valid_mapped.append(unk_code)
        X_valid[col] = valid_mapped
    return X_train, X_valid

def standardize_numeric(X_train, X_valid, num_cols):
    scaler = StandardScaler()
    if num_cols:
        X_train[num_cols] = scaler.fit_transform(X_train[num_cols].fillna(0))
        X_valid[num_cols] = scaler.transform(X_valid[num_cols].fillna(0))
    return X_train, X_valid

# ----------------------------
# 4) Parameter grids
# ----------------------------
def get_param_distributions():
    xgb_dist = {
        'n_estimators': [100, 200, 400],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.6, 0.8, 1.0]
    }
    cat_dist = {
        'iterations': [200, 400],
        'depth': [4, 6, 10],
        'learning_rate': [0.01, 0.05, 0.1]
    }
    return xgb_dist, cat_dist

# ----------------------------
# 5) Tuning function
# ----------------------------
def tune_model(estimator, param_distributions, X, y, n_iter=8, cv=3):
    cv = KFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    rs = RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=n_iter,
        scoring='neg_root_mean_squared_error',
        n_jobs=N_JOBS,
        cv=cv,
        random_state=RANDOM_STATE,
        verbose=1
    )
    rs.fit(X, y)
    print(f'Best score: {rs.best_score_} | Best params: {rs.best_params_}')
    return rs.best_estimator_

# ----------------------------
# 6) Main pipeline
# ----------------------------
target_col = choose_target(df)
X, y, cat_cols, num_cols = preprocess(df, target_col)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_test = fit_label_encoders(X_train.copy(), X_test.copy(), cat_cols)
X_train, X_test = standardize_numeric(X_train, X_test, num_cols)

xgb_dist, cat_dist = get_param_distributions()

# XGBoost (GPU if available)
xgb_best = tune_model(
    XGBRegressor(
        random_state=RANDOM_STATE,
        n_jobs=1,
        objective='reg:squarederror',
        verbosity=0,
        tree_method='gpu_hist' if GPU_AVAILABLE else 'hist',
        predictor='gpu_predictor' if GPU_AVAILABLE else 'auto'
    ),
    xgb_dist, X_train, y_train, n_iter=8
)

# CatBoost (GPU if available)
cat_best = tune_model(
    CatBoostRegressor(
        random_state=RANDOM_STATE,
        verbose=0,
        task_type='GPU' if GPU_AVAILABLE else 'CPU',
        devices='0' if GPU_AVAILABLE else None
    ),
    cat_dist, X_train, y_train, n_iter=8
)

# ----------------------------
# 7) Stacking Ensemble
# ----------------------------
stack = StackingRegressor(
    estimators=[('xgb', xgb_best), ('cat', cat_best)],
    final_estimator=RidgeCV(),
    n_jobs=N_JOBS
)

print('\nTraining stacking regressor...')
stack.fit(X_train, y_train)

preds = stack.predict(X_test)
rmse = mean_squared_error(y_test, preds, squared=False)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print('\nTest set evaluation:')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R2: {r2:.4f}')


✅ GPU detected! Using GPU acceleration for XGBoost & CatBoost.


  df = pd.read_csv(r"D:\flight delay\flight_data_2018_2024.csv\flight_data_2018_2024.csv")


Loaded data with shape: (582425, 120)
Using target column: DepDelay
Categorical columns to label-encode: ['FlightDate', 'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners', 'IATA_Code_Marketing_Airline', 'Originally_Scheduled_Code_Share_Airline', 'IATA_Code_Originally_Scheduled_Code_Share_Airline', 'Operating_Airline ', 'IATA_Code_Operating_Airline', 'Tail_Number', 'Origin', 'OriginCityName', 'OriginState', 'OriginStateName', 'Dest', 'DestCityName', 'DestState', 'DestStateName', 'DepTimeBlk', 'ArrTimeBlk', 'CancellationCode', 'Div1Airport', 'Div1TailNum', 'Div2Airport', 'Div2TailNum', 'Div3Airport', 'Div3TailNum', 'Duplicate']
Numeric columns to standardize: ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DOT_ID_Marketing_Airline', 'Flight_Number_Marketing_Airline', 'DOT_ID_Originally_Scheduled_Code_Share_Airline', 'Flight_Num_Originally_Scheduled_Code_Share_Airline', 'DOT_ID_Operating_Airline', 'Flight_Number_Operating_Airline', 'OriginAirportID', 'OriginAi

In [1]:
import pandas as pd
import numpy as np
import subprocess
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
RANDOM_STATE = 42
N_JOBS = -1

# ----------------------------
# 0) Detect GPU availability
# ----------------------------
def has_gpu():
    try:
        subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        print("✅ GPU detected! Using GPU acceleration for XGBoost & CatBoost.")
        return True
    except Exception:
        print("⚠️ No GPU detected. Running on CPU.")
        return False

GPU_AVAILABLE = has_gpu()

✅ GPU detected! Using GPU acceleration for XGBoost & CatBoost.


In [3]:
df = pd.read_csv(r"D:\flight delay\flight_data_2018_2024.csv\flight_data_2018_2024.csv")
print(f'Loaded data with shape: {df.shape}')


  df = pd.read_csv(r"D:\flight delay\flight_data_2018_2024.csv\flight_data_2018_2024.csv")


Loaded data with shape: (582425, 120)


In [4]:
def choose_target(df):
    for col in ['DepDelay', 'ArrDelay', 'Delay', 'DelayMinutes']:
        if col in df.columns:
            print(f"Using target column: {col}")
            return col
    delay_cols = [c for c in df.columns if 'delay' in c.lower() and pd.api.types.is_numeric_dtype(df[c])]
    if delay_cols:
        print(f"Using detected numeric delay column: {delay_cols[0]}")
        return delay_cols[0]
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise ValueError('No numeric columns found to use as a regression target.')
    print(f"No explicit delay column found. Falling back to first numeric column: {numeric_cols[0]}")
    return numeric_cols[0]

In [5]:
def preprocess(df, target_col, keep_cols=None):
    df = df[~df[target_col].isna()].reset_index(drop=True)
    if keep_cols is None:
        keep_cols = ['CancellationCode', 'Org_Airport', 'Dest_Airport']
    keep_cols = [c for c in keep_cols if c in df.columns]
    X = df.drop(columns=[target_col])
    y = df[target_col].astype(float)
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    for c in keep_cols:
        if c in X.columns and c not in cat_cols:
            cat_cols.append(c)
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    print(f'Categorical columns to label-encode: {cat_cols}')
    print(f'Numeric columns to standardize: {num_cols}')
    return X, y, cat_cols, num_cols


In [6]:
def fit_label_encoders(X_train, X_valid, cat_cols):
    for col in cat_cols:
        le = LabelEncoder()
        X_train[col] = X_train[col].fillna('___MISSING___').astype(str)
        X_valid[col] = X_valid[col].fillna('___MISSING___').astype(str)
        le.fit(X_train[col])
        X_train[col] = le.transform(X_train[col])
        valid_mapped = []
        classes = set(le.classes_)
        unk_code = len(le.classes_)
        for v in X_valid[col]:
            if v in classes:
                valid_mapped.append(int(np.where(le.classes_ == v)[0][0]))
            else:
                valid_mapped.append(unk_code)
        X_valid[col] = valid_mapped
    return X_train, X_valid

In [7]:
def standardize_numeric(X_train, X_valid, num_cols):
    scaler = StandardScaler()
    if num_cols:
        X_train[num_cols] = scaler.fit_transform(X_train[num_cols].fillna(0))
        X_valid[num_cols] = scaler.transform(X_valid[num_cols].fillna(0))
    return X_train, X_valid

In [8]:
def get_param_distributions():
    xgb_dist = {
        'n_estimators': [100, 200, 400],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.6, 0.8, 1.0]
    }
    cat_dist = {
        'iterations': [200, 400],
        'depth': [4, 6, 10],
        'learning_rate': [0.01, 0.05, 0.1]
    }
    return xgb_dist, cat_dist

In [9]:
def tune_model(estimator, param_distributions, X, y, n_iter=8, cv=3):
    cv = KFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    rs = RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=n_iter,
        scoring='neg_root_mean_squared_error',
        n_jobs=N_JOBS,
        cv=cv,
        random_state=RANDOM_STATE,
        verbose=1
    )
    rs.fit(X, y)
    print(f'Best score: {rs.best_score_} | Best params: {rs.best_params_}')
    return rs.best_estimator_

In [10]:
target_col = choose_target(df)
X, y, cat_cols, num_cols = preprocess(df, target_col)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_test = fit_label_encoders(X_train.copy(), X_test.copy(), cat_cols)
X_train, X_test = standardize_numeric(X_train, X_test, num_cols)

xgb_dist, cat_dist = get_param_distributions()

Using target column: DepDelay
Categorical columns to label-encode: ['FlightDate', 'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners', 'IATA_Code_Marketing_Airline', 'Originally_Scheduled_Code_Share_Airline', 'IATA_Code_Originally_Scheduled_Code_Share_Airline', 'Operating_Airline ', 'IATA_Code_Operating_Airline', 'Tail_Number', 'Origin', 'OriginCityName', 'OriginState', 'OriginStateName', 'Dest', 'DestCityName', 'DestState', 'DestStateName', 'DepTimeBlk', 'ArrTimeBlk', 'CancellationCode', 'Div1Airport', 'Div1TailNum', 'Div2Airport', 'Div2TailNum', 'Div3Airport', 'Div3TailNum', 'Duplicate']
Numeric columns to standardize: ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DOT_ID_Marketing_Airline', 'Flight_Number_Marketing_Airline', 'DOT_ID_Originally_Scheduled_Code_Share_Airline', 'Flight_Num_Originally_Scheduled_Code_Share_Airline', 'DOT_ID_Operating_Airline', 'Flight_Number_Operating_Airline', 'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'Or

In [11]:
xgb_best = tune_model(
    XGBRegressor(
        random_state=RANDOM_STATE,
        n_jobs=1,
        objective='reg:squarederror',
        verbosity=0,
        tree_method='gpu_hist' if GPU_AVAILABLE else 'hist',
        predictor='gpu_predictor' if GPU_AVAILABLE else 'auto'
    ),
    xgb_dist, X_train, y_train, n_iter=8
)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


2 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\xgboost\sklearn.py", line 1222, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\Ap

Best score: -7.586179577742712 | Best params: {'subsample': 0.8, 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1}


In [None]:
cat_best = tune_model(
    CatBoostRegressor(
        random_state=RANDOM_STATE,
        verbose=0,
        task_type='GPU' if GPU_AVAILABLE else 'CPU',
        devices='0' if GPU_AVAILABLE else None
    ),
    cat_dist, X_train, y_train, n_iter=8
)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("flight_data_2018_2024.csv", low_memory=False)

# Drop unused columns
df = df.drop(columns=["Unnamed: 119"], errors="ignore")

# Remove rows with missing target
df = df.dropna(subset=["ArrDel15"])

# Separate features and target
y = df["ArrDel15"]
X = df.drop(columns=["ArrDel15"])

# Fill missing values
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = X[col].fillna("Unknown")
    else:
        X[col] = X[col].fillna(-1)

# Encode categorical columns
label_encoders = {}
for col in X.columns:
    if X[col].dtype == "object":
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Results
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Example prediction with first row
sample = X.iloc[[0]]
print("Predicted delay for first flight:", model.predict(sample))


Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     84878
         1.0       1.00      1.00      1.00     26865

    accuracy                           1.00    111743
   macro avg       1.00      1.00      1.00    111743
weighted avg       1.00      1.00      1.00    111743

Predicted delay for first flight: [1.]


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("flight_data_2018_2024.csv", low_memory=False)

# Drop unused columns
df = df.drop(columns=["Unnamed: 119"], errors="ignore")

# Remove rows with missing target
df = df.dropna(subset=["ArrDel15"])

# Separate features and target
y = df["ArrDel15"]
X = df.drop(columns=["ArrDel15"])

# Fill missing values
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = X[col].fillna("Unknown")
    else:
        X[col] = X[col].fillna(-1)

# Encode categorical columns
label_encoders = {}
for col in X.columns:
    if X[col].dtype == "object":
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to compare
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier()
}

# Train and evaluate each model
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))



=== Random Forest ===
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     84878
         1.0       1.00      1.00      1.00     26865

    accuracy                           1.00    111743
   macro avg       1.00      1.00      1.00    111743
weighted avg       1.00      1.00      1.00    111743


=== Logistic Regression ===
Accuracy: 0.944569234761909
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96     84878
         1.0       0.96      0.80      0.87     26865

    accuracy                           0.94    111743
   macro avg       0.95      0.90      0.92    111743
weighted avg       0.95      0.94      0.94    111743


=== XGBoost ===
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     84878
         1.0       1.00      1.00      1.00     26865

    accuracy                           1.00    111743
   