# Flight Delay Prediction (Kaggle: January Flight Delay)

**Goal:** Predict whether a flight will be delayed by more than 15 minutes using the January Flight Delay dataset.

**Dataset link:** https://www.kaggle.com/datasets/divyansh22/flight-delay-prediction?select=Jan_2020_ontime.csv

**Dataset columns:** `DAY_OF_MONTH`,`DAY_OF_WEEK`,`OP_UNIQUE_CARRIER`,`OP_CARRIER_AIRLINE_ID`,`OP_CARRIER`,`TAIL_NUM`,`OP_CARRIER_FL_NUM`,\
`ORIGIN_AIRPORT_ID`,`ORIGIN_AIRPORT_SEQ_ID`,`ORIGIN`,`DEST_AIRPORT_ID`,`DEST_AIRPORT_SEQ_ID`,`DEST`,`DEP_TIME`,`DEP_DEL15`,`DEP_TIME_BLK`,\
`ARR_TIME`,`ARR_DEL15`,`CANCELLED`,`DIVERTED`,`DISTANCE`.

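**Target definition:** two binary targets are taken directly from the dataset — `DEP_DEL15` (departure) and `ARR_DEL15` (arrival). Each is `1` when the flight is flagged as delayed at the 15-minute threshold and `0` otherwise. Cancelled and diverted flights are excluded before modelling.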

In [110]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Tuple, List

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt


In [111]:
DATA_PATH = "/content/drive/MyDrive/Notes & Practice/Datasets/Jan_2020_ontime.csv"

# Fail fast with an actionable message when the file is not reachable
# (e.g. the drive holding the dataset has not been mounted in this session).
data_file = Path(DATA_PATH)
if not data_file.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {data_file}. Mount the drive or update DATA_PATH."
    )

# Raw flight records; the cells that follow filter rows and select columns from this frame.
df = pd.read_csv(data_file)

In [112]:
# Structural overview of the loaded frame: column names, per-column null
# counts, and summary statistics for the first 20 columns.
column_names = list(df.columns)
missing_per_column = df.isna().sum()
summary_stats = df.describe(include='all').T.head(20)

print('Columns:', column_names)
print('\nMissing values per column:\n', missing_per_column)
print('\nBasic stats:\n', summary_stats)

Columns: ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK', 'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE', 'Unnamed: 21']

Missing values per column:
 DAY_OF_MONTH                  0
DAY_OF_WEEK                   0
OP_UNIQUE_CARRIER             0
OP_CARRIER_AIRLINE_ID         0
OP_CARRIER                    0
TAIL_NUM                    698
OP_CARRIER_FL_NUM             0
ORIGIN_AIRPORT_ID             0
ORIGIN_AIRPORT_SEQ_ID         0
ORIGIN                        0
DEST_AIRPORT_ID               0
DEST_AIRPORT_SEQ_ID           0
DEST                          0
DEP_TIME                   6664
DEP_DEL15                  6699
DEP_TIME_BLK                  0
ARR_TIME                   7075
ARR_DEL15                  8078
CANCELLED                     0
DIVER

In [113]:
# Report the frame's dimensions and column names, then render its first rows.
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head(n=5)

Shape: (607346, 22)
Columns: ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK', 'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE', 'Unnamed: 21']


Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,...,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE,Unnamed: 21
0,1,3,EV,20366,EV,N48901,4397,13930,1393007,ORD,...,GRB,1003.0,0.0,1000-1059,1117.0,0.0,0.0,0.0,174.0,
1,1,3,EV,20366,EV,N16976,4401,15370,1537002,TUL,...,ORD,1027.0,0.0,1000-1059,1216.0,0.0,0.0,0.0,585.0,
2,1,3,EV,20366,EV,N12167,4404,11618,1161802,EWR,...,TYS,1848.0,0.0,1800-1859,2120.0,0.0,0.0,0.0,631.0,
3,1,3,EV,20366,EV,N14902,4405,10781,1078105,BTR,...,IAH,1846.0,0.0,1800-1859,2004.0,0.0,0.0,0.0,253.0,
4,1,3,EV,20366,EV,N606UX,4407,14524,1452401,RIC,...,IAH,1038.0,0.0,1000-1059,1330.0,0.0,0.0,0.0,1157.0,


In [114]:
# Keep only flights that were neither cancelled nor diverted. Both on-time and
# delayed flights remain; the prints show the row count left and the distinct
# values each delay-indicator column takes after filtering.
completed = (df['CANCELLED'] == 0) & (df['DIVERTED'] == 0)
df = df[completed]

print("Remaining rows:", len(df))
print(set(df['ARR_DEL15']))
print(set(df['DEP_DEL15']))

Remaining rows: 599268
{0.0, 1.0}
{0.0, 1.0}


In [115]:
# Same structural overview as before, now on the filtered frame: column names,
# per-column null counts, and summary statistics for the first 20 columns.
column_names = list(df.columns)
missing_per_column = df.isna().sum()
summary_stats = df.describe(include='all').T.head(20)

print('Columns:', column_names)
print('\nMissing values per column:\n', missing_per_column)
print('\nBasic stats:\n', summary_stats)

Columns: ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK', 'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE', 'Unnamed: 21']

Missing values per column:
 DAY_OF_MONTH                  0
DAY_OF_WEEK                   0
OP_UNIQUE_CARRIER             0
OP_CARRIER_AIRLINE_ID         0
OP_CARRIER                    0
TAIL_NUM                      0
OP_CARRIER_FL_NUM             0
ORIGIN_AIRPORT_ID             0
ORIGIN_AIRPORT_SEQ_ID         0
ORIGIN                        0
DEST_AIRPORT_ID               0
DEST_AIRPORT_SEQ_ID           0
DEST                          0
DEP_TIME                      0
DEP_DEL15                     0
DEP_TIME_BLK                  0
ARR_TIME                      0
ARR_DEL15                     0
CANCELLED                     0
DIVER

In [116]:
# Feature columns, grouped by how the preprocessing step will treat them.
# NOTE(review): DEP_TIME / ARR_TIME look like actual (not scheduled) clock times.
# If so, they are only known after the event and, next to DEP_TIME_BLK, may leak
# the delay targets — confirm against the dataset's field definitions.
NUMERIC = [
    'DAY_OF_MONTH',
    'DAY_OF_WEEK',
    'DISTANCE',
    'DEP_TIME',
    'ARR_TIME',
]
CATEG = [
    'OP_UNIQUE_CARRIER',
    'ORIGIN',
    'DEST',
    'DEP_TIME_BLK',
]

# Independent copy so later feature engineering does not write back into df.
X = df.loc[:, NUMERIC + CATEG].copy()

# One binary target per delay type; both are paired with the same feature frame X.
y_arr = df['ARR_DEL15'].astype(int)
y_dep = df['DEP_DEL15'].astype(int)

In [117]:
import numpy as np

def hhmm_to_minutes_safe(v):
    """Convert an hhmm-encoded clock time to minutes since midnight.

    Parameters
    ----------
    v : int, float, str or missing
        Clock time written as hhmm (e.g. ``1003``, ``1003.0`` or ``"1003.0"``).

    Returns
    -------
    int or float
        Minutes since midnight in ``[0, 1439]``, or ``np.nan`` when ``v`` is
        missing, cannot be parsed as a number, or is not a valid clock time.

    Notes
    -----
    ``2400`` is accepted as midnight and mapped to ``0``. Without this special
    case the hour check below rejects it, so a valid end-of-day timestamp would
    become NaN and later be replaced by an imputed value.
    """
    if pd.isna(v):
        return np.nan

    # Best-effort parsing: plain integers first, then float-like values or
    # strings such as "1003.0". Anything that still fails is treated as missing.
    try:
        v = int(v)
    except Exception:
        try:
            v = int(float(v))
        except Exception:
            return np.nan

    # Midnight written as 24:00 is the same instant on the daily cycle as 00:00.
    if v == 2400:
        return 0

    hh, mm = divmod(v, 100)
    if not (0 <= hh <= 23 and 0 <= mm <= 59):
        return np.nan
    return hh * 60 + mm


# Convert the hhmm clock columns of X to minutes since midnight.
# The values are read from `df` (which this step never modifies) rather than
# from X itself, so re-running the cell cannot push already-converted minutes
# back through the hhmm parser and silently corrupt them. X was built as a
# copy of df's columns, so the assignment lines up on the shared row index.
for col in ["DEP_TIME", "ARR_TIME"]:
    if col in X.columns:
        X[col] = df[col].apply(hhmm_to_minutes_safe)


In [118]:
def encode_cyclical_time(df, col, max_val=1440):
    """Replace column ``col`` of ``df`` with sine/cosine features, in place.

    The column's values are scaled onto a circle of period ``max_val`` and
    stored as ``<col>_sin`` and ``<col>_cos``; the source column is then
    dropped. Nothing happens when ``col`` is not a column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col : str
        Name of the column to encode.
    max_val : int or float, default 1440
        Length of one full cycle, in the same unit as the column.

    Returns
    -------
    None
    """
    if col in df.columns:
        # Position on the cycle, expressed as an angle in radians.
        angle = 2 * np.pi * df[col] / max_val
        df[col + "_sin"] = np.sin(angle)
        df[col + "_cos"] = np.cos(angle)
        df.drop(columns=[col], inplace=True)

# Swap the two time columns for their sin/cos encodings (skipped when absent).
encode_cyclical_time(X, "DEP_TIME")
encode_cyclical_time(X, "ARR_TIME")

# Re-derive the feature lists so they name only columns that X now contains.
NUMERIC = [
    name
    for name in (
        'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE',
        'DEP_TIME_sin', 'DEP_TIME_cos', 'ARR_TIME_sin', 'ARR_TIME_cos',
    )
    if name in X.columns
]

CATEG = [
    name
    for name in ('OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_TIME_BLK')
    if name in X.columns
]

print(f"Numeric: {NUMERIC}")
print(f"Categorical: {CATEG}")
print(f"X shape: {X.shape}")

Numeric: ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'DEP_TIME_sin', 'DEP_TIME_cos', 'ARR_TIME_sin', 'ARR_TIME_cos']
Categorical: ['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_TIME_BLK']
X shape: (599268, 11)


In [119]:
# One 80/20 split shared by both targets, so the arrival and departure models
# are trained and validated on the same rows. Stratification uses the arrival
# label only.
X_train, X_val, y_arr_train, y_arr_val, y_dep_train, y_dep_val = train_test_split(
    X,
    y_arr,
    y_dep,
    test_size=0.2,
    stratify=y_arr,
    random_state=42,
)

print(f"Train shape: {X_train.shape} Val shape: {X_val.shape}")

Train shape: (479414, 11) Val shape: (119854, 11)


In [120]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode, ignoring categories that were not seen during fitting.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
# NOTE(review): this single object is handed to every model pipeline further
# down; presumably each fit re-fits it on the same X_train, which keeps the
# sharing harmless — confirm, or clone it per pipeline if training frames ever differ.
preprocess = ColumnTransformer([
    ('num', numeric_transformer, NUMERIC),
    ('cat', categorical_transformer, CATEG),
])

In [121]:
# Two identically configured logistic-regression pipelines, one per target.
log_reg_arr = Pipeline([
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=1000, n_jobs=-1, class_weight='balanced')),
])
log_reg_dep = Pipeline([
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=1000, n_jobs=-1, class_weight='balanced')),
])

# Train both on the shared training rows.
log_reg_arr.fit(X_train, y_arr_train)
log_reg_dep.fit(X_train, y_dep_train)

# Hard labels and positive-class probabilities on the validation rows.
y_arr_pred = log_reg_arr.predict(X_val)
y_arr_prob = log_reg_arr.predict_proba(X_val)[:, 1]

y_dep_pred = log_reg_dep.predict(X_val)
y_dep_prob = log_reg_dep.predict_proba(X_val)[:, 1]

# Same report layout for each target: heading, per-class metrics, ROC AUC.
for heading, y_true, y_hat, y_score in (
    ("=== ARRIVAL DELAY (ARR_DEL15) ===", y_arr_val, y_arr_pred, y_arr_prob),
    ("\n=== DEPARTURE DELAY (DEP_DEL15) ===", y_dep_val, y_dep_pred, y_dep_prob),
):
    print(heading)
    print(classification_report(y_true, y_hat, digits=3, zero_division=0))
    print("ROC AUC:", roc_auc_score(y_true, y_score))


=== ARRIVAL DELAY (ARR_DEL15) ===
              precision    recall  f1-score   support

           0      0.928     0.718     0.810    103397
           1      0.268     0.648     0.379     16457

    accuracy                          0.709    119854
   macro avg      0.598     0.683     0.594    119854
weighted avg      0.837     0.709     0.751    119854

ROC AUC: 0.7432242720143978

=== DEPARTURE DELAY (DEP_DEL15) ===
              precision    recall  f1-score   support

           0      0.938     0.745     0.831    103557
           1      0.298     0.689     0.417     16297

    accuracy                          0.737    119854
   macro avg      0.618     0.717     0.624    119854
weighted avg      0.851     0.737     0.774    119854

ROC AUC: 0.7760476225629507


In [122]:
# Hyper-parameters shared by the arrival and departure forests.
rf_params = dict(
    n_estimators=200,
    max_depth=20,
    min_samples_leaf=5,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
    class_weight='balanced',
)

# --- Arrival-delay forest ---
rf_arr = Pipeline([
    ('preprocess', preprocess),
    ('clf', RandomForestClassifier(**rf_params)),
])
rf_arr.fit(X_train, y_arr_train)

# Positive-class probability, thresholded at 0.5 to get hard labels.
arr_rf_prob = rf_arr.predict_proba(X_val)[:, 1]
arr_rf_pred = (arr_rf_prob >= 0.5).astype(int)

print("=== Random Forest: ARRIVAL DELAY ===")
print(classification_report(y_arr_val, arr_rf_pred, digits=3, zero_division=0))
print("ROC AUC:", roc_auc_score(y_arr_val, arr_rf_prob))

# --- Departure-delay forest ---
rf_dep = Pipeline([
    ('preprocess', preprocess),
    ('clf', RandomForestClassifier(**rf_params)),
])
rf_dep.fit(X_train, y_dep_train)

dep_rf_prob = rf_dep.predict_proba(X_val)[:, 1]
dep_rf_pred = (dep_rf_prob >= 0.5).astype(int)

print("\n=== Random Forest: DEPARTURE DELAY ===")
print(classification_report(y_dep_val, dep_rf_pred, digits=3, zero_division=0))
print("ROC AUC:", roc_auc_score(y_dep_val, dep_rf_prob))


=== Random Forest: ARRIVAL DELAY ===
              precision    recall  f1-score   support

           0      0.947     0.778     0.854    103397
           1      0.342     0.724     0.465     16457

    accuracy                          0.771    119854
   macro avg      0.644     0.751     0.659    119854
weighted avg      0.864     0.771     0.801    119854

ROC AUC: 0.829627944039631

=== Random Forest: DEPARTURE DELAY ===
              precision    recall  f1-score   support

           0      0.960     0.758     0.847    103557
           1      0.342     0.799     0.479     16297

    accuracy                          0.763    119854
   macro avg      0.651     0.778     0.663    119854
weighted avg      0.876     0.763     0.797    119854

ROC AUC: 0.8603869166767473


In [123]:
# Ratio of negative to positive arrival labels in the training split.
neg, pos = np.bincount(y_arr_train)
scale_pos_weight_arr = neg / pos
print(f"Arrival delay scale_pos_weight: {scale_pos_weight_arr}")

Arrival delay scale_pos_weight: 6.282827975937291


In [124]:
# Ratio of negative to positive departure labels in the training split.
neg, pos = np.bincount(y_dep_train)
scale_pos_weight_dep = neg / pos
print(f"Departure delay scale_pos_weight: {scale_pos_weight_dep}")


Departure delay scale_pos_weight: 6.358618572524942


In [125]:
from xgboost import XGBClassifier

# Hyper-parameters shared by both boosted models; only the class-imbalance
# weight differs between the arrival and departure targets.
xgb_params = dict(
    n_estimators=400,
    learning_rate=0.08,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# --- Arrival-delay model ---
xgb_arr = Pipeline([
    ('preprocess', preprocess),
    ('clf', XGBClassifier(**xgb_params, scale_pos_weight=scale_pos_weight_arr)),
])
xgb_arr.fit(X_train, y_arr_train)

# Positive-class probability, thresholded at 0.5 to get hard labels.
arr_xgb_prob = xgb_arr.predict_proba(X_val)[:, 1]
arr_xgb_pred = (arr_xgb_prob >= 0.5).astype(int)

print("=== XGBoost: ARRIVAL DELAY ===")
print(classification_report(y_arr_val, arr_xgb_pred, digits=3, zero_division=0))
print("ROC AUC:", roc_auc_score(y_arr_val, arr_xgb_prob))

# --- Departure-delay model ---
xgb_dep = Pipeline([
    ('preprocess', preprocess),
    ('clf', XGBClassifier(**xgb_params, scale_pos_weight=scale_pos_weight_dep)),
])
xgb_dep.fit(X_train, y_dep_train)

dep_xgb_prob = xgb_dep.predict_proba(X_val)[:, 1]
dep_xgb_pred = (dep_xgb_prob >= 0.5).astype(int)

print("\n=== XGBoost: DEPARTURE DELAY ===")
print(classification_report(y_dep_val, dep_xgb_pred, digits=3, zero_division=0))
print("ROC AUC:", roc_auc_score(y_dep_val, dep_xgb_prob))


=== XGBoost: ARRIVAL DELAY ===
              precision    recall  f1-score   support

           0      0.962     0.907     0.934    103397
           1      0.571     0.775     0.658     16457

    accuracy                          0.889    119854
   macro avg      0.767     0.841     0.796    119854
weighted avg      0.908     0.889     0.896    119854

ROC AUC: 0.9224476718848502

=== XGBoost: DEPARTURE DELAY ===
              precision    recall  f1-score   support

           0      0.965     0.929     0.947    103557
           1      0.637     0.788     0.704     16297

    accuracy                          0.910    119854
   macro avg      0.801     0.859     0.826    119854
weighted avg      0.921     0.910     0.914    119854

ROC AUC: 0.9433758863068713
