pip install pandas scikit-learn xgboost lightgbm matplotlib seaborn


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
# 1. Load dataset -- only a single year's file is read, for simplicity.
df = pd.read_csv(filepath_or_buffer="2015.csv")

In [None]:
# 2. Keep only the columns that feed target creation, feature engineering
#    and the model itself; everything else in the CSV is discarded.
selected_columns = [
    'MONTH',
    'DAY_OF_WEEK',
    'OP_UNIQUE_CARRIER',
    'ORIGIN',
    'DEST',
    'CRS_DEP_TIME',
    'DISTANCE',
    'DEP_DELAY',
    'ARR_DELAY',
]
df = df[selected_columns]

In [None]:
# 3. Target creation
# Rows missing either delay value are dropped. The explicit .copy() detaches
# the result from the frame it was sliced from, so the column assignment below
# writes to an independent DataFrame instead of a possible view (the situation
# pandas reports with SettingWithCopyWarning).
df = df.dropna(subset=['DEP_DELAY', 'ARR_DELAY']).copy()

# Binary label: 1 when the arrival delay is strictly greater than 15, else 0.
df['Delayed'] = (df['ARR_DELAY'] > 15).astype(int)

In [None]:
#  4. Feature Engineering 
def time_of_day(dep_time):
    """Map a numeric clock time to a coarse part-of-day label.

    The value is converted to an integer, zero-padded to four characters,
    and its first two characters are read as the hour (e.g. 930 -> "0930"
    -> hour 9).

    Args:
        dep_time: Numeric time value, or a missing value (NaN / None).

    Returns:
        ``np.nan`` when ``dep_time`` is missing; otherwise one of
        "Morning" (hour 5-11), "Afternoon" (hour 12-16),
        "Evening" (hour 17-20) or "Night" (any other hour).
    """
    if pd.isna(dep_time):
        return np.nan

    padded = f"{int(dep_time):04d}"
    hour = int(padded[:2])

    # Night wraps around midnight, so rule it out first; the remaining
    # buckets can then be tested with a single upper bound each.
    if hour < 5 or hour >= 21:
        return "Night"
    if hour < 12:
        return "Morning"
    if hour < 17:
        return "Afternoon"
    return "Evening"

# Derive the part-of-day bucket from the CRS_DEP_TIME column, one value at a time.
df['DEP_HOUR_BUCKET'] = df['CRS_DEP_TIME'].map(time_of_day)

In [None]:
# 5. Model inputs and label
target = 'Delayed'
features = [
    'MONTH',
    'DAY_OF_WEEK',
    'OP_UNIQUE_CARRIER',
    'ORIGIN',
    'DEST',
    'DISTANCE',
    'DEP_DELAY',
    'DEP_HOUR_BUCKET',
]

# X holds the feature columns, y the binary label column.
X, y = df[features], df[target]

In [None]:
# 6. Preprocessing pipelines
# OneHotEncoder is imported here because the file's top-level
# sklearn.preprocessing import only brings in StandardScaler and LabelEncoder;
# without this line the encoder step below raises NameError.
from sklearn.preprocessing import OneHotEncoder

categorical = ['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_HOUR_BUCKET']
numerical = ['MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'DEP_DELAY']

# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical),
    ('cat', cat_pipeline, categorical)
])


In [None]:
# 7. Hold out 20% of the rows as a test set. Stratifying on y keeps the
#    class proportions the same in both splits; the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 8. Base models
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# argument that newer releases no longer recognise, and this file installs
# xgboost without pinning a version. The label is already integer 0/1, so no
# label encoding is needed.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

lgbm = LGBMClassifier(random_state=42)

# Serves as the final (meta) estimator of the stacking ensemble.
log_reg = LogisticRegression()

In [None]:
# 9. Stacked ensemble: the three tree-based models are the base learners and
#    logistic regression combines their outputs.
stacking_model = StackingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('lgbm', lgbm)],
    final_estimator=log_reg,
    # With passthrough enabled the final estimator is trained on the input
    # features as well as on the base learners' predictions.
    passthrough=True,
)

# 10. Chain preprocessing and the stacked classifier into a single estimator,
#     so the same transformations are applied at fit and predict time.
full_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', stacking_model),
])

# 11. Fit the whole pipeline on the training split.
full_pipeline.fit(X_train, y_train)


In [None]:
# 12. Evaluate on the held-out test split.
y_pred = full_pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print("\n📊 Classification Report:\n", report)

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
# with integer cell labels.
conf_mat = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_mat, annot=True, fmt='d', cmap='coolwarm')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()