# 03 – Visuals (Task 3)
Run cells top-to-bottom after training a model with `scripts/train_model.py`.


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split

# Output directory for every chart saved by this notebook; created on demand.
charts_dir = Path('../outputs/charts')
charts_dir.mkdir(parents=True, exist_ok=True)

# Load the raw CSV, keep only the four columns used by the cells below, and
# drop any row that has a missing value in one of them.
df = (
    pd.read_csv('../data/raw/Airline_Delay_Cause.csv')
    [['arr_delay', 'month', 'carrier', 'airport']]
    .dropna()
    .copy()
)
# Clamp arr_delay to [-60, 360] so extreme values do not stretch the plots.
df['arr_delay'] = df['arr_delay'].clip(lower=-60, upper=360)

# Delay distribution
# Histogram of the clipped arr_delay values, saved as delay_distribution.png.
plt.figure(figsize=(8, 5))
df['arr_delay'].hist(bins=60)
plt.xlabel('Minutes')
plt.ylabel('Frequency')
plt.title('Delay Distribution (Minutes)')
plt.tight_layout()
plt.savefig(charts_dir / 'delay_distribution.png', dpi=150)

In [None]:
# Average delay by month
# One bar per distinct `month` value, showing the mean of arr_delay for that
# month; the figure is saved as avg_delay_by_month.png in charts_dir.
monthly_mean = df.groupby('month')['arr_delay'].mean()

plt.figure(figsize=(8, 5))
monthly_mean.plot.bar()
plt.ylabel('Minutes')
plt.title('Average Delay by Month')
plt.tight_layout()
plt.savefig(charts_dir / 'avg_delay_by_month.png', dpi=150)

In [None]:
# Predicted vs Actual (requires outputs/model/fd_model.pkl)
# Scatter of model predictions against actual arr_delay on a 20% hold-out
# split, with a dashed y = x reference line. Prints a hint instead of plotting
# when the model file is absent.
model_path = Path('../outputs/model/fd_model.pkl')
if model_path.exists():
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    pipe = joblib.load(model_path)
    # Rename the CSV columns to the feature names passed to the pipeline.
    # NOTE(review): presumably these are the names used at training time, and
    # the target is labelled 'DEP_DELAY' although it holds arr_delay --
    # confirm against scripts/train_model.py.
    X = df[['month', 'carrier', 'airport']].rename(
        columns={'carrier': 'OP_UNIQUE_CARRIER', 'airport': 'ORIGIN', 'month': 'MONTH'}
    )
    y = df['arr_delay'].rename('DEP_DELAY')
    # Fixed random_state keeps the hold-out rows the same on every run.
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
    preds = pipe.predict(Xte)

    plt.figure(figsize=(6, 6))
    plt.scatter(yte, preds, s=10, alpha=0.6)
    # Shared axis range for the reference diagonal. Vectorised reductions
    # replace builtin min()/max(), which iterate element by element in Python.
    lims = [
        min(yte.min(), np.min(preds)),
        max(yte.max(), np.max(preds)),
    ]
    plt.plot(lims, lims, '--')
    plt.xlabel('Actual Delay (min)')
    plt.ylabel('Predicted Delay (min)')
    plt.title('Predicted vs. Actual Delay')
    plt.tight_layout()
    plt.savefig(charts_dir / 'pred_vs_actual.png', dpi=150)
else:
    print('Model not found. Run scripts/train_model.py first.')

In [None]:
# Feature importance (only if RandomForest is best)
# Horizontal bar chart of the 20 largest feature_importances_ values of the
# pipeline's 'model' step. Prints a message instead of plotting when the model
# file is missing, the model has no feature_importances_, or the reconstructed
# feature names do not line up with the importances.
model_path = Path('../outputs/model/fd_model.pkl')
if model_path.exists():
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    pipe = joblib.load(model_path)
    model = pipe.named_steps['model']
    if hasattr(model, 'feature_importances_'):
        prep = pipe.named_steps['prep']
        ohe = prep.named_transformers_['cat']
        cat_names = list(ohe.get_feature_names_out(['OP_UNIQUE_CARRIER', 'ORIGIN']))
        # NOTE(review): assumes the 'prep' step emits MONTH first, followed by
        # the one-hot columns -- confirm against the transformer order in
        # scripts/train_model.py, otherwise the bar labels are shifted.
        feat_names = ['MONTH'] + cat_names
        importances = model.feature_importances_
        if len(feat_names) != len(importances):
            # A length mismatch means the labels cannot be trusted; skip the
            # chart rather than save mislabelled bars or hit an IndexError.
            print(
                f'Feature names ({len(feat_names)}) do not match importances '
                f'({len(importances)}); skipping feature-importance plot.'
            )
        else:
            # argsort is ascending, so the last 20 indices are the largest
            # importances; barh then draws the biggest bar at the top.
            idx = np.argsort(importances)[-20:]
            plt.figure(figsize=(8, 6))
            plt.barh(np.array(feat_names)[idx], importances[idx])
            plt.title('Top Feature Importances (RandomForest)')
            plt.tight_layout()
            plt.savefig(charts_dir / 'feature_importance.png', dpi=150)
    else:
        print('Current model does not support feature_importances_.')
else:
    print('Model not found.')