# 01 – Quick Check (BTS Flight Delays) – v2

This version is more robust and will help you map column names if your CSV uses slightly different headers.

## What this notebook does
1. Loads one month of BTS On-Time Performance data
2. Detects/validates the needed columns with fallbacks
3. Fits a simple Multiple Linear Regression
4. Prints **R²** and **MSE**
5. Saves two quick charts to `../outputs`


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
# --- Configuration -----------------------------------------------------------
# Both paths are relative to the directory the notebook kernel runs in.
DATA_PATH = Path("../data/raw/On_Time_Reporting_2023_01.csv")
OUTPUT_DIR = Path("../outputs")

# parents=True keeps this cell working if OUTPUT_DIR is later pointed at a
# nested folder whose parents do not exist yet; exist_ok=True makes the cell
# safe to re-run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Print the absolute path so a wrong working directory is obvious before the
# CSV is read.
print(DATA_PATH.resolve())


In [None]:
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype columns in wide CSVs.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Sanity check: overall size plus the first 30 header names, so the column
# mapping step can be adjusted if the headers differ.
print('Shape:', df.shape)
print('Sample columns:', df.columns[:30].tolist())

# Last expression of the cell -> rendered as a rich table preview.
df.head(n=3)

## Column mapping helper
We need these logical fields:
- `DEP_DELAY` (departure delay minutes) – fallbacks: `DEP_DELAY_NEW`, `DEPARTURE_DELAY`
- `MONTH` (1–12)
- `OP_UNIQUE_CARRIER` (carrier code) – fallbacks: `OP_CARRIER`, `CARRIER`
- `ORIGIN` (origin airport code) – fallbacks: `ORIGIN_AIRPORT`, `ORIGIN_AIRPORT_ID`


In [None]:
def choose_col(df, options, required=True):
    """Return the label of the first column in ``df`` that matches ``options``.

    Matching is done in two passes so that every lookup that succeeded with
    exact matching keeps returning the same column:

    1. Exact match, trying ``options`` in the order given.
    2. Only if no option matched exactly: a forgiving match that ignores
       letter case and surrounding whitespace, so a header such as
       ``" Dep_Delay"`` satisfies the candidate ``"DEP_DELAY"``.

    Parameters
    ----------
    df : object exposing a ``columns`` attribute (e.g. a pandas DataFrame)
        The table whose headers are searched.
    options : iterable of str, or a single str
        Candidate column names in priority order. A bare string is treated as
        a single candidate rather than being iterated character by character.
    required : bool, default True
        When True, raise if no candidate matches; when False, return ``None``.

    Returns
    -------
    The matching label exactly as it appears in ``df.columns``, or ``None``
    when nothing matched and ``required`` is False.

    Raises
    ------
    KeyError
        If nothing matched and ``required`` is True.
    """
    candidates = [options] if isinstance(options, str) else list(options)
    columns = list(df.columns)

    # Pass 1: exact header match, honouring the priority order of the candidates.
    for c in candidates:
        if c in columns:
            print(f"Using column: {c}")
            return c

    # Pass 2: case- and whitespace-insensitive match. Only reached when no
    # exact match exists, so it can never override a previously valid result.
    def _normalise(label):
        return str(label).strip().upper()

    by_normalised = {}
    for col in columns:
        # setdefault keeps the first column if several normalise to the same key.
        by_normalised.setdefault(_normalise(col), col)

    for c in candidates:
        key = _normalise(c)
        if key in by_normalised:
            actual = by_normalised[key]
            print(f"Using column: {actual} (matched {c!r} ignoring case/whitespace)")
            return actual

    if required:
        raise KeyError(f"None of the columns {options} were found. Adjust the list to match your CSV headers.")
    return None
# Resolve each logical field to the header actually present in this CSV.
# Candidate names are listed in priority order and every field is mandatory,
# so a missing one stops the notebook here with a KeyError.
col_dep_delay, col_month, col_carrier, col_origin = [
    choose_col(df, candidates, required=True)
    for candidates in (
        ["DEP_DELAY", "DEP_DELAY_NEW", "DEPARTURE_DELAY"],
        ["MONTH"],
        ["OP_UNIQUE_CARRIER", "OP_CARRIER", "CARRIER"],
        ["ORIGIN", "ORIGIN_AIRPORT", "ORIGIN_AIRPORT_ID"],
    )
]


In [None]:
# Source headers picked by the column-mapping step, in the order they are
# renamed to the canonical names used by the rest of the notebook.
cols = [col_dep_delay, col_carrier, col_origin, col_month]

df_small = (
    df.loc[:, cols]
    .rename(columns=dict(zip(cols, ["DEP_DELAY", "OP_UNIQUE_CARRIER", "ORIGIN", "MONTH"])))
    # Drop rows missing the target or any predictor.
    .dropna(subset=["DEP_DELAY", "OP_UNIQUE_CARRIER", "ORIGIN", "MONTH"])
    # Keep delays from 60 minutes early to 360 minutes late (bounds inclusive).
    .loc[lambda frame: frame["DEP_DELAY"].between(-60, 360)]
)

print("Clean shape:", df_small.shape)
df_small.head(n=3)


In [None]:
# Target and predictors, using the canonical column names from the cleaning step.
y = df_small["DEP_DELAY"]
X = df_small[["MONTH", "OP_UNIQUE_CARRIER", "ORIGIN"]]
num_features = ["MONTH"]
cat_features = ["OP_UNIQUE_CARRIER", "ORIGIN"]

# MONTH is passed through unchanged; carrier and origin are one-hot encoded.
# handle_unknown="ignore" lets prediction proceed when the test split contains
# a category that was absent from the training split.
# NOTE(review): sparse_output=False makes the encoder return a dense array;
# with many distinct origins and a full month of rows this may use a lot of
# memory -- consider the sparse default if that becomes a problem.
pre = ColumnTransformer([
    ("num", "passthrough", num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
])

# Preprocessing and regression live in one pipeline so the encoder is fitted
# on the training split only.
model = Pipeline([
    ("prep", pre),
    ("linreg", LinearRegression()),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. train_test_split comes from the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
preds = model.predict(X_test)

# Evaluate on the held-out rows only.
r2 = r2_score(y_test, preds)
mse = mean_squared_error(y_test, preds)
print({"R2": r2, "MSE": mse})


In [None]:
# Chart 1: distribution of departure delays.
# An explicit figure/axes pair is created for each chart so the plot never
# depends on whichever figure happens to be current in the kernel.
fig, ax = plt.subplots(figsize=(6, 4))
df_small["DEP_DELAY"].plot(kind="hist", bins=60, ax=ax, title="Departure Delay Distribution")
ax.set_xlabel("Minutes")
fig.tight_layout()
# Save through OUTPUT_DIR (created in the configuration cell) so that changing
# the output folder there is honoured here as well.
fig.savefig(OUTPUT_DIR / "delay_distribution.png", dpi=150)
plt.show()

# Chart 2: mean departure delay for each MONTH value present in the data.
fig, ax = plt.subplots(figsize=(6, 4))
df_small.groupby("MONTH")["DEP_DELAY"].mean().plot(kind="bar", ax=ax, title="Avg Departure Delay by Month")
ax.set_ylabel("Minutes")
fig.tight_layout()
fig.savefig(OUTPUT_DIR / "avg_delay_by_month.png", dpi=150)
plt.show()

print(f"Saved charts to {OUTPUT_DIR}")
