In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [3]:
# Inputs: artifacts read by this notebook.
DATA_DIR = Path("data")
P3_CSV = DATA_DIR.joinpath("flight_part3_ready.csv")
P2_META = DATA_DIR.joinpath("flight_part2_meta.json")

# Outputs: written next to the inputs so all artifacts stay together.
OUT_DIR = DATA_DIR
ARR_NPZ = OUT_DIR.joinpath("part4_Xy_arrays.npz")
FEATS_JSON = OUT_DIR.joinpath("part4_feature_names.json")
TRAIN_CSV = OUT_DIR.joinpath("part4_train_matrix.csv")
TEST_CSV = OUT_DIR.joinpath("part4_test_matrix.csv")

In [4]:
# 4.0 Load data
# Validate the input explicitly: an `assert` is stripped when Python runs with -O,
# so a missing file is reported with a real exception instead.
if not P3_CSV.exists():
    raise FileNotFoundError(f"Cannot find Part 3 data at {P3_CSV.resolve()}")

# FL_DATE is parsed into datetimes at load time.
df_features = pd.read_csv(P3_CSV, parse_dates=["FL_DATE"])

In [6]:
#  4.1 Shared names
# Single source of truth for the column roles. Each entry is used when the
# metadata file is absent, or when it is present but lacks that key.
_DEFAULT_ROLES = {
    "label_col":      "DELAY_FLAG",
    "low_card_cols":  ["AIRLINE", "WeekdayName"],
    "high_card_cols": ["ORIGIN_CITY", "DEST_CITY", "ROUTE"],
    "num_cols":       ["DISTANCE", "MONTH_NUM"],
    "date_parts":     ["FL_DATE", "YEAR", "MONTH_NUM", "DAY_OF_MONTH"],
}

_meta_found = P2_META.exists()
if _meta_found:
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(P2_META, "r", encoding="utf-8") as f:
        META = json.load(f)
else:
    META = {}

LABEL_COL      = META.get("label_col", _DEFAULT_ROLES["label_col"])
LOW_CARD_COLS  = META.get("low_card_cols", _DEFAULT_ROLES["low_card_cols"])
HIGH_CARD_COLS = META.get("high_card_cols", _DEFAULT_ROLES["high_card_cols"])
NUM_COLS       = META.get("num_cols", _DEFAULT_ROLES["num_cols"])
DATE_PARTS     = META.get("date_parts", _DEFAULT_ROLES["date_parts"])

# CARRY_ALONG has two different fallbacks, kept as they were:
#   - metadata file present but key missing -> empty list
#   - no metadata file                     -> whichever optional columns exist in the data
if _meta_found:
    CARRY_ALONG = META.get("carry_along", [])
else:
    CARRY_ALONG = [c for c in ["FL_NUMBER", "Month"] if c in df_features.columns]

# Every column that must be non-null for a row to be usable by the model.
ALL_NEEDED_FOR_MODEL = LOW_CARD_COLS + HIGH_CARD_COLS + NUM_COLS + [LABEL_COL]


In [None]:
# Coerce the numeric features and the label to numbers; anything unparseable
# becomes NaN. Rows with a NaN in any model column are dropped BEFORE the label
# is cast to int8, because casting a column that still holds NaN to an integer
# dtype raises.
_numeric_cols = list(dict.fromkeys(NUM_COLS + [LABEL_COL]))  # de-duplicated, order kept
df_features = (
    df_features
    .assign(**{c: pd.to_numeric(df_features[c], errors="coerce") for c in _numeric_cols})
    .dropna(subset=ALL_NEEDED_FOR_MODEL)
    .astype({LABEL_COL: "int8"})
)


In [7]:
# 4.2 Train/Test Split
# 20% of the rows are held out for testing. The split is stratified on the
# label column and uses a fixed random_state so it is repeatable.
_split_kwargs = dict(
    test_size=0.20,
    random_state=42,
    stratify=df_features[LABEL_COL],
)
train_df, test_df = train_test_split(df_features, **_split_kwargs)


In [8]:
def _desc(name, d):
    """Return a one-line summary of frame `d`: its row count and the mean of LABEL_COL."""
    n_rows = len(d)
    rate = d[LABEL_COL].mean()
    return f"{name}: n={n_rows:,}  delay_rate={rate:.3f}"

# Report both parts of the split so their sizes and label rates can be compared.
for _tag, _part in (("TRAIN", train_df), ("TEST ", test_df)):
    print(_desc(_tag, _part))

TRAIN: n=588,899  delay_rate=0.219
TEST : n=147,225  delay_rate=0.219


In [9]:
# 4.3 Encoders
# 4.3.1 Smoothed target mean encoding for HIGH_CARD_COLS (fit on TRAIN only)
def mean_encode(train_df, test_df, col, target, alpha=50):
    """
    Encode a categorical column as a smoothed per-category mean of the target.

    All statistics come from `train_df` only:
      enc = (mean*count + alpha*global) / (count + alpha)
    where `global` is the overall target mean in `train_df`. A value with no
    fitted encoding (e.g. a category absent from `train_df`) gets `global`.

    Returns (train_encoded, test_encoded) as float32 Series.
    """
    prior = train_df[target].mean()
    stats = train_df.groupby(col)[target].agg(["mean", "count"])

    # Shrink each category mean toward the prior; small categories shrink the most.
    weighted_sum = stats["mean"] * stats["count"] + alpha * prior
    lookup = weighted_sum / (stats["count"] + alpha)

    def _encode(frame):
        return frame[col].map(lookup).fillna(prior).astype("float32")

    return _encode(train_df), _encode(test_df)

# Encode every high-cardinality column, fitting on TRAIN and applying to both parts.
# NOTE(review): the TRAIN rows are encoded with statistics that include their own
# labels; consider out-of-fold encoding if this inflates training performance.
_train_enc, _test_enc = {}, {}
for c in HIGH_CARD_COLS:
    _train_enc[c + "_ENC"], _test_enc[c + "_ENC"] = mean_encode(
        train_df, test_df, c, LABEL_COL, alpha=50
    )

# Attach the new columns via assign() so fresh frames are built, rather than
# setting columns on the frames returned by train_test_split (which can emit
# SettingWithCopyWarning on some pandas versions).
train_df = train_df.assign(**_train_enc)
test_df = test_df.assign(**_test_enc)

ENC_HIGH_COLS = [c + "_ENC" for c in HIGH_CARD_COLS]

In [10]:
# 4.3.2 One-hot for LOW_CARD_COLS (fit on TRAIN; transform both)
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older installations.
try:
    OHE_LOW = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.float32)
except TypeError:
    OHE_LOW = OneHotEncoder(handle_unknown="ignore", sparse=False, dtype=np.float32)

# Categories are learned from TRAIN only; handle_unknown="ignore" means a
# category not seen during fit is tolerated at transform time.
OHE_LOW.fit(train_df[LOW_CARD_COLS])
ohe_cols = OHE_LOW.get_feature_names_out(LOW_CARD_COLS).tolist()


In [11]:
# 4.4 Build design matrices
# Numeric block = raw numeric columns followed by the target-encoded columns.
ENC_NUM_COLS = NUM_COLS + ENC_HIGH_COLS

def build_matrix(df_part):
    """
    Turn one part of the split into (X, y).

    X places the ENC_NUM_COLS values (as float32) first, followed by the
    OHE_LOW transform of LOW_CARD_COLS. y is LABEL_COL as int32.
    """
    numeric_block = df_part[ENC_NUM_COLS].to_numpy(np.float32)
    onehot_block = OHE_LOW.transform(df_part[LOW_CARD_COLS])
    features = np.hstack([numeric_block, onehot_block])
    labels = df_part[LABEL_COL].to_numpy(np.int32)
    return features, labels

(X_train, y_train), (X_test, y_test) = (
    build_matrix(part) for part in (train_df, test_df)
)


In [12]:
# Human-readable names for the design-matrix columns: numeric block first,
# then the one-hot columns (the same order build_matrix stacks them in).
FEATURE_NAMES = ENC_NUM_COLS + ohe_cols

print("\nShapes:")
for _x_tag, _x_arr, _y_tag, _y_arr in (
    ("  X_train:", X_train, "  y_train:", y_train),
    ("  X_test :", X_test, "  y_test :", y_test),
):
    print(_x_tag, _x_arr.shape, _y_tag, _y_arr.shape)
print("\nFirst 10 feature names:", FEATURE_NAMES[:10])



Shapes:
  X_train: (588899, 29)   y_train: (588899,)
  X_test : (147225, 29)   y_test : (147225,)

First 10 feature names: ['DISTANCE', 'MONTH_NUM', 'ORIGIN_CITY_ENC', 'DEST_CITY_ENC', 'ROUTE_ENC', 'AIRLINE_Alaska Airlines Inc.', 'AIRLINE_Allegiant Air', 'AIRLINE_American Airlines Inc.', 'AIRLINE_Delta Air Lines Inc.', 'AIRLINE_Endeavor Air Inc.']


In [13]:
# 4.5 Save artifacts for Part 5

# All four arrays go into one compressed .npz, keyed by their variable names.
_arrays = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
np.savez_compressed(ARR_NPZ, **_arrays)

# Column bookkeeping; FEATURE_NAMES keeps the exact column order of X.
_feature_meta = {
    "FEATURE_NAMES": FEATURE_NAMES,
    "LOW_CARD_COLS": LOW_CARD_COLS,
    "HIGH_CARD_COLS": HIGH_CARD_COLS,
    "ENC_HIGH_COLS": ENC_HIGH_COLS,
    "NUM_COLS": NUM_COLS,
    "ENC_NUM_COLS": ENC_NUM_COLS,
    "LABEL_COL": LABEL_COL,
    "DATE_PARTS": DATE_PARTS,
    "CARRY_ALONG": CARRY_ALONG,
}
with open(FEATS_JSON, "w") as f:
    json.dump(_feature_meta, f, indent=2)

In [15]:
# Report the artifact paths, checking each one on disk so the summary cannot
# claim a file that was never written. NOTE(review): none of the cells shown
# here writes TRAIN_CSV / TEST_CSV; if they are flagged below, the cell that
# produced them is missing from the notebook.
_artifacts = [
    ("Arrays", ARR_NPZ),
    ("Feature names", FEATS_JSON),
    ("Train matrix CSV", TRAIN_CSV),
    ("Test  matrix CSV", TEST_CSV),
]

print("\n✅ Saved:")
for _label, _path in _artifacts:
    _note = "" if Path(_path).exists() else "  <-- NOT FOUND on disk"
    print(f"  {_label}: {_path}{_note}")


✅ Saved:
  Arrays: data/part4_Xy_arrays.npz
  Feature names: data/part4_feature_names.json
  Train matrix CSV: data/part4_train_matrix.csv
  Test  matrix CSV: data/part4_test_matrix.csv
