In [21]:
import json
from pathlib import Path
import pandas as pd


In [26]:
# Directory holding the intermediate artifacts read and written by this notebook.
DATA_DIR = Path("data")

# Inputs: Part 2 metadata, the prepared Part 2 frame (CSV / Parquet), and the raw CSV
# used to rebuild the frame when neither prepared file exists.
META_PATH = DATA_DIR.joinpath("flight_part2_meta.json")
CSV_P2 = DATA_DIR.joinpath("flight_part2_ready.csv")
PARQUET_P2 = DATA_DIR.joinpath("flight_part2_ready.parquet")
RAW_CSV = Path("flight_1yr.csv")

# Output: the sanitized feature table saved in step 3.5.
OUT_P3 = DATA_DIR.joinpath("flight_part3_ready.csv")


In [27]:
# Make sure the artifact directory exists before anything is written into it;
# an already-existing directory is not an error.
DATA_DIR.mkdir(exist_ok=True, parents=True)

In [28]:
# ---------- 3.0 Load df_part2 or reconstruct if needed ----------
def _build_from_raw():
    """Rebuild the Part 2 frame from the raw CSV when no prepared file is available.

    Reads ``RAW_CSV`` and adds the derived columns DELAY_FLAG, ROUTE,
    WeekdayName (only if absent), YEAR, MONTH_NUM and DAY_OF_MONTH. FL_DATE and
    DISTANCE are coerced to datetime / numeric; unparseable values become missing.

    Returns:
        pandas.DataFrame: the raw columns plus the derived ones.

    Raises:
        AssertionError: if ``RAW_CSV`` does not exist.
    """
    assert RAW_CSV.exists(), f"Cannot find {RAW_CSV.resolve()}"
    d = pd.read_csv(RAW_CSV)
    # Create columns as in Part 2
    arr_delay = pd.to_numeric(d["ARR_DELAY"], errors="coerce")
    # A missing or unparseable ARR_DELAY has no known label. `NaN >= 15` is False,
    # which would silently mark those flights as on time, so keep them as <NA>
    # (nullable Int8); the NA filter in step 3.3 then drops them.
    d["DELAY_FLAG"] = (arr_delay >= 15).astype("Int8").where(arr_delay.notna())
    d["ROUTE"] = d["ORIGIN_CITY"].astype(str) + " \u2192 " + d["DEST_CITY"].astype(str)
    d["FL_DATE"] = pd.to_datetime(d["FL_DATE"], errors="coerce")
    # WeekdayName is one of the default model columns; derive it when the raw file
    # does not carry it so the rebuilt frame passes the column validation in 3.2.
    if "WeekdayName" not in d.columns:
        d["WeekdayName"] = d["FL_DATE"].dt.day_name()
    # Nullable integer dtypes so rows with an unparseable date keep <NA> parts.
    d["YEAR"] = d["FL_DATE"].dt.year.astype("Int16")
    d["MONTH_NUM"] = d["FL_DATE"].dt.month.astype("Int8")
    d["DAY_OF_MONTH"] = d["FL_DATE"].dt.day.astype("Int8")
    d["DISTANCE"] = pd.to_numeric(d["DISTANCE"], errors="coerce")
    return d


In [29]:
# Reuse df_part2 if it is already defined in this session; otherwise load it,
# preferring Parquet, then CSV, and rebuilding from the raw file as a last resort.
if "df_part2" not in globals():
    df_part2 = (
        pd.read_parquet(PARQUET_P2)
        if PARQUET_P2.exists()
        else pd.read_csv(CSV_P2, parse_dates=["FL_DATE"])
        if CSV_P2.exists()
        else _build_from_raw()
    )


In [30]:
# 3.1 Shared names
# Column-role lists used by the rest of the notebook. Values come from the Part 2
# metadata JSON when it exists; any key it lacks (or every key, when there is no
# metadata file) falls back to the single set of defaults below.
_DEFAULT_NAMES = {
    "label_col": "DELAY_FLAG",
    "low_card_cols": ["AIRLINE", "WeekdayName"],
    "high_card_cols": ["ORIGIN_CITY", "DEST_CITY", "ROUTE"],
    "num_cols": ["DISTANCE", "MONTH_NUM"],
    "date_parts": ["FL_DATE", "YEAR", "MONTH_NUM", "DAY_OF_MONTH"],
}

if META_PATH.exists():
    # Explicit UTF-8: JSON is UTF-8 by specification, and the platform default
    # encoding is locale-dependent.
    with open(META_PATH, "r", encoding="utf-8") as f:
        _meta = json.load(f)
    # With metadata present, carry along only what it lists (nothing if absent).
    CARRY_ALONG = _meta.get("carry_along", [])
else:
    _meta = {}
    # Without metadata, carry along these optional columns when the frame has them.
    CARRY_ALONG = [c for c in ["FL_NUMBER", "Month"] if c in df_part2.columns]

LABEL_COL       = _meta.get("label_col", _DEFAULT_NAMES["label_col"])
LOW_CARD_COLS   = _meta.get("low_card_cols", _DEFAULT_NAMES["low_card_cols"])
HIGH_CARD_COLS  = _meta.get("high_card_cols", _DEFAULT_NAMES["high_card_cols"])
NUM_COLS        = _meta.get("num_cols", _DEFAULT_NAMES["num_cols"])
DATE_PARTS      = _meta.get("date_parts", _DEFAULT_NAMES["date_parts"])


In [31]:
# 3.2 Validate columns & define keep lists
# Columns the model needs: categorical features, numeric features, then the label.
ALL_NEEDED_FOR_MODEL = [*LOW_CARD_COLS, *HIGH_CARD_COLS, *NUM_COLS, LABEL_COL]

# Everything carried forward, de-duplicated while preserving first-seen order.
ALL_KEEP = list(dict.fromkeys([*ALL_NEEDED_FOR_MODEL, *DATE_PARTS, *CARRY_ALONG]))

# Fail early if any expected column is absent from the loaded frame.
missing = [col for col in ALL_KEEP if col not in df_part2.columns]
assert not missing, f"Missing columns in df_part2: {missing}"


In [32]:
# 3.3 Drop rows with critical NA & enforce dtypes
before_n = len(df_part2)

df_features = (
    df_part2
    # Rows missing any model column are unusable.
    .dropna(subset=ALL_NEEDED_FOR_MODEL)
    # Coerce numeric features; values that cannot be parsed become NaN ...
    .assign(**{
        col: (lambda frame, col=col: pd.to_numeric(frame[col], errors="coerce"))
        for col in NUM_COLS
    })
    # ... and rows where that coercion produced NaN are dropped as well.
    .dropna(subset=NUM_COLS)
    # Store the label as a compact int8.
    .assign(**{
        LABEL_COL: lambda frame: pd.to_numeric(frame[LABEL_COL], errors="coerce").astype("int8")
    })
)

after_n = len(df_features)

In [33]:
# Share of input rows that survived the NA filtering. Guard the ratio so an empty
# input frame reports nan instead of raising ZeroDivisionError.
_kept_share = after_n / before_n if before_n else float("nan")
print(f"Rows kept for modeling: {after_n:,} / {before_n:,}  ({_kept_share:.2%})")
print(f"Delay rate (kept rows): {df_features[LABEL_COL].mean():.4f}")


Rows kept for modeling: 736,124 / 736,124  (100.00%)
Delay rate (kept rows): 0.2187


In [34]:
# 3.4 Diagnostics
# Fraction of missing values per model column, largest first.
print("\nMissingness (key columns, top 10):")
print(
    df_features[ALL_NEEDED_FOR_MODEL]
    .isna()
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

# Number of distinct values in each categorical feature.
print("\nCardinalities:")
for col in LOW_CARD_COLS + HIGH_CARD_COLS:
    n_unique = df_features[col].nunique()
    print(f"  {col}: {n_unique}")



Missingness (key columns, top 10):
AIRLINE        0.0
WeekdayName    0.0
ORIGIN_CITY    0.0
DEST_CITY      0.0
ROUTE          0.0
DISTANCE       0.0
MONTH_NUM      0.0
DELAY_FLAG     0.0
dtype: float64

Cardinalities:
  AIRLINE: 17
  WeekdayName: 7
  ORIGIN_CITY: 359
  DEST_CITY: 358
  ROUTE: 5997


In [36]:
# 3.5 Keep only the columns we carry forward & save
# Restrict the frame to ALL_KEEP (in that order), then write it without the index.
df_features = df_features.loc[:, ALL_KEEP].copy()
df_features.to_csv(OUT_P3, index=False)
print("\n Saved sanitized features to:", OUT_P3)



 Saved sanitized features to: data/flight_part3_ready.csv


In [37]:
# 3.6 Preview
# Build the preview from the shared column lists (not a hard-coded copy of the
# high-cardinality names) so it follows any metadata override, and de-duplicate
# so a name present in several lists is not displayed twice.
_preview = list(dict.fromkeys(["FL_DATE"] + LOW_CARD_COLS + HIGH_CARD_COLS + NUM_COLS + [LABEL_COL]))
# FL_DATE is only shown when it was actually carried forward.
_preview = [c for c in _preview if c in df_features.columns]
display(df_features[_preview].head(10))


Unnamed: 0,FL_DATE,AIRLINE,WeekdayName,ORIGIN_CITY,DEST_CITY,ROUTE,DISTANCE,MONTH_NUM,DELAY_FLAG
0,2022-11-19,Delta Air Lines Inc.,Saturday,"Minneapolis, MN","Seattle, WA","Minneapolis, MN → Seattle, WA",1399,11,0
1,2023-03-06,Delta Air Lines Inc.,Monday,"Minneapolis, MN","San Francisco, CA","Minneapolis, MN → San Francisco, CA",1589,3,1
2,2023-06-11,American Airlines Inc.,Sunday,"Washington, DC","Boston, MA","Washington, DC → Boston, MA",399,6,0
3,2023-02-12,Spirit Air Lines,Sunday,"Houston, TX","Los Angeles, CA","Houston, TX → Los Angeles, CA",1379,2,0
4,2022-11-12,Delta Air Lines Inc.,Saturday,"Grand Rapids, MI","Minneapolis, MN","Grand Rapids, MI → Minneapolis, MN",408,11,0
5,2022-09-06,American Airlines Inc.,Tuesday,"Dallas/Fort Worth, TX","Pensacola, FL","Dallas/Fort Worth, TX → Pensacola, FL",604,9,0
6,2023-04-15,Republic Airline,Saturday,"Minneapolis, MN","Newark, NJ","Minneapolis, MN → Newark, NJ",1008,4,1
7,2023-01-03,Allegiant Air,Tuesday,"Concord, NC","Sanford, FL","Concord, NC → Sanford, FL",457,1,0
8,2023-07-23,United Air Lines Inc.,Sunday,"Austin, TX","Houston, TX","Austin, TX → Houston, TX",140,7,0
9,2022-10-08,American Airlines Inc.,Saturday,"Dallas/Fort Worth, TX","Tucson, AZ","Dallas/Fort Worth, TX → Tucson, AZ",813,10,0
