# 04 — Generate Streamlit Input Options (data_schema.json)

The Streamlit UI reads `data/data_schema.json` to:
- render numeric inputs with min/max/median
- render dropdowns for categorical columns, together with each value's frequency


In [1]:
# 04 — Generate Streamlit Input Options (data_schema.json)

from pathlib import Path
import json
import sqlite3

import pandas as pd

# Resolve the project root from the kernel's working directory: when the
# notebook is launched from inside the "notebooks" folder, step up one level.
launch_dir = Path.cwd()
PROJECT_ROOT = launch_dir.parent if launch_dir.name == "notebooks" else launch_dir

# The SQLite database (input) and the generated schema (output) both live
# under <project root>/data.
DB_PATH = PROJECT_ROOT.joinpath("data", "airline.db")
SCHEMA_PATH = PROJECT_ROOT.joinpath("data", "data_schema.json")

# Echo the resolved locations so a wrong working directory is spotted early.
print("DB_PATH:", DB_PATH, "exists=", DB_PATH.exists())
print("SCHEMA_PATH:", SCHEMA_PATH)


DB_PATH: c:\Users\nepal\OneDrive\Desktop\airline_satisfaction_appp\data\airline.db exists= True
SCHEMA_PATH: c:\Users\nepal\OneDrive\Desktop\airline_satisfaction_appp\data\data_schema.json


In [2]:
# Fail fast with an actionable message when the database file is missing.
if not DB_PATH.exists():
    missing_db_message = (
        f"Database not found at {DB_PATH}. Run 01_create_database.ipynb first."
    )
    raise FileNotFoundError(missing_db_message)

# Flatten the relational tables into one wide frame: each trip is joined to
# its passenger, service ratings, delay record and satisfaction label.
# These are inner joins, so a trip lacking a matching row in any of the joined
# tables is excluded from the result.
query = '''
SELECT
    p.gender,
    p.customer_type,
    p.age,
    t.type_of_travel,
    t.travel_class,
    t.flight_distance,
    s.inflight_wifi_service,
    s.departure_arrival_time_convenient,
    s.ease_of_online_booking,
    s.gate_location,
    s.food_and_drink,
    s.online_boarding,
    s.seat_comfort,
    s.inflight_entertainment,
    s.on_board_service,
    s.leg_room_service,
    s.baggage_handling,
    s.checkin_service,
    s.inflight_service,
    s.cleanliness,
    d.departure_delay_minutes,
    d.arrival_delay_minutes,
    sat.satisfaction_binary
FROM trip t
JOIN passenger p ON t.passenger_id = p.passenger_id
JOIN service_rating s ON s.trip_id = t.trip_id
JOIN delay d ON d.trip_id = t.trip_id
JOIN satisfaction sat ON sat.trip_id = t.trip_id
;
'''

# Always release the connection, even when the query raises (e.g. a missing
# table or column). Otherwise the handle stays open for the lifetime of the
# kernel and can keep the database file locked.
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Label column: excluded from the schema because it is not a user input.
TARGET = "satisfaction_binary"

# Columns rendered as dropdowns in the UI.
categorical_cols = [
    "gender",
    "customer_type",
    "type_of_travel",
    "travel_class",
]

# Every remaining column (in query order) is treated as numerical.
excluded_cols = {TARGET, *categorical_cols}
numerical_cols = [name for name in df.columns if name not in excluded_cols]

# Container filled in below and finally serialised to data_schema.json.
schema = dict(numerical={}, categorical={})

# Numerical stats
# Each statistic name doubles as the pandas Series method that computes it
# and as the key under which the value is stored in the schema.
summary_stats = ("min", "max", "mean", "median")
for col in numerical_cols:
    # Values that cannot be parsed as numbers become NaN; the pandas
    # reductions below skip NaN by default.
    numeric = pd.to_numeric(df[col], errors="coerce")
    schema["numerical"][col] = {
        stat: float(getattr(numeric, stat)()) for stat in summary_stats
    }

# Categorical unique values + counts
for c in categorical_cols:
    # Label missing values BEFORE casting to str. Casting first can turn
    # NULL/NaN into the literal strings "None"/"nan" (depending on the pandas
    # version), which makes the later fillna a no-op and leaks those strings
    # into the dropdown options.
    vals = df[c].fillna("Unknown").astype(str)
    # value_counts() sorts by descending frequency, so the most common
    # category comes first in both lists below.
    vc = vals.value_counts()
    schema["categorical"][c] = {
        "unique_values": vc.index.tolist(),
        # Cast counts to plain int so json can serialise them.
        "value_counts": {str(k): int(v) for k, v in vc.items()},
    }

# Make sure the output folder exists, then persist the schema as
# pretty-printed JSON.
output_dir = SCHEMA_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)
with SCHEMA_PATH.open("w", encoding="utf-8") as schema_file:
    json.dump(schema, schema_file, indent=2)

print("✅ Wrote schema:", SCHEMA_PATH)

# Quick visual check: one row per numerical column, one column per statistic.
numerical_overview = pd.DataFrame(schema["numerical"]).T
display(numerical_overview.head())


✅ Wrote schema: c:\Users\nepal\OneDrive\Desktop\airline_satisfaction_appp\data\data_schema.json


Unnamed: 0,min,max,mean,median
age,7.0,85.0,39.379706,40.0
flight_distance,31.0,4983.0,1189.448375,843.0
inflight_wifi_service,0.0,5.0,2.729683,3.0
departure_arrival_time_convenient,0.0,5.0,3.060296,3.0
ease_of_online_booking,0.0,5.0,2.756901,3.0
