In [0]:
def replace_missing(input_path, output_path):
    """Impute missing values in a Parquet dataset and save the result.

    Reads the dataset at ``input_path`` through the ambient ``spark``
    session, fills missing values in integer/long/float/double columns with
    the column's approximate median (relative error 0.01), fills missing
    values in string columns with ``"Unknown"``, and writes the result to
    ``output_path`` as Parquet, overwriting whatever is already there.

    Columns of any other type are passed through untouched.

    Args:
        input_path: Location of the Parquet dataset to read.
        output_path: Location to write the cleaned Parquet dataset to.

    Returns:
        None. The cleaned data is written to ``output_path`` and a
        confirmation line is printed.
    """
    df = spark.read.parquet(input_path)

    # Classify columns with DataType.typeName() instead of str(dataType):
    # the string form of a type is not stable across PySpark releases
    # ("IntegerType" vs "IntegerType()"), and a mismatch would silently
    # select no columns at all, leaving the data unchanged.
    numeric_type_names = {"integer", "long", "float", "double"}
    numeric_cols = []
    cat_cols = []
    for field in df.schema.fields:
        type_name = field.dataType.typeName()
        if type_name in numeric_type_names:
            numeric_cols.append(field.name)
        elif type_name == "string":
            cat_cols.append(field.name)

    fill_values = {}

    # Handle numeric columns: fill with median
    if numeric_cols:
        # Passing the whole list computes every median in a single call
        # rather than launching one quantile computation per column.
        medians = df.approxQuantile(numeric_cols, [0.5], 0.01)
        for col_name, quantiles in zip(numeric_cols, medians):
            # No quantile comes back when a column has no usable values
            # (e.g. it is entirely null); there is no median to impute, so
            # such a column is left as-is instead of raising IndexError.
            if quantiles:
                fill_values[col_name] = quantiles[0]

    # Handle categorical columns: fill with "Unknown"
    for col_name in cat_cols:
        fill_values[col_name] = "Unknown"

    # A single fill covers all columns; skip it when nothing needs filling.
    if fill_values:
        df = df.na.fill(fill_values)

    # Save cleaned data
    df.write.mode("overwrite").parquet(output_path)
    print(f"✅ Missing values replaced (numeric → median, categorical → 'Unknown'). Data saved at: {output_path}")


# Manual run: read the feature-engineered dataset and write the imputed copy
# next to it in the same tutorial volume.
input_path, output_path = (
    "/Volumes/workspace/default/tutorial/feature_engineered",
    "/Volumes/workspace/default/tutorial/missing_replaced",
)
replace_missing(input_path=input_path, output_path=output_path)
