In [0]:
from pyspark.sql.functions import col, year, month, dayofmonth, hour
from pyspark.ml.feature import StringIndexer

def feature_engineering(input_path, output_path):
    """Derive date-part features from a CSV file and save the result as Parquet.

    The CSV is read with a header row and schema inference. ``Issued_date``
    is cast to a timestamp, expanded into ``Issued_year``, ``Issued_month``,
    ``Issued_day`` and ``Issued_hour`` columns, and then removed. The
    ``Unit_ID``, ``Violation_ID`` and ``Tract`` columns are dropped when
    present, and the frame is written to ``output_path``, overwriting any
    existing output.

    Relies on a ``spark`` session object that is not defined in this
    function; it must already exist in the enclosing (e.g. notebook) scope.

    Args:
        input_path: Location of the source CSV file.
        output_path: Destination for the Parquet output.
    """
    reader = spark.read.option("header", "true").option("inferSchema", "true")
    frame = reader.csv(input_path)

    # Cast first so the date-part extractors below operate on a timestamp.
    ts_col = "Issued_date"
    frame = frame.withColumn(ts_col, col(ts_col).cast("timestamp"))

    # (new column name, extractor) pairs, applied in this order.
    date_parts = (
        ("Issued_year", year),
        ("Issued_month", month),
        ("Issued_day", dayofmonth),
        ("Issued_hour", hour),
    )
    for part_name, extract in date_parts:
        frame = frame.withColumn(part_name, extract(ts_col))
    frame = frame.drop(ts_col)

    # Remove identifier-like columns, but only those actually in the frame
    # (exact, case-sensitive name match).
    unwanted = ['Unit_ID', 'Violation_ID', 'Tract']
    present = [name for name in unwanted if name in frame.columns]
    if present:
        frame = frame.drop(*present)

    # NOTE: string columns are written out unchanged. An earlier step that
    # replaced each string column with its StringIndexer index
    # (handleInvalid='keep') is currently disabled.

    writer = frame.write.mode("overwrite")
    writer.parquet(output_path)
    print(f"✅ Feature engineered data saved at: {output_path}")

# Manual invocation: process the tutorial sample CSV and write the result
# to the feature_engineered output location.
input_path = "/Volumes/workspace/default/tutorial/reduced_sample.csv"
output_path = "/Volumes/workspace/default/tutorial/feature_engineered"
feature_engineering(input_path=input_path, output_path=output_path)
