# Avocado Dataset - Silver Layer
This notebook cleans the data, encodes the categorical columns, and saves two versions of the result:
- One for analytics
- One for machine learning

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, year, col, sum as _sum
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [None]:
# Create the Spark session for this notebook, or reuse the one that is
# already running under the same builder.
spark = (
    SparkSession.builder
    .appName("AvocadoProject")
    .getOrCreate()
)

In [None]:
# Read the raw (Bronze) CSV. The first row is used as the header and Spark
# infers each column's type from the data.
# NOTE(review): Spark APIs on Databricks usually address DBFS as
# "dbfs:/mnt/..." or "/mnt/..."; the "/dbfs/..." prefix is normally the
# local-file form. Confirm this path resolves to the intended file.
bronze_csv_path = "/dbfs/mnt/Avocado_Project/Bronze/avocado_dataset.csv"
df = spark.read.csv(bronze_csv_path, inferSchema=True, header=True)

In [None]:
# Give the raw columns readable names. Pairs are (source name, new name) and
# are applied in order; withColumnRenamed leaves the DataFrame unchanged when
# the source column is absent.
column_renames = [
    ("date", "Date"),
    ("average_price", "AveragePrice"),
    ("total_volume", "Volume"),
    ("4046", "SmallHass"),
    ("4225", "LargeHass"),
    ("4770", "XLargeHass"),
    ("total_bags", "TotalBags"),
    ("small_bags", "SmallBags"),
    ("large_bags", "LargeBags"),
    ("x_large_bags", "XLargeBags"),
    ("type", "Type"),
    ("year", "Year"),
    ("geography", "Region"),
]
for source_name, readable_name in column_renames:
    df = df.withColumnRenamed(source_name, readable_name)

In [None]:
# Parse the "Date" column into a real date and recompute "Year" from it.
# The result is written back to "Date" (not "date") so the readable name
# assigned by the rename step is kept. With Spark's default case-insensitive
# column resolution, writing to "date" replaces the column and lower-cases
# its name again; with case-sensitive resolution it would add a second column
# that differs only by case.
# NOTE(review): the pattern assumes the raw values look like "01/31/2020";
# confirm against the Bronze file, and check the null counts afterwards.
df = df.withColumn("Date", to_date(col("Date"), "MM/dd/yyyy"))
df = df.withColumn("Year", year(col("Date")))

In [None]:
# Count the nulls in every column: each null contributes 1 to that column's
# sum, and the result is a single-row DataFrame keyed by column name.
null_count_exprs = [
    _sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df.columns
]
null_counts = df.select(*null_count_exprs)
null_counts.show()

In [None]:
# Remove rows that are exact duplicates across all columns.
df = df.distinct()

In [None]:
# Index the categorical columns ("Type", "Region") and one-hot encode the
# indices to build the ML-ready DataFrame `df_encoded`.
#
# The intermediate index columns live on a separate `df_indexed` instead of
# being written back onto `df`. That way `df`, which is saved as the analytics
# dataset, keeps only the cleaned columns. It also means re-running this cell
# does not trip over "type_index" / "region_index" already existing on the
# input (StringIndexer is expected to reject an existing output column).
indexer_type = StringIndexer(inputCol="Type", outputCol="type_index")
df_indexed = indexer_type.fit(df).transform(df)

indexer_region = StringIndexer(inputCol="Region", outputCol="region_index")
df_indexed = indexer_region.fit(df_indexed).transform(df_indexed)

# Turn both index columns into vector columns in one pass.
encoder = OneHotEncoder(
    inputCols=["type_index", "region_index"],
    outputCols=["type_vec", "region_vec"],
)

df_encoded = encoder.fit(df_indexed).transform(df_indexed)

In [None]:
# The ML dataset keeps only the one-hot vectors: remove the original
# categorical columns and their intermediate index columns.
superseded_columns = ["Type", "Region", "type_index", "region_index"]
df_encoded = df_encoded.drop(*superseded_columns)

In [None]:
# Write both Silver outputs as Parquet, overwriting whatever a previous run
# left at the same location: `df` for analytics, `df_encoded` for ML.
# NOTE(review): Spark APIs on Databricks usually address DBFS as
# "dbfs:/mnt/..." or "/mnt/..."; the "/dbfs/..." prefix is normally the
# local-file form. Confirm these paths land where downstream readers expect.
analytics_path = "/dbfs/mnt/Avocado_Project/Silver/avocado_silver.parquet"
ml_ready_path = "/dbfs/mnt/Avocado_Project/Silver/avocado_silver_ml_ready.parquet"
df.write.parquet(analytics_path, mode="overwrite")
df_encoded.write.parquet(ml_ready_path, mode="overwrite")