In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

In [4]:
# Create (or reuse) the Spark session for this data-preparation job.
spark = SparkSession.builder \
    .appName("DataPreparation") \
    .getOrCreate()

try:
    # Read the CSV file from the MinIO bucket and load it into a DataFrame.
    # The first line of the file is treated as the header row.
    df = spark.read.format("csv") \
        .option("header", "true") \
        .load("s3a://compteurbucket/compteurs.csv")

    # Clean the measurements:
    #   - power_factor: keep values >= 0; anything else (negative, or a
    #     comparison that does not evaluate to true) is replaced with 0.
    #   - compteur_id, voltage, current: replace nulls with 0.
    preprocessed_df = df \
        .withColumn('power_factor', when(df['power_factor'] >= 0, df['power_factor']).otherwise(0)) \
        .withColumn('compteur_id', when(df['compteur_id'].isNull(), 0).otherwise(df['compteur_id'])) \
        .withColumn('voltage', when(df['voltage'].isNull(), 0).otherwise(df['voltage'])) \
        .withColumn('current', when(df['current'].isNull(), 0).otherwise(df['current']))

    # Write the *preprocessed* data to Parquet. Writing `df` here would
    # persist the raw input and silently discard the cleaning done above.
    preprocessed_df.write.format("parquet") \
        .mode("overwrite") \
        .save("s3a://compteurbucket/output-compteurdata.parquet")
finally:
    # Always stop the Spark session, even if the read or write fails.
    spark.stop()