In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

# Entry point for all DataFrame work in this notebook. getOrCreate() hands back
# an already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("CustomerOrdersJob")
    .getOrCreate()
)


In [2]:
# --- Inputs ---------------------------------------------------------------
# Customer records (CSV) and order records (JSON) read by the load cell.
customers_path = "/opt/spark-apps/input/customers.csv"
orders_path = "/opt/spark-apps/input/orders.json"

# --- Outputs --------------------------------------------------------------
# Destination directories for the enriched orders, one per output format.
output_path_csv = "/tmp/orders_enriched_csv"
output_path_parquet = "/tmp/orders_enriched_parquet"

In [3]:

# Load the two inputs: customers from a CSV that has a header row, orders from JSON.
df_customers = spark.read.csv(customers_path, header=True)
df_orders = spark.read.json(orders_path)

# Attach customer attributes to each order. The inner join keeps only orders
# whose customer_id is also present in the customers file.
df_joined = df_orders.join(df_customers, "customer_id", "inner")

# Bucket every order by amount. Branches are evaluated top-down, so the
# >= 200 test must come before the >= 100 test; anything that satisfies
# neither comparison ends up in the otherwise() branch.
df_enriched = df_joined.withColumn(
    "order_type",
    when(df_joined["amount"] >= 200, "High Value")
    .when(df_joined["amount"] >= 100, "Medium Value")
    .otherwise("Low Value"),
)

# Project once and reuse the same frame for the preview and the export.
df_enriched_op = df_enriched.select("order_id", "name", "amount", "order_type")
df_enriched_op.show()

# Persist the projection as CSV with a header line, replacing any earlier run's output.
df_enriched_op.write.csv(output_path_csv, mode="overwrite", header=True)

+--------+----------+------+------------+
|order_id|      name|amount|  order_type|
+--------+----------+------+------------+
|    5001|  John Doe| 250.5|  High Value|
|    5002|Jane Smith| 145.0|Medium Value|
|    5003|Rita Mehra|389.99|  High Value|
|    5004| Wei Zhang| 89.99|   Low Value|
+--------+----------+------+------------+



In [4]:
# Echo the CSV output directory so it is visible in the notebook output.
output_path_csv


'/tmp/orders_enriched_csv'

In [5]:
# Write the same projected frame as Parquet, replacing any output left by an earlier run.
df_enriched_op.write.parquet(output_path_parquet, mode="overwrite")