In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySpark_Hadoop")\
.master("local[*]")\
.config("spark.sql.autoBroadcastJoinThreshold", "-1")\
.config("spark.sql.adaptive.enabled", False)\
.config("spark.sql.adaptive.coalescePartitions.enabled", False)\
.config("spark.sql.adaptive.skewJoin.enabled", False)\
.getOrCreate()

In [150]:
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [154]:
spark.stop()
print("Spark Session stopped successfully.")

Spark Session stopped successfully.


In [156]:
print("Customers DataFrame")
customer_df = spark.read.csv("file:////home/dominic/Desktop/pythonLearning/csvFiles/practice_Data/customers.csv", header=True, inferSchema=False)
customer_df = customer_df.repartition(4)

print("Sales DataFrame")
sales_df = spark.read.csv("file:////home/dominic/Desktop/pythonLearning/csvFiles/practice_Data/sales.csv", header=True, inferSchema=False)
sales_df = sales_df.repartition(4)

print("Product DataFrame")
products_df = spark.read.csv("file:////home/dominic/Desktop/pythonLearning/csvFiles/practice_Data/products.csv", header=True, inferSchema=False)
products_df.repartition(4)

Customers DataFrame
Sales DataFrame
Product DataFrame


DataFrame[product_id: string, product_name: string, category: string]

In [157]:
sales_df_clean = sales_df.drop("region").withColumnRenamed("customer_id", "customer_sales_id") \
.withColumnRenamed("product_id", "product_sales_id")
sales_df_clean = sales_df_clean.repartition(4)

In [158]:
from pyspark.sql.functions import col

# Explicitly repartition each DF on its own join key
sales_df_clean = sales_df_clean.repartition(4, col("product_sales_id"))
products_df = products_df.repartition(4, col("product_id"))

# Now join — this causes shuffle because Spark needs to align keys
#joined_df = sales_df_clean.join(products_df, sales_df_clean.product_sales_id == products_df.product_id, "inner")


joined_df = sales_df_clean.join(products_df, sales_df_clean.product_sales_id == products_df.product_id, "inner")\
    .join(customer_df, sales_df_clean.customer_sales_id == customer_df.customer_id, "inner")

print("Joined DataFrame")

joined_df = joined_df.repartition(4)
joined_df = joined_df.withColumn("TotalCost", col("unit_price") * col("quantity"))
#joined_df.groupBy("region").count().show()
joined_df.write.mode("overwrite").parquet("file:////home/dominic/Desktop/pythonLearning/csvFiles/practice_Data/joined_data.parquet")

Joined DataFrame


                                                                                

In [120]:
joined_df.explain(True)

== Parsed Logical Plan ==
'Project [order_id#5585, order_date#5586, customer_sales_id#6880, product_sales_id#6887, quantity#5589, unit_price#5590, product_id#5616, product_name#5617, category#5618, customer_id#5562, customer_name#5563, region#5564, ('unit_price * 'quantity) AS TotalCost#6936]
+- Repartition 4, true
   +- Join Inner, (customer_sales_id#6880 = customer_id#5562)
      :- Join Inner, (product_sales_id#6887 = product_id#5616)
      :  :- RepartitionByExpression [product_sales_id#6887], 4
      :  :  +- Repartition 4, true
      :  :     +- Project [order_id#5585, order_date#5586, customer_sales_id#6880, product_id#5588 AS product_sales_id#6887, quantity#5589, unit_price#5590]
      :  :        +- Project [order_id#5585, order_date#5586, customer_id#5587 AS customer_sales_id#6880, product_id#5588, quantity#5589, unit_price#5590]
      :  :           +- Project [order_id#5585, order_date#5586, customer_id#5587, product_id#5588, quantity#5589, unit_price#5590]
      :  :      

In [113]:
df = spark.read.parquet("file:////home/dominic/Desktop/pythonLearning/csvFiles/practice_Data/joined_data.parquet/part-00000-78e92223-8b57-49eb-867b-12eafcb2c87c-c000.snappy.parquet")
print("DataFrame Count: ", df.count())
df.show()

DataFrame Count:  24999
+--------+----------+-----------------+----------------+--------+----------+----------+------------+-----------+-----------+-------------+------+
|order_id|order_date|customer_sales_id|product_sales_id|quantity|unit_price|product_id|product_name|   category|customer_id|customer_name|region|
+--------+----------+-----------------+----------------+--------+----------+----------+------------+-----------+-----------+-------------+------+
| O000643|2023-06-21|            C0343|           P0074|       4|       641|     P0074|  Product_74|Electronics|      C0343| Customer_343|  West|
| O074735|2023-06-29|            C0788|           P0140|       4|      1630|     P0140| Product_140|   Clothing|      C0788| Customer_788| South|
| O091335|2023-03-14|            C0384|           P0149|       2|      1121|     P0149| Product_149|  Furniture|      C0384| Customer_384| South|
| O064830|2023-04-01|            C0838|           P0072|       1|      1243|     P0072|  Product_72|