Creating spark session

In [0]:
from pyspark.sql import SparkSession

# Obtain a session for this notebook under the application name "Exercise_1".
spark = (
    SparkSession.builder
    .appName("Exercise_1")
    .getOrCreate()
)

# Bare expression so the notebook renders the session object as the cell output.
spark

<pyspark.sql.connect.session.SparkSession at 0x7f1968498c10>

##  Part 2: Spark Tasks

1. Ingest the CSV files into two PySpark DataFrames

In [0]:
# Both CSV files are read with their first row treated as the header and with
# column types inferred from the data instead of being declared up front.
customer_df = (
    spark.read.format("csv")
    .options(inferSchema="true", header="true")
    .load('/Volumes/ashwin_harish/default/exercises/customers.csv')
)

order_df = (
    spark.read.format("csv")
    .options(inferSchema="true", header="true")
    .load('/Volumes/ashwin_harish/default/exercises/orders.csv')
)
                                

2. Infer schema and print the schema for both


In [0]:
# Print a heading followed by the inferred schema, customers first and then orders.
for heading, frame in (
    ("\n Customer Dataframe Schema", customer_df),
    ("\n Order dataframe Schema", order_df),
):
    print(heading)
    frame.printSchema()


 Customer Dataframe Schema
root
 |-- customerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)


 Order dataframe Schema
root
 |-- orderID: integer (nullable = true)
 |-- customerID: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- orderDate: date (nullable = true)



3. Add a column `TotalAmount = Quantity * price` to orders

In [0]:
from pyspark.sql.functions import col

# TotalAmount is the product of the Quantity and price columns of each order row.
order_df = order_df.withColumn(
    "TotalAmount",
    col("Quantity") * col("price"),
)

# Preview only the columns that feed into, or result from, the new calculation.
order_df.select("OrderID", "Product", "Quantity", "price", "TotalAmount").show()

+-------+-------+--------+-----+-----------+
|OrderID|Product|Quantity|price|TotalAmount|
+-------+-------+--------+-----+-----------+
|   1001| Laptop|       1|70000|      70000|
|   1002| Mobile|       2|25000|      50000|
|   1003|   Desk|       1|10000|      10000|
|   1004|  Mouse|       3| 1000|       3000|
|   1005|Monitor|       1|12000|      12000|
+-------+-------+--------+-----+-----------+



4. Join both DataFrames on Customer ID

In [0]:
# Inner join: keep only the customer/order pairs that share the same customerID.
same_customer = customer_df.customerID == order_df.customerID

# Both inputs carry a customerID column, so the customer-side copy is dropped
# to leave a single key column in the joined result.
joined_df = (
    customer_df
    .join(order_df, on=same_customer, how="inner")
    .drop(customer_df.customerID)
)

joined_df.show()

+-----+---------+---+-------+----------+-------+--------+-----+----------+-----------+
| Name|     City|Age|orderID|customerID|product|Quantity|price| orderDate|TotalAmount|
+-----+---------+---+-------+----------+-------+--------+-----+----------+-----------+
|Aditi|   Mumbai| 28|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|Rohan|    Delhi| 35|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|Meena|Bangalore| 41|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|Aditi|   Mumbai| 28|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|Kabir|Hyderabad| 30|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-----+---------+---+-------+----------+-------+--------+-----+----------+-----------+



5. Filter orders where TotalAmount > 20000

In [0]:
# Keep only the orders whose TotalAmount exceeds 20000 and show their key fields.
(
    order_df
    .where(col("TotalAmount") > 20000)
    .select("customerID", "product", "Quantity", "price", "TotalAmount")
    .show()
)

+----------+-------+--------+-----+-----------+
|customerID|product|Quantity|price|TotalAmount|
+----------+-------+--------+-----+-----------+
|       101| Laptop|       1|70000|      70000|
|       102| Mobile|       2|25000|      50000|
+----------+-------+--------+-----+-----------+



6. Show customers who placed more than 1 order

In [0]:
# Count joined rows (one per order) for each customerID and keep the
# customers whose count is greater than one.
(
    joined_df
    .groupBy("customerID")
    .count()
    .withColumnRenamed("count", "TotalOrders")
    .where(col("TotalOrders") > 1)
    .show()
)

+----------+-----------+
|customerID|TotalOrders|
+----------+-----------+
|       101|          2|
+----------+-----------+



7. Group orders by City and get average order value

In [0]:
from pyspark.sql.functions import avg

# Mean TotalAmount over the joined rows of each City, reported as "orderValue".
(
    joined_df
    .groupBy("City")
    .agg(avg("TotalAmount").alias("orderValue"))
    .show()
)

+---------+----------+
|     City|orderValue|
+---------+----------+
|Bangalore|   10000.0|
|    Delhi|   50000.0|
|   Mumbai|   36500.0|
|Hyderabad|   12000.0|
+---------+----------+



8. Sort orders by OrderDate in descending order

In [0]:
# Latest order dates first; order_df is rebound to the sorted frame before it is shown.
order_df = order_df.orderBy(
    col("OrderDate").desc()
)
order_df.show()

+-------+----------+-------+--------+-----+----------+-----------+
|orderID|customerID|product|Quantity|price| orderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
+-------+----------+-------+--------+-----+----------+-----------+



9. Write the final result as a Parquet file partitioned by City

In [0]:
# Export the joined data as Parquet, partitioned by the city column.
# "overwrite" replaces whatever already exists at the target location.
(
    joined_df.write
    .partitionBy("city")
    .mode("overwrite")
    .parquet("dbfs:/FileStore/joined_df_customer_order")
)

Create a temporary view and run Spark SQL: \
 Total sales by customer \
 Count of products per city \
 Top 2 cities by revenue

In [0]:
# Register joined_df as a temporary view named "joined" so that it can be
# referenced by name in spark.sql queries.
joined_df.createOrReplaceTempView("joined")
# Display the rows of the DataFrame that backs the view.
joined_df.show()

+-----+---------+---+-------+----------+-------+--------+-----+----------+-----------+
| Name|     City|Age|orderID|customerID|product|Quantity|price| orderDate|TotalAmount|
+-----+---------+---+-------+----------+-------+--------+-----+----------+-----------+
|Aditi|   Mumbai| 28|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|Rohan|    Delhi| 35|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|Meena|Bangalore| 41|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|Aditi|   Mumbai| 28|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|Kabir|Hyderabad| 30|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-----+---------+---+-------+----------+-------+--------+-----+----------+-----------+



Total Sales by customer

In [0]:
# Sum TotalAmount over the "joined" view for each (customerID, Name) pair.
total_sales_sql = """
          SELECT 
              customerID, 
              Name, 
              SUM(TotalAmount) AS TotalSales
          FROM joined
          GROUP BY customerID, Name
          """
spark.sql(total_sales_sql).show()


+----------+-----+----------+
|customerID| Name|TotalSales|
+----------+-----+----------+
|       104|Kabir|     12000|
|       102|Rohan|     50000|
|       103|Meena|     10000|
|       101|Aditi|     73000|
+----------+-----+----------+



Count of orders per city

In [0]:
# Count the rows of the "joined" view (one row per order) for each City.
orders_per_city_sql = """
          SELECT 
            City, 
            COUNT(*) AS OrderCount
          FROM joined
          GROUP BY City
"""
spark.sql(orders_per_city_sql).show()

+---------+----------+
|     City|OrderCount|
+---------+----------+
|Bangalore|         1|
|    Delhi|         1|
|   Mumbai|         2|
|Hyderabad|         1|
+---------+----------+



Top 2 cities by revenue

In [0]:
# Total TotalAmount per city from the "joined" view, highest first,
# limited to the two largest totals.
top_cities_sql = """
            SELECT 
                city, 
                SUM(TotalAmount) AS TotalRevenue
            FROM joined
            GROUP BY City
            ORDER BY TotalRevenue DESC
            LIMIT 2
"""
spark.sql(top_cities_sql).show()

+------+------------+
|  city|TotalRevenue|
+------+------------+
|Mumbai|       73000|
| Delhi|       50000|
+------+------------+

