In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below (getOrCreate reuses an existing session if one is active).
spark = SparkSession.builder.getOrCreate()

Task Set – PySpark Hands-On (No DLT)
Basics

In [0]:
# Read retail_data.csv into a DataFrame, treating the first row as the header
# and letting Spark infer the column types, then print the resulting schema.
df = spark.read.csv(
    "file:/Workspace/Shared/retail_data.csv",
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TransactionDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)



In [0]:
# Skip schema inference: declare every column's type explicitly and hand that
# schema to the reader, so the file is parsed with the declared types.
# NOTE(review): the wildcard import is kept because a later cell uses
# StringType from it when building the UDF.
from pyspark.sql.types import *
schema = (
    StructType()
    .add("TransactionID", StringType(), True)
    .add("Customer", StringType(), True)
    .add("City", StringType(), True)
    .add("Product", StringType(), True)
    .add("Category", StringType(), True)
    .add("Quantity", IntegerType(), True)
    .add("UnitPrice", DoubleType(), True)
    .add("TotalPrice", DoubleType(), True)
    .add("TransactionDate", DateType(), True)
    .add("PaymentMode", StringType(), True)
)
df_manual = spark.read.csv(
    "file:/Workspace/Shared/retail_data.csv",
    schema=schema,
    header=True,
)
df_manual.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- TotalPrice: double (nullable = true)
 |-- TransactionDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)



Data Exploration & Filtering

In [0]:
# Show only the transactions whose TotalPrice is above 40000.
from pyspark.sql.functions import col
(
    df
    .where(col("TotalPrice") > 40000)
    .show()
)

+-------------+--------+---------+-------+-----------+--------+---------+----------+-------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDa|
+-------------+--------+---------+-------+-----------+--------+---------+----------+-------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|   2024-01-15|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|   2024-01-20|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|   2024-02-15|
+-------------+--------+---------+-------+-----------+--------+---------+----------+-------------+



In [0]:
# List each city that appears in the dataset exactly once.
(
    df
    .select("City")
    .dropDuplicates()
    .show()
)

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [0]:
# Select the Delhi transactions twice: once with .filter() and once with
# .where(), which are two names for the same operation.
df.filter(df["City"] == "Delhi").show()
df.where(df["City"] == "Delhi").show()

+-------------+--------+-----+-------+-----------+--------+---------+----------+-------------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDa|
+-------------+--------+-----+-------+-----------+--------+---------+----------+-------------+
|        T1004|    Zoya|Delhi|  Chair|  Furniture|       4|     5000|     20000|   2024-02-12|
|        T1006|   Farah|Delhi|  Mouse|Electronics|       3|     1000|      3000|   2024-02-18|
+-------------+--------+-----+-------+-----------+--------+---------+----------+-------------+

+-------------+--------+-----+-------+-----------+--------+---------+----------+-------------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDa|
+-------------+--------+-----+-------+-----------+--------+---------+----------+-------------+
|        T1004|    Zoya|Delhi|  Chair|  Furniture|       4|     5000|     20000|   2024-02-12|
|        T1006|   Farah|Delhi|  Mouse|Electronics

Data Manipulation

In [0]:
# Add DiscountedPrice, i.e. TotalPrice after a 10% discount (90% of TotalPrice).
from pyspark.sql.functions import col
df = df.withColumn(
    "DiscountedPrice",
    col("TotalPrice") * 0.9,
)
df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+-------------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDa|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+---------+----------+-------------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|   2024-01-15|        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|   2024-01-20|        54000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|   2024-02-10|        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|   2024-02-12|        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|   2024-02-15|        45000.0|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3

In [0]:
# Give the TransactionDate column the shorter name TxnDate.
df = df.withColumnRenamed(
    "TransactionDate",
    "TxnDate",
)
df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|        UPI|   1|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|Net Banking|   1|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|       Card|   1|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15| 

In [0]:
# Drop the UnitPrice column from the working DataFrame.
# NOTE(review): `df` is rebound here, so every later cell that uses `df`
# sees it without UnitPrice.
df = df.drop("UnitPrice")
df.show()

+-------------+--------+---------+-------+-----------+--------+----------+-------------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|TransactionDa|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+----------+-------------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|     70000|   2024-01-15|        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|     60000|   2024-01-20|        54000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|   2024-02-10|        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|   2024-02-12|        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|     50000|   2024-02-15|        45000.0|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|      3000|   2024-02-18|         2700.0|
+-------------+--------+---------+-------+-----------+-

Aggregations

In [0]:
# Sum TotalPrice per city and present the aggregate as TotalSales.
(
    df
    .groupBy("City")
    .sum("TotalPrice")
    .withColumnRenamed("sum(TotalPrice)", "TotalSales")
    .show()
)

+---------+----------+
|     City|TotalSales|
+---------+----------+
|Bangalore|     60000|
|   Mumbai|    120000|
|    Delhi|     23000|
|Hyderabad|     15000|
+---------+----------+



In [0]:
# Average UnitPrice per category.
# The CSV is read again into its own DataFrame so that UnitPrice is available
# regardless of what has been done to `df`.
df_with_price = spark.read.csv(
    "file:/Workspace/Shared/retail_data.csv",
    header=True,
    inferSchema=True,
)
(
    df_with_price
    .groupBy("Category")
    .avg("UnitPrice")
    .withColumnRenamed("avg(UnitPrice)", "AvgUnitPrice")
    .show()
)


+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|Electronics|     37750.0|
|  Furniture|     10000.0|
+-----------+------------+



In [0]:
# Number of transactions for each PaymentMode.
(
    df
    .groupBy("PaymentMode")
    .count()
    .show()
)

+-----------+-----+
|PaymentMode|count|
+-----------+-----+
|Net Banking|    1|
|       Card|    3|
|       Cash|    1|
|        UPI|    1|
+-----------+-----+



Window Functions


In [0]:
# Rank the transactions inside each city, highest TotalPrice first.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
windowSpec = (
    Window
    .partitionBy("City")
    .orderBy(col("TotalPrice").desc())
)
df = df.withColumn("Rank", rank().over(windowSpec))
df.select(["TransactionID", "City", "TotalPrice", "Rank"]).show()

+-------------+---------+----------+----+
|TransactionID|     City|TotalPrice|Rank|
+-------------+---------+----------+----+
|        T1002|Bangalore|     60000|   1|
|        T1004|    Delhi|     20000|   1|
|        T1006|    Delhi|      3000|   2|
|        T1003|Hyderabad|     15000|   1|
|        T1001|   Mumbai|     70000|   1|
|        T1005|   Mumbai|     50000|   2|
+-------------+---------+----------+----+



In [0]:
# For every transaction, fetch the TotalPrice of the preceding transaction
# (by TxnDate) in the same city; the first one per city has no predecessor.
from pyspark.sql.functions import lag
windowSpecLag = (
    Window
    .partitionBy("City")
    .orderBy("TxnDate")
)
df = df.withColumn(
    "PrevTxn",
    lag("TotalPrice", 1).over(windowSpecLag),
)
df.select(["TransactionID", "City", "TotalPrice", "PrevTxn"]).show()

+-------------+---------+----------+-------+
|TransactionID|     City|TotalPrice|PrevTxn|
+-------------+---------+----------+-------+
|        T1002|Bangalore|     60000|   NULL|
|        T1004|    Delhi|     20000|   NULL|
|        T1006|    Delhi|      3000|  20000|
|        T1003|Hyderabad|     15000|   NULL|
|        T1001|   Mumbai|     70000|   NULL|
|        T1005|   Mumbai|     50000|  70000|
+-------------+---------+----------+-------+



Joins

In [0]:
# Build a small lookup DataFrame that maps each city to its region:
#   City       Region
#   Mumbai     West
#   Delhi      North
#   Bangalore  South
#   Hyderabad  South
city_data = [
    ("Mumbai", "West"),
    ("Delhi", "North"),
    ("Bangalore", "South"),
    ("Hyderabad", "South"),
]
region_df = spark.createDataFrame(city_data, schema=["City", "Region"])
region_df.show()

+---------+------+
|     City|Region|
+---------+------+
|   Mumbai|  West|
|    Delhi| North|
|Bangalore| South|
|Hyderabad| South|
+---------+------+



In [0]:
# Attach the region to every transaction via an inner join on City,
# then total TotalPrice per region.
df_joined = df.join(region_df, on="City", how="inner")
(
    df_joined
    .groupBy("Region")
    .sum("TotalPrice")
    .withColumnRenamed("sum(TotalPrice)", "TotalSales")
    .show()
)

+------+----------+
|Region|TotalSales|
+------+----------+
|  West|    120000|
| North|     23000|
| South|     75000|
+------+----------+



Nulls and Data Cleaning

In [0]:
# Introduce some nulls and replace them with default values.
# A value is blanked out whenever its random draw exceeds 0.7, i.e. for
# roughly 30% of the Quantity values and 30% of the PaymentMode values.
# rand() is given an explicit seed so that re-running the notebook nulls the
# same rows (for the same input data and partitioning) instead of a fresh
# random set each time. The two columns use different seeds so that their
# nulls are drawn independently rather than landing on identical rows.
from pyspark.sql.functions import lit, when, rand
df_with_nulls = (
    df
    .withColumn(
        "Quantity",
        when(rand(seed=42) > 0.7, None).otherwise(col("Quantity")),
    )
    .withColumn(
        "PaymentMode",
        when(rand(seed=43) > 0.7, None).otherwise(col("PaymentMode")),
    )
)
df_with_nulls.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|PrevTxn|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+
|        T1002|    Neha|Bangalore| Tablet|Electronics|    NULL|    30000|     60000|2024-01-20|        UPI|   1|   NULL|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|   NULL|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|  20000|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|    NULL|    15000|     15000|2024-02-10|Net Banking|   1|   NULL|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|       Card|   1|   NULL|
|        T1005|   Karan|   Mumba

In [0]:
# Keep only the rows that still have a Quantity value.
df_cleaned = df_with_nulls.na.drop(subset=["Quantity"])
df_cleaned.show()

+-------------+--------+------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+
|TransactionID|Customer|  City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|PrevTxn|
+-------------+--------+------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+
|        T1004|    Zoya| Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|   NULL|
|        T1006|   Farah| Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|  20000|
|        T1001|     Ali|Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|       Card|   1|   NULL|
|        T1005|   Karan|Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|       Card|   2|  70000|
+-------------+--------+------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+



In [0]:
# Substitute "Unknown" wherever PaymentMode is null; other columns are left untouched.
df_filled = df_with_nulls.na.fill({"PaymentMode": "Unknown"})
df_filled.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|PrevTxn|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+
|        T1002|    Neha|Bangalore| Tablet|Electronics|    NULL|    30000|     60000|2024-01-20|        UPI|   1|   NULL|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|   NULL|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|  20000|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|    NULL|    15000|     15000|2024-02-10|Net Banking|   1|   NULL|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|       Card|   1|   NULL|
|        T1005|   Karan|   Mumba

Custom Functions

In [0]:
# Write a UDF to label orders:
# def label_order(amount):
# if amount > 50000: return "High"
# elif amount >= 30000: return "Medium"
# else: return "Low"
# Apply this to classify TotalPrice .

from pyspark.sql.functions import udf
def label_order(amount):
    """Classify an order amount as "High", "Medium" or "Low".

    Args:
        amount: Order total to classify, or None when the value is missing.

    Returns:
        "High" if amount is above 50000, "Medium" if it is between 30000 and
        50000 inclusive, "Low" otherwise. None is returned for a None amount.
    """
    # A SQL NULL reaches a Python UDF as None, and `None > 50000` raises
    # TypeError; pass missing amounts through as a null label instead.
    if amount is None:
        return None
    if amount > 50000:
        return "High"
    if amount >= 30000:
        return "Medium"
    return "Low"

# Wrap label_order as a string-returning Spark UDF and use it to tag every
# row with an OrderLabel derived from its TotalPrice.
label_udf = udf(label_order, returnType=StringType())
df = df.withColumn(
    "OrderLabel",
    label_udf(col("TotalPrice")),
)
df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|PrevTxn|OrderLabel|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|        UPI|   1|   NULL|      High|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|   NULL|       Low|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|  20000|       Low|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|Net Banking|   1|   NULL|       Low|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|   

Date & Time

In [0]:
# Split TxnDate into separate Year, Month and Day columns.
from pyspark.sql.functions import year, month, dayofmonth
df = (
    df
    .withColumn("Year", year("TxnDate"))
    .withColumn("Month", month("TxnDate"))
    .withColumn("Day", dayofmonth("TxnDate"))
)
df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|PrevTxn|OrderLabel|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+----+-----+---+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|        UPI|   1|   NULL|      High|2024|    1| 20|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|   NULL|       Low|2024|    2| 12|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|  20000|       Low|2024|    2| 18|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|Net Banking|   1|   NUL

In [0]:
# Show the transactions whose Month column equals 2 (February).
(
    df
    .where(col("Month") == 2)
    .show()
)

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|PrevTxn|OrderLabel|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+----+-----+---+
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|   NULL|       Low|2024|    2| 12|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|  20000|       Low|2024|    2| 18|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|Net Banking|   1|   NULL|       Low|2024|    2| 10|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|       Card|   2|  7000

Union & Duplicate Handling

In [0]:
# Stack the DataFrame on top of itself so every row appears twice,
# then collapse the identical rows back down to one copy each.
df_dup = df.union(df)
df_no_duplicates = df_dup.distinct()
df_no_duplicates.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|Rank|PrevTxn|OrderLabel|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+----+-------+----------+----+-----+---+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|        UPI|   1|   NULL|      High|2024|    1| 20|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|       Card|   1|   NULL|      High|2024|    1| 15|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|       Cash|   2|  20000|       Low|2024|    2| 18|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|       Card|   1|   NUL