# Data Preparation

Import Statements

In [101]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, year, when, udf, concat_ws, lit, regexp_replace, to_date, datediff, current_date
from pyspark.sql.types import StringType


Creating Session

In [2]:
# Build (or reuse) the notebook-wide SparkSession with Hive support enabled;
# later cells persist tables and a view through this session's catalog.
spark = (
    SparkSession.builder
    .appName("PracticeProject")
    .enableHiveSupport()
    .getOrCreate()
)

Sample Customer and Orders Data

In [3]:
# Customer rows: (CustomerID, Name, Email, City, SignupDate as 'yyyy-MM-dd' text).
customers_data = [
    (101, 'Ali', 'ali@gmail.com', 'Mumbai', '2022-05-10'),
    (102, 'Neha', 'neha@yahoo.com', 'Delhi', '2023-01-15'),
    (103, 'Ravi', 'ravi@hotmail.com', 'Bangalore', '2021-11-01'),
    (104, 'Sneha', 'sneha@outlook.com', 'Hyderabad', '2020-07-22'),
    (105, 'Amit', 'amit@gmail.com', 'Chennai', '2023-03-10'),
]

# Order rows:
# (OrderID, CustomerID, Product, Category, Quantity, Price, OrderDate as 'yyyy-MM-dd' text).
orders_data = [
    (1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
    (2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
    (3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
    (4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
    (5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
    (6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
    (7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'),
]


Creating DataFrames

In [38]:
# Ensure the `sales` database exists and make it the current database.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

# Only column names are supplied; column types are left to Spark's inference
# from the Python tuples.
customer_columns = ["CustomerID", "Name", "Email", "City", "SignupDate"]
order_columns = ["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"]

customers_df = spark.createDataFrame(customers_data, schema=customer_columns)
orders_df = spark.createDataFrame(orders_data, schema=order_columns)

# Persist both frames as tables, replacing any existing copies.
customers_df.write.saveAsTable("sales.customers", mode="overwrite")
orders_df.write.saveAsTable("sales.orders", mode="overwrite")

# SECTION A: PySpark DataFrame Tasks

1. Add a column TotalAmount = Price * Quantity to the orders_df


In [45]:
# TotalAmount is the unit price multiplied by the ordered quantity.
orders_df = orders_df.withColumn(
    "TotalAmount",
    col("Price") * col("Quantity"),
)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



2. Filter all orders with TotalAmount > 10000 .

In [46]:
# Keep only the orders whose TotalAmount is strictly greater than 10000.
orders_df.where(col("TotalAmount") > 10000).show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+-------+-----------+--------+-------+----------+-----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+-------+-----------+--------+-------+----------+-----------+



3. Standardize the City field in customers_df (e.g., lowercase)

In [47]:
# Lowercase every City value. The result is bound to `customer_df` (singular),
# so `customers_df` itself keeps the original casing.
customer_df = customers_df.withColumn("City", lower(col("City")))
customer_df.show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+



4.  Extract year from OrderDate and add a new column OrderYear .

In [48]:
# OrderDate holds 'yyyy-MM-dd' text; year() pulls the calendar year out of it.
orders_df = orders_df.withColumn(
    "OrderYear",
    year(col("OrderDate")),
)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+

 5. Fill null values in any column of your choice with defaults.

In [49]:
# Replace null City values with the placeholder "Unknown"; other columns are untouched.
customer_df = customer_df.na.fill({"City": "Unknown"})
customer_df.show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+



6. Use when/otherwise to categorize orders: \
 <5000 : "Low" \
 5000-20000 : "Medium" \
 \>20000 : "High"

In [59]:
# Bucket each order by TotalAmount:
#   below 5000     -> "Low"
#   5000 to 20000  -> "Medium"
#   above 20000    -> "High"
# Branches are evaluated in order, so the "Medium" test only needs the upper
# bound. "High" has its own explicit condition: with a catch-all
# otherwise("High"), a null TotalAmount (every comparison evaluates to null)
# would be labelled "High". Such rows now get a null OrderCategory instead.
orders_df = orders_df.withColumn(
    "OrderCategory",
    when(col("TotalAmount") < 5000, "Low")
    .when(col("TotalAmount") <= 20000, "Medium")
    .when(col("TotalAmount") > 20000, "High")
    .otherwise(lit(None).cast("string")),
)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+-------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderCategory|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+-------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|         High|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|          Low|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|       Medium|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|          Low|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|       Medium|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|          Low|
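|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|         High|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+-------------+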

# SECTION B: Spark SQL Tasks

 7. Run a SQL query to list all orders made by “Ali”.

In [63]:
# All orders whose CustomerID belongs to a customer named 'Ali'.
# The table names are unqualified, so they resolve against the current database.
ali_orders_query = """
          SELECT *
          FROM orders
          WHERE CustomerID IN (
              SELECT CustomerID
              FROM customers
              WHERE Name = 'Ali'
    )
"""
spark.sql(ali_orders_query).show()

+-------+----------+-------+-----------+--------+-------+----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+-------+-----------+--------+-------+----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|
+-------+----------+-------+-----------+--------+-------+----------+



 8. Get total spending by each customer using SQL

In [65]:
# Total spend per customer (Price * Quantity summed over their orders), highest first.
# Grouping includes CustomerID as well as Name, so two customers sharing a name
# stay in separate groups. The inner join drops customers with no orders.
customer_spending_query = """
          SELECT
            c.Name,
            SUM(o.Price * o.Quantity) AS TotalAmountSpend
          FROM customers c
          JOIN orders o
          ON c.CustomerID = o.CustomerID
          GROUP BY c.Name, c.CustomerID
          ORDER BY TotalAmountSpend DESC
"""
spark.sql(customer_spending_query).show()


+-----+----------------+
| Name|TotalAmountSpend|
+-----+----------------+
|  Ali|        101200.0|
| Neha|         50000.0|
|Sneha|          5000.0|
| Ravi|          3500.0|
| Amit|          2500.0|
+-----+----------------+



 9. Find out which category made the highest total revenue.

In [73]:
# Revenue per category plus its rank by revenue; RevenueRank = 1 marks the
# top-earning category.
# NOTE(review): the statement has no outer ORDER BY, so the display order of the
# rows is not fixed by the query itself - read the answer from RevenueRank.
category_revenue_query = """
          SELECT
            Category,
            SUM(Price * Quantity) AS TotalRevenue,
            RANK() OVER (ORDER BY SUM(Price * Quantity) DESC) AS RevenueRank
          FROM orders
          GROUP BY Category
"""
spark.sql(category_revenue_query).show()

+-----------+------------+-----------+
|   Category|TotalRevenue|RevenueRank|
+-----------+------------+-----------+
|Electronics|    151200.0|          1|
| Appliances|      5000.0|          2|
|  Furniture|      3500.0|          3|
| Stationery|      2500.0|          4|
+-----------+------------+-----------+



10. Create a view customer_orders showing CustomerName, Product, TotalAmount .

In [78]:
# (Re)create the `customer_orders` view: one row per customer/order pair with
# the customer's name, the product, and TotalAmount.
# TotalAmount is a windowed SUM partitioned by CustomerID, so every row of a
# customer carries that customer's overall spend rather than the amount of the
# individual order.
# NOTE(review): confirm whether a per-order amount (Price * Quantity) was intended.
create_customer_orders_view = """
          CREATE OR REPLACE VIEW customer_orders AS
          SELECT
            c.Name AS CustomerName,
            o.Product,
            SUM(o.Price * o.Quantity) OVER(PARTITION BY c.CustomerID) AS TotalAmount
          FROM customers c
          JOIN orders o
          ON c.CustomerID = o.CustomerID

"""
spark.sql(create_customer_orders_view)

# Display the full contents of the view.
customer_orders_rows = spark.sql("SELECT * FROM customer_orders")
customer_orders_rows.show()


+------------+---------+-----------+
|CustomerName|  Product|TotalAmount|
+------------+---------+-----------+
|         Ali|   Laptop|   101200.0|
|         Ali|    Mouse|   101200.0|
|        Neha|   Tablet|    50000.0|
|        Neha|    Phone|    50000.0|
|        Ravi|Bookshelf|     3500.0|
|       Sneha|    Mixer|     5000.0|
|        Amit| Notebook|     2500.0|
+------------+---------+-----------+



11. Query the view for products ordered after Feb 2024.



In [88]:
# Products ordered after February 2024 (OrderDate later than '2024-02-29').
# The view exposes no OrderDate column, so the query joins back to `customers`
# by name and to `orders` by product and customer id to recover it.
# OrderDate is compared as 'yyyy-MM-dd' text against a literal of the same format.
# NOTE(review): the join on CustomerName = Name assumes names are unique per
# customer; verify, otherwise rows can be duplicated.
recent_products_query = """
    SELECT
      co.CustomerName,
      co.Product,
      co.TotalAmount,
      o.OrderDate
    FROM customer_orders co
    JOIN customers c ON co.CustomerName = c.Name
    JOIN orders o ON o.Product = co.Product AND o.CustomerID = c.CustomerID
    WHERE o.OrderDate > '2024-02-29'
"""
spark.sql(recent_products_query).show()


+------------+--------+-----------+----------+
|CustomerName| Product|TotalAmount| OrderDate|
+------------+--------+-----------+----------+
|        Neha|   Phone|    50000.0|2024-03-02|
|        Amit|Notebook|     2500.0|2024-03-01|
+------------+--------+-----------+----------+



# SECTION C: Advanced Practice

12. Create a Global Temp View from
customers_df , then \
query it using:
 SELECT * FROM \
 global_temp.customers WHERE City = 'Mumbai'

In [89]:
# Register customers_df (original City casing) as a global temp view; such
# views are addressed through the `global_temp` database.
customers_df.createOrReplaceGlobalTempView("customers")

mumbai_customers_query = """
    SELECT *
    FROM global_temp.customers
    WHERE City = 'Mumbai'
"""
spark.sql(mumbai_customers_query).show()

+----------+----+-------------+------+----------+
|CustomerID|Name|        Email|  City|SignupDate|
+----------+----+-------------+------+----------+
|       101| Ali|ali@gmail.com|Mumbai|2022-05-10|
+----------+----+-------------+------+----------+



13. Save the transformed
orders_df (with TotalAmount) to a Parquet file.

In [90]:
# Write the transformed orders to Parquet, replacing any previous output at this path.
orders_df.write.parquet("/path/to/save/orders_parquet", mode="overwrite")

14. Read back the Parquet file and count how many orders are in it.

In [91]:
# Load the Parquet output back and report how many order rows it holds.
orders_parquet_df = spark.read.format("parquet").load("/path/to/save/orders_parquet")
order_count = orders_parquet_df.count()
print(f"Total number of orders: {order_count}")

Total number of orders: 7


# SECTION D: UDF + Built-in Function Tasks

15. Write a UDF that masks emails like: \
ali@gmail.com → a***@gmail.com .

In [94]:
def email_mask(email):
    """Mask the local part of an email address, keeping only its first character.

    Example: ``ali@gmail.com`` -> ``a***@gmail.com``.

    Args:
        email: The address to mask; may be None.

    Returns:
        The masked address. The input is returned unchanged when it is None or
        empty, contains no "@", or has nothing before the last "@".
    """
    if not email or "@" not in email:
        return email
    # Split on the LAST "@" only. A plain split("@") produces three or more
    # parts for values such as 'a@b@c.com', and unpacking those into two names
    # raises ValueError.
    name, _, domain = email.rpartition("@")
    if not name:
        # Nothing before the "@", so there is no first character to keep.
        return email
    # Any non-empty local part is masked, including a single-character one,
    # so the masked column never echoes the original address.
    return name[0] + "***@" + domain

# Wrap the plain Python function as a Spark UDF that yields a string column.
email_mask_udf = udf(f=email_mask, returnType=StringType())

# Add the masked address next to the original and show both, untruncated.
df = customers_df.withColumn("masked_email", email_mask_udf(col("Email")))
df.select(col("Email"), col("masked_email")).show(truncate=False)

+-----------------+----------------+
|Email            |masked_email    |
+-----------------+----------------+
|ali@gmail.com    |a***@gmail.com  |
|neha@yahoo.com   |n***@yahoo.com  |
|ravi@hotmail.com |r***@hotmail.com|
|sneha@outlook.com|s***@outlook.com|
|amit@gmail.com   |a***@gmail.com  |
+-----------------+----------------+



16. Use concat_ws() to create a full label like:
'Ali from Mumbai'

In [99]:
# Join Name, the literal word "from", and City with single spaces,
# e.g. "Ali from Mumbai".
df = customers_df.withColumn(
    "full_label",
    concat_ws(" ", col("Name"), lit("from"), col("City")),
)
df.select(col("Name"), col("City"), col("full_label")).show()

+-----+---------+--------------------+
| Name|     City|          full_label|
+-----+---------+--------------------+
|  Ali|   Mumbai|     Ali from Mumbai|
| Neha|    Delhi|     Neha from Delhi|
| Ravi|Bangalore| Ravi from Bangalore|
|Sneha|Hyderabad|Sneha from Hyderabad|
| Amit|  Chennai|   Amit from Chennai|
+-----+---------+--------------------+



17. Use regexp_replace() to remove special characters from product names.

In [98]:
# Strip every character that is not a letter, a digit, or whitespace from Product.
orders_clean_df = orders_df.withColumn("CleanProduct", regexp_replace(col("Product"), r"[^a-zA-Z0-9\s]", ""))
orders_clean_df.select(col("Product"), col("CleanProduct")).show()

+---------+------------+
|  Product|CleanProduct|
+---------+------------+
|   Laptop|      Laptop|
|    Mouse|       Mouse|
|   Tablet|      Tablet|
|Bookshelf|   Bookshelf|
|    Mixer|       Mixer|
| Notebook|    Notebook|
|    Phone|       Phone|
+---------+------------+



18. Use to_date() and datediff() \
to calculate customer age in days (from SignupDate to today).

In [102]:
# Parse SignupDate ('yyyy-MM-dd' text) into a date column, then count the days
# from that date to the current date. AgeInDays therefore depends on the day
# the cell is run.
customers_age_df = (
    customers_df
    .withColumn("SignupDate", to_date(col("SignupDate"), "yyyy-MM-dd"))
    .withColumn("AgeInDays", datediff(current_date(), col("SignupDate")))
)
customers_age_df.select(col("Name"), col("SignupDate"), col("AgeInDays")).show()

+-----+----------+---------+
| Name|SignupDate|AgeInDays|
+-----+----------+---------+
|  Ali|2022-05-10|     1121|
| Neha|2023-01-15|      871|
| Ravi|2021-11-01|     1311|
|Sneha|2020-07-22|     1778|
| Amit|2023-03-10|      817|
+-----+----------+---------+

