In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, when, year, lit, udf, concat_ws, regexp_replace, to_date, datediff
from pyspark.sql.types import StringType
# Start (or reuse) a Hive-enabled Spark session shared by the whole notebook.
spark = (
    SparkSession.builder
    .appName("PracticeProject")
    .enableHiveSupport()
    .getOrCreate()
)

# The tables written at the end of this cell live in the `sales` database.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")

# Sample customers: (CustomerID, Name, Email, City, SignupDate).
# SignupDate is supplied as a 'YYYY-MM-DD' string here.
customers_data = [
    (101, 'Ali', 'ali@gmail.com', 'Mumbai', '2022-05-10'),
    (102, 'Neha', 'neha@yahoo.com', 'Delhi', '2023-01-15'),
    (103, 'Ravi', 'ravi@hotmail.com', 'Bangalore', '2021-11-01'),
    (104, 'Sneha', 'sneha@outlook.com', 'Hyderabad', '2020-07-22'),
    (105, 'Amit', 'amit@gmail.com', 'Chennai', '2023-03-10'),
]

# Sample orders:
# (OrderID, CustomerID, Product, Category, Quantity, Price, OrderDate).
orders_data = [
    (1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
    (2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
    (3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
    (4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
    (5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
    (6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
    (7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'),
]

# Column names are passed as plain lists; Spark infers the column types.
customer_columns = ["CustomerID", "Name", "Email", "City", "SignupDate"]
order_columns = [
    "OrderID", "CustomerID", "Product", "Category",
    "Quantity", "Price", "OrderDate",
]
customers_df = spark.createDataFrame(customers_data, customer_columns)
orders_df = spark.createDataFrame(orders_data, order_columns)

# Persist both frames as tables, replacing any copy left by an earlier run.
for frame, table_name in (
    (customers_df, "sales.customers"),
    (orders_df, "sales.orders"),
):
    frame.write.mode("overwrite").saveAsTable(table_name)

In [26]:
# SECTION A: PySpark DataFrame Tasks
# 1. Add a column TotalAmount = Price * Quantity to the orders_df .
# 2. Filter all orders with TotalAmount > 10000 .
# 3. Standardize the City field in customers_df (e.g., lowercase).

# 4. Extract year from OrderDate and add a new column OrderYear .
# 5. Fill null values in any column of your choice with defaults.
# 6. Use when/otherwise to categorize orders:
# <5000 : "Low"
# 5000-20000 : "Medium"
# >20000 : "High"


# 1. TotalAmount = Price * Quantity for every order.
orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))
orders_df.show()

# 2. Orders whose total is strictly greater than 10000.
orders_df.filter(col("TotalAmount") > 10000).show()

# 3. Standardize City to lowercase.
customers_df = customers_df.withColumn("City", expr("lower(City)"))

# 4. OrderYear extracted from OrderDate.
orders_df = orders_df.withColumn("OrderYear", year(col("OrderDate")))
orders_df.show()

# 5. Replace missing City values with a default.
#    The result is assigned back to customers_df so that every later cell
#    works on the filled frame.
customers_df = customers_df.fillna({'City': 'Unknown'})
# `customer_df` (singular) is the name this step used to bind; it is kept as
# an alias so any reference to that spelling keeps working.
customer_df = customers_df
# Show the customers frame: it is the one steps 3 and 5 changed.
customers_df.show()

# 6. Bucket each order by TotalAmount (computed once, in step 1):
#      < 5000        -> "Low"
#      5000 - 20000  -> "Medium"  (both bounds inclusive)
#      > 20000       -> "High"
orders_df = orders_df.withColumn(
    "OrderCategory",
    when(col("TotalAmount") < 5000, "Low")
    .when(col("TotalAmount") <= 20000, "Medium")
    .otherwise("High"),
)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+---------+-----------+-------------+------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|OrderYear|TotalAmount|OrderCategory|CleanProduct|
+-------+----------+---------+-----------+--------+-------+----------+---------+-----------+-------------+------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|     2024|   100000.0|         High|      Laptop|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     2024|     1200.0|          Low|       Mouse|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|     2024|    20000.0|       Medium|      Tablet|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     2024|     3500.0|          Low|   Bookshelf|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     2024|     5000.0|       Medium|       Mixer|
|      6|       105| Notebook| Stationer

In [21]:
# SECTION B: Spark SQL Tasks
# 7. Run a SQL query to list all orders made by “Ali”.
# 8. Get total spending by each customer using SQL.
# 9. Find out which category made the highest total revenue.
# 10. Create a view customer_orders showing CustomerName, Product, TotalAmount .
# 11. Query the view for products ordered after Feb 2024.

# Expose both DataFrames to Spark SQL under short view names.
for frame, view_name in ((orders_df, "orders"), (customers_df, "customers")):
    frame.createOrReplaceTempView(view_name)

# 7. Every order placed by the customer named 'Ali'.
ali_orders_sql = """
SELECT o.* FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE c.Name = 'Ali'
"""
spark.sql(ali_orders_sql).show()

# 8. Total spending per customer, keyed by CustomerID.
spending_by_customer_sql = """
SELECT CustomerID, SUM(Price * Quantity) AS TotalSpending
FROM orders
GROUP BY CustomerID
"""
spark.sql(spending_by_customer_sql).show()

# 9. The single category with the highest total revenue.
top_category_sql = """
SELECT Category, SUM(Price * Quantity) AS Revenue
FROM orders
GROUP BY Category
ORDER BY Revenue DESC
LIMIT 1
"""
spark.sql(top_category_sql).show()

# 10. View joining each order to its customer's name.
create_customer_orders_sql = """
CREATE OR REPLACE TEMP VIEW customer_orders AS
SELECT c.Name AS CustomerName, o.Product, (o.Price * o.Quantity) AS TotalAmount
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
"""
spark.sql(create_customer_orders_sql)

# 11. Rows of the view whose product appears in an order dated after 2024-02-01.
# NOTE(review): the view has no OrderDate column, so the date filter works by
# matching Product names against `orders`; a product ordered on several dates
# would match on all of its rows.
# NOTE(review): the cutoff is strictly after 2024-02-01 - confirm this is the
# intended reading of "after Feb 2024".
recent_products_sql = """
SELECT * FROM customer_orders
WHERE TotalAmount IS NOT NULL
AND Product IN (
  SELECT Product FROM orders WHERE OrderDate > '2024-02-01'
)
"""
spark.sql(recent_products_sql).show()



+-------+----------+-------+-----------+--------+-------+----------+---------+-----------+-------------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|OrderYear|TotalAmount|OrderCategory|
+-------+----------+-------+-----------+--------+-------+----------+---------+-----------+-------------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|     2024|   100000.0|         High|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|     2024|     1200.0|          Low|
+-------+----------+-------+-----------+--------+-------+----------+---------+-----------+-------------+

+----------+-------------+
|CustomerID|TotalSpending|
+----------+-------------+
|       101|     101200.0|
|       102|      50000.0|
|       103|       3500.0|
|       104|       5000.0|
|       105|       2500.0|
+----------+-------------+

+-----------+--------+
|   Category| Revenue|
+-----------+--------+
|Electronics|151200.0|
+-----------+--------+

+--------

In [29]:
# SECTION C: Advanced Practice
# 12. Create a Global Temp View from customers_df , then query it using:
# SELECT * FROM global_temp.customers WHERE City = 'Mumbai';
# 13. Save the transformed orders_df (with TotalAmount) to a Parquet file.
# 14. Read back the Parquet file and count how many orders are in it.

# 12. Register customers_df as a global temp view and query it through the
#     `global_temp` qualifier. The City literal is lowercase because Section A
#     lowercases the City column.
customers_df.createOrReplaceGlobalTempView("customers")
mumbai_customers = spark.sql("SELECT * FROM global_temp.customers WHERE City = 'mumbai'")
mumbai_customers.show()

# 13. Write the transformed orders to Parquet, replacing any previous output.
parquet_path = "parquet/total_orders"
orders_df.write.mode("overwrite").parquet(parquet_path)

# 14. Read the Parquet data back and report how many orders it holds.
read_df = spark.read.parquet(parquet_path)
order_count = read_df.count()
print(order_count)


+----------+----+-------------+------+----------+--------------+---------------+-----------------+
|CustomerID|Name|        Email|  City|SignupDate|   MaskedEmail|          Label|CustomerAgeInDays|
+----------+----+-------------+------+----------+--------------+---------------+-----------------+
|       101| Ali|ali@gmail.com|mumbai|2022-05-10|a***@gmail.com|Ali from mumbai|             1121|
+----------+----+-------------+------+----------+--------------+---------------+-----------------+

7


In [28]:
# SECTION D: UDF + Built-in Function Tasks
# 15. Write a UDF that masks emails like: ali@gmail.com → a***@gmail.com .
# 16. Use concat_ws() to create a full label like: 'Ali from Mumbai' .
# 17. Use regexp_replace() to remove special characters from product names.
# 18. Use to_date() and datediff() to calculate customer age in days (from
# SignupDate to today).

import datetime
from pyspark.sql.functions import col, to_date, datediff, lit

def Email(email):
    """Mask the local part of an email address, keeping its first character.

    Example: ``ali@gmail.com`` -> ``a***@gmail.com``.

    Args:
        email: The address to mask, or ``None``. A NULL column value reaches
            a Python UDF as ``None``, so it must not raise.

    Returns:
        The masked address. ``None`` is returned for ``None`` input, and a
        value that contains no ``"@"`` is returned unchanged because there is
        no local part to mask. An empty local part yields ``"***@domain"``.
    """
    if email is None:
        return None

    # Split on the first "@" only, so nothing after it is dropped.
    local_part, at_sign, domain = email.partition("@")
    if not at_sign:
        return email

    # Slicing (rather than indexing) tolerates an empty local part.
    return local_part[:1] + "***@" + domain

# 15. Wrap the Python masking function as a Spark UDF that returns a string,
#     then add the masked address next to the original Email column.
mask_email_udf = udf(Email, StringType())
customers_df = customers_df.withColumn("MaskedEmail", mask_email_udf(col("Email")))

# 16. Label such as "Ali from mumbai" (Name and City joined by " from ").
customers_df = customers_df.withColumn(
    "Label", concat_ws(" from ", col("Name"), col("City"))
)

# 17. Strip every character that is not an ASCII letter or digit from the
#     product name (this also removes spaces).
orders_df = orders_df.withColumn(
    "CleanProduct", regexp_replace(col("Product"), "[^a-zA-Z0-9]", "")
)

# 18. Customer age in days: convert SignupDate to a date, then take the
#     difference between today's date and it.
customers_df = customers_df.withColumn("SignupDate", to_date(col("SignupDate")))
today = datetime.date.today().strftime("%Y-%m-%d")
customers_df = customers_df.withColumn(
    "CustomerAgeInDays", datediff(lit(today), col("SignupDate"))
)

orders_df.show()
# Show customers_df, the frame that received the columns added above.
customers_df.show()


+-------+----------+---------+-----------+--------+-------+----------+---------+-----------+-------------+------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|OrderYear|TotalAmount|OrderCategory|CleanProduct|
+-------+----------+---------+-----------+--------+-------+----------+---------+-----------+-------------+------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|     2024|   100000.0|         High|      Laptop|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     2024|     1200.0|          Low|       Mouse|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|     2024|    20000.0|       Medium|      Tablet|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     2024|     3500.0|          Low|   Bookshelf|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     2024|     5000.0|       Medium|       Mixer|
|      6|       105| Notebook| Stationer