**Intialize the Spark Session**

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder\
        .appName("PracticeProject")\
        .enableHiveSupport()\
        .getOrCreate()
spark


**Data Preparation**

In [15]:
# Customer Data
customers_data = [
    (101, 'Ali', 'ali@gmail.com', 'Mumbai', '2022-05-10'),
    (102, 'Neha', 'neha@yahoo.com', 'Delhi', '2023-01-15'),
    (103, 'Ravi', 'ravi@hotmail.com', 'Bangalore', '2021-11-01'),
    (104, 'Sneha', 'sneha@outlook.com', 'Hyderabad', '2020-07-22'),
    (105, 'Amit', 'amit@gmail.com', 'Chennai', '2023-03-10'),
]
# Orders Data
orders_data = [
    (1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
    (2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
    (3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
    (4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
    (5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
    (6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
    (7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'),
]
customers_df = spark.createDataFrame(customers_data, ["CustomerID", "Name", "Email", "City", "SignupDate"])
orders_df = spark.createDataFrame(orders_data, ["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"])
#create sales schema
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
# save as tables
customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

**SECTION A: PySpark DataFrame Tasks**

In [16]:
#1.Add TotalAmount = Price * Quantity
orders_df =orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))
print("Step 1: TotalAmount column added")
orders_df.show()
#2.Filter orders where TotalAmount > 10000
print("Orders where TotalAmount > 10000")
orders_df.filter(col("TotalAmount") > 10000).show()
#3.Standardize City field (lowercase)
customers_df =customers_df.withColumn("City", lower(col("City")))
print("Customers with lowercase city names")
customers_df.show()
#4.Extract OrderYear
orders_df =orders_df.withColumn("OrderYear", year(col("OrderDate")))
print("Orders with OrderYear extracted")
orders_df.show()
#5.Fill nulls in any column
orders_df =orders_df.fillna({"Category": "Misc"})
print("Orders after filling nulls in Category with 'Misc'")
orders_df.show()
#6.Categorize orders using when/otherwise
orders_df =orders_df.withColumn("OrderCategory",
    when(col("TotalAmount") < 5000, "Low")
    .when((col("TotalAmount") >= 5000) & (col("TotalAmount") <= 20000), "Medium")
    .otherwise("High")
)
print("Orders categorized by TotalAmount")
orders_df.show()


Step 1: TotalAmount column added
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+

Orders where TotalAmount > 10000
+-------+----------+

**SECTION B: Spark SQL Tasks**

In [17]:
#7.Orders made by "Ali"
print("Orders made by Ali:")
spark.sql("""
    SELECT o.* FROM sales.orders o
    JOIN sales.customers c ON o.CustomerID = c.CustomerID
    WHERE c.Name = 'Ali'
""").show()
#8.Total spending per customer
print("Total spending per customer:")
spark.sql("""
    SELECT c.Name, SUM(o.Price * o.Quantity) AS TotalSpent
    FROM sales.orders o
    JOIN sales.customers c ON o.CustomerID = c.CustomerID
    GROUP BY c.Name
""").show()
#9.Category with highest total revenue
print("Category with highest total revenue:")
spark.sql("""
    SELECT Category, SUM(Price * Quantity) AS Revenue
    FROM sales.orders
    GROUP BY Category
    ORDER BY Revenue DESC
    LIMIT 1
""").show()
#10.Create view customer_orders
print("View customer_orders created")
spark.sql("""
    CREATE OR REPLACE VIEW customer_orders AS
    SELECT c.Name AS CustomerName, o.Product, o.Price * o.Quantity AS TotalAmount
    FROM sales.orders o
    JOIN sales.customers c ON o.CustomerID = c.CustomerID
""")
#11.Query for orders after Feb 2024
print("Orders after Feb 2024:")
spark.sql("""
    SELECT * FROM customer_orders
    WHERE TotalAmount IS NOT NULL AND Product IN (
        SELECT Product FROM sales.orders WHERE OrderDate > '2024-02-01'
    )
""").show()


Orders made by Ali:
+-------+----------+-------+-----------+--------+-------+----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+-------+-----------+--------+-------+----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|
+-------+----------+-------+-----------+--------+-------+----------+

Total spending per customer:
+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Neha|   50000.0|
|  Ali|  101200.0|
| Ravi|    3500.0|
|Sneha|    5000.0|
| Amit|    2500.0|
+-----+----------+

Category with highest total revenue:
+-----------+--------+
|   Category| Revenue|
+-----------+--------+
|Electronics|151200.0|
+-----------+--------+

View customer_orders created
Orders after Feb 2024:
+------------+---------+-----------+
|CustomerName|  Product|TotalAmount|
+------------+---------+-----------+
|        Ravi|Bookshelf|     3500.0|
|       Sne

**SECTION C: Advanced Practice**

In [18]:
#12.Global Temp View
print("Global Temp View created")
customers_df.createOrReplaceGlobalTempView("customers")
spark.sql("SELECT * FROM global_temp.customers WHERE City = 'mumbai'").show()
#13.Save orders_df (with TotalAmount) to Parquet
orders_df.write.mode("overwrite").parquet("/tmp/orders_parquet")
print("Orders saved to Parquet")
#14.Read back and count
parquet_df = spark.read.parquet("/tmp/orders_parquet")
print("Total Orders:", parquet_df.count())

Global Temp View created
+----------+----+-------------+------+----------+
|CustomerID|Name|        Email|  City|SignupDate|
+----------+----+-------------+------+----------+
|       101| Ali|ali@gmail.com|mumbai|2022-05-10|
+----------+----+-------------+------+----------+

Orders saved to Parquet
Total Orders: 7


**SECTION D: UDF + Built-in Function Tasks**

In [19]:
from pyspark.sql.types import StringType
#15.UDF to mask email
def mask_email(email):
    name, domain = email.split("@")
    return name[0] + "***@" + domain
mask_email_udf = udf(mask_email, StringType())
customers_df = customers_df.withColumn("MaskedEmail", mask_email_udf(col("Email")))
print("Customers with masked emails:")
customers_df.show()
#16.concat_ws() to create full label like "Ali from Mumbai"
customers_df = customers_df.withColumn("Label", concat_ws(" ", col("Name"), lit("from"), col("City")))
print("Customers with full labels:")
customers_df.show()
# 17. regexp_replace to clean product names
orders_df = orders_df.withColumn("CleanProduct", regexp_replace("Product", "[^a-zA-Z0-9]", ""))
print("Orders with cleaned product names:")
orders_df.show()
#18.to_date and datediff to compute customer age (days since signup)
customers_df = customers_df.withColumn("SignupDate", to_date("SignupDate"))
customers_df = customers_df.withColumn("CustomerAgeDays", datediff(current_date(), col("SignupDate")))
print("Customers with age in days:")
customers_df.show()


Customers with masked emails:
+----------+-----+-----------------+---------+----------+----------------+
|CustomerID| Name|            Email|     City|SignupDate|     MaskedEmail|
+----------+-----+-----------------+---------+----------+----------------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|  a***@gmail.com|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|  n***@yahoo.com|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01|r***@hotmail.com|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|s***@outlook.com|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|  a***@gmail.com|
+----------+-----+-----------------+---------+----------+----------------+

Customers with full labels:
+----------+-----+-----------------+---------+----------+----------------+--------------------+
|CustomerID| Name|            Email|     City|SignupDate|     MaskedEmail|               Label|
+----------+-----+-----------------+---------+----------+----------------+

**Display of final customer and order data**

In [20]:
customers_df.show(truncate=False)
orders_df.show(truncate=False)

+----------+-----+-----------------+---------+----------+----------------+--------------------+---------------+
|CustomerID|Name |Email            |City     |SignupDate|MaskedEmail     |Label               |CustomerAgeDays|
+----------+-----+-----------------+---------+----------+----------------+--------------------+---------------+
|101       |Ali  |ali@gmail.com    |mumbai   |2022-05-10|a***@gmail.com  |Ali from mumbai     |1121           |
|102       |Neha |neha@yahoo.com   |delhi    |2023-01-15|n***@yahoo.com  |Neha from delhi     |871            |
|103       |Ravi |ravi@hotmail.com |bangalore|2021-11-01|r***@hotmail.com|Ravi from bangalore |1311           |
|104       |Sneha|sneha@outlook.com|hyderabad|2020-07-22|s***@outlook.com|Sneha from hyderabad|1778           |
|105       |Amit |amit@gmail.com   |chennai  |2023-03-10|a***@gmail.com  |Amit from chennai   |817            |
+----------+-----+-----------------+---------+----------+----------------+--------------------+---------