**Intialize the Spark Session**

In [1]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder\
    .appName("Master-Task-Set1")\
    .getOrCreate()
)
spark

**Load the Data**

In [2]:
customers = spark.read.option("header", True).option("inferSchema", True).csv("/content/customers.csv")
orders = spark.read.option("header", True).option("inferSchema", True).csv("/content/orders.csv")
customers.show()
orders.show()

+----------+-----+-----------------+---------+-------------------+
|CustomerID| Name|            Email|     City|        SignupDate |
+----------+-----+-----------------+---------+-------------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10 00:00:00|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15 00:00:00|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01 00:00:00|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22 00:00:00|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10 00:00:00|
+----------+-----+-----------------+---------+-------------------+

+--------+----------+---------+-----------+--------+-------+-------------------+
| OrderID|CustomerID|  Product|   Category|Quantity|  Price|         OrderDate |
+--------+----------+---------+-----------+--------+-------+-------------------+
|       1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10 00:00:00|
|       2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15 0

**Data Ingestion & Exploration**

In [3]:
# Schema
customers.printSchema()
orders.printSchema()
# Count records
print("Total Customers:", customers.count())
print("Total Orders:", orders.count())
# Distinct cities
customers.select("City").distinct().show()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate : timestamp (nullable = true)

root
 |--  OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate : timestamp (nullable = true)

Total Customers: 5
Total Orders: 7
+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



**DataFrame Transformations**

In [29]:
#1.Add a column TotalAmount = Price * Quantity
orders = orders.withColumn("TotalAmount", orders["Price"] * orders["Quantity"])
orders.show()
#2.Create a new column OrderYear from OrderDate
from pyspark.sql.functions import year
# Rename the column to remove the trailing space
orders = orders.withColumnRenamed("OrderDate ", "OrderDate")
orders = orders.withColumn("OrderYear", year(orders["OrderDate"]))
orders.show()
#3.Filter orders with TotalAmount > 10,000
orders = orders.filter(orders["TotalAmount"] > 10000)
orders.show()
#3.Drop the Email column from customers
customers = customers.drop("Email")
customers.show()

+--------+----------+-------+-----------+--------+-------+-------------------+-----------+---------+
| OrderID|CustomerID|Product|   Category|Quantity|  Price|         OrderDate |TotalAmount|OrderType|
+--------+----------+-------+-----------+--------+-------+-------------------+-----------+---------+
|       1|       101| Laptop|Electronics|       2|50000.0|2024-01-10 00:00:00|   100000.0|     High|
|       3|       102| Tablet|Electronics|       1|20000.0|2024-02-01 00:00:00|    20000.0|     High|
|       7|       102|  Phone|Electronics|       1|30000.0|2024-03-02 00:00:00|    30000.0|     High|
+--------+----------+-------+-----------+--------+-------+-------------------+-----------+---------+

+--------+----------+-------+-----------+--------+-------+-------------------+-----------+---------+---------+
| OrderID|CustomerID|Product|   Category|Quantity|  Price|          OrderDate|TotalAmount|OrderType|OrderYear|
+--------+----------+-------+-----------+--------+-------+------------

**Handling Nulls & Conditionals**

In [27]:
from pyspark.sql.functions import when, col
#1.Simulate and fill nulls
customers = customers.withColumn("City", when(col("CustomerID") == 105, None).otherwise(col("City")))
customers = customers.fillna({"City": "Unknown"})
customers.show()
#Rename the column to remove the trailing space
customers = customers.withColumnRenamed("SignupDate ", "SignupDate")
customers.show()
#2.Label customers by SignupDate
customers = customers.withColumn("CustomerType", when(col("SignupDate") < "2022-01-01", "Loyal").otherwise("New"))
customers.show()
#3.OrderType by TotalAmount
orders = orders.withColumn("OrderType", when(col("TotalAmount") < 5000, "Low").otherwise("High"))
orders.show()

+----------+-----+---------+-------------------+------------+
|CustomerID| Name|     City|         SignupDate|CustomerType|
+----------+-----+---------+-------------------+------------+
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|         New|
|       102| Neha|    Delhi|2023-01-15 00:00:00|         New|
|       103| Ravi|Bangalore|2021-11-01 00:00:00|       Loyal|
|       104|Sneha|Hyderabad|2020-07-22 00:00:00|       Loyal|
|       105| Amit|  Unknown|2023-03-10 00:00:00|         New|
+----------+-----+---------+-------------------+------------+

+----------+-----+---------+-------------------+------------+
|CustomerID| Name|     City|         SignupDate|CustomerType|
+----------+-----+---------+-------------------+------------+
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|         New|
|       102| Neha|    Delhi|2023-01-15 00:00:00|         New|
|       103| Ravi|Bangalore|2021-11-01 00:00:00|       Loyal|
|       104|Sneha|Hyderabad|2020-07-22 00:00:00|       Loyal|
|      

**Joins & Aggregations**

In [31]:
#1.Join customers and orders on CustomerID
joined_df = customers.join(orders, on="CustomerID", how="inner")
joined_df.show()
# Rename the ' OrderID' column to 'OrderID' after the join
joined_df = joined_df.withColumnRenamed(" OrderID", "OrderID")
#2.Get total orders and revenue per city
revenue = joined_df.groupBy("City").agg({"TotalAmount": "sum", "OrderID": "count"})
revenue
#3.Show top 3 customers by total spend
top_customers = joined_df.groupBy("CustomerID").agg({"TotalAmount": "sum"}).orderBy("sum(TotalAmount)", ascending=False).limit(3)
top_customers.show()
#4.Count how many products each category has sold
product_sales = joined_df.groupBy("Category").agg({"Quantity": "sum"})
product_sales.show()

+----------+----+------+-------------------+------------+--------+-------+-----------+--------+-------+-------------------+-----------+---------+---------+
|CustomerID|Name|  City|         SignupDate|CustomerType| OrderID|Product|   Category|Quantity|  Price|          OrderDate|TotalAmount|OrderType|OrderYear|
+----------+----+------+-------------------+------------+--------+-------+-----------+--------+-------+-------------------+-----------+---------+---------+
|       101| Ali|Mumbai|2022-05-10 00:00:00|         New|       1| Laptop|Electronics|       2|50000.0|2024-01-10 00:00:00|   100000.0|     High|     2024|
|       102|Neha| Delhi|2023-01-15 00:00:00|         New|       3| Tablet|Electronics|       1|20000.0|2024-02-01 00:00:00|    20000.0|     High|     2024|
|       102|Neha| Delhi|2023-01-15 00:00:00|         New|       7|  Phone|Electronics|       1|30000.0|2024-03-02 00:00:00|    30000.0|     High|     2024|
+----------+----+------+-------------------+------------+-------

**Spark SQL Tasks**

In [34]:
#1.Create sales DB and set context
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.catalog.setCurrentDatabase("sales")
#2.Save tables
customers.write.mode("overwrite").saveAsTable("sales.customers")
orders.write.mode("overwrite").saveAsTable("sales.orders")
#3.SQL Queries
spark.sql("""
    SELECT * FROM sales.orders o
    JOIN sales.customers c ON o.CustomerID = c.CustomerID
    WHERE c.City = 'Delhi'
""").show()

spark.sql("""
    SELECT Category, AVG(Price * Quantity) AS AvgValue
    FROM sales.orders
    GROUP BY Category
""").show()

#Create monthly view
from pyspark.sql.functions import month, sum
orders.withColumn("Month", month("OrderDate")) \
      .groupBy("Month") \
      .agg(sum("TotalAmount").alias("MonthlyTotal")) \
      .createOrReplaceTempView("monthly_orders")

spark.sql("SELECT * FROM monthly_orders").show()


+--------+----------+-------+-----------+--------+-------+-------------------+-----------+---------+---------+----------+----+-----+-------------------+------------+
| OrderID|CustomerID|Product|   Category|Quantity|  Price|          OrderDate|TotalAmount|OrderType|OrderYear|CustomerID|Name| City|         SignupDate|CustomerType|
+--------+----------+-------+-----------+--------+-------+-------------------+-----------+---------+---------+----------+----+-----+-------------------+------------+
|       3|       102| Tablet|Electronics|       1|20000.0|2024-02-01 00:00:00|    20000.0|     High|     2024|       102|Neha|Delhi|2023-01-15 00:00:00|         New|
|       7|       102|  Phone|Electronics|       1|30000.0|2024-03-02 00:00:00|    30000.0|     High|     2024|       102|Neha|Delhi|2023-01-15 00:00:00|         New|
+--------+----------+-------+-----------+--------+-------+-------------------+-----------+---------+---------+----------+----+-----+-------------------+------------+

+--

**String & Date Functions**

In [41]:
# Reload to get Email again
from pyspark.sql.functions import regexp_replace, concat_ws, col, lit, to_date, current_date, datediff, date_format
customers = spark.read.option("header", True).option("inferSchema", True).csv("customers.csv")
customers.show
# Rename the column to remove the trailing space after reloading
customers = customers.withColumnRenamed("SignupDate ", "SignupDate")
customers.show()
#1.Mask email using regex
customers = customers.withColumn("MaskedEmail", regexp_replace("Email", "(.).+(@.+)", "$1***$2"))
customers.show()
#2.Concatenate label
customers = customers.withColumn("Label", concat_ws(" ", col("Name"), lit("from"), col("City")))
customers.show()
#3.Calculate customer age in days
customers = customers.withColumn("SignupDate", to_date("SignupDate")) \
                     .withColumn("CustomerAgeDays", datediff(current_date(), col("SignupDate")))
customers.show()
#4.Month name from OrderDate
orders = orders.withColumn("OrderMonth", date_format("OrderDate", "MMMM"))
orders.show()

+----------+-----+-----------------+---------+-------------------+
|CustomerID| Name|            Email|     City|         SignupDate|
+----------+-----+-----------------+---------+-------------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10 00:00:00|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15 00:00:00|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01 00:00:00|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22 00:00:00|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10 00:00:00|
+----------+-----+-----------------+---------+-------------------+

+----------+-----+-----------------+---------+-------------------+----------------+
|CustomerID| Name|            Email|     City|         SignupDate|     MaskedEmail|
+----------+-----+-----------------+---------+-------------------+----------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10 00:00:00|  a***@gmail.com|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15 00:

**UDFs and Complex Logic**

In [42]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
# UDF to tag customers
def tag_customer(spend):
    if spend is None: return "None"
    elif spend > 50000: return "Gold"
    elif spend >= 10000: return "Silver"
    else: return "Bronze"
tag_udf = udf(tag_customer, StringType())
# UDF to shorten product names
def shorten_product(name):
    return name[:3] + "..." if name else ""
shorten_udf = udf(shorten_product, StringType())
# Compute spend per customer and apply tag
customer_spend = orders.groupBy("CustomerID").agg(sum("TotalAmount").alias("TotalSpend"))
customers = customers.join(customer_spend, on="CustomerID", how="left") \
                     .withColumn("Tier", tag_udf("TotalSpend"))
customers.show()
# Shorten product names
orders = orders.withColumn("ShortProduct", shorten_udf("Product"))
orders.show()


+----------+-----+-----------------+---------+----------+----------------+--------------------+---------------+----------+------+
|CustomerID| Name|            Email|     City|SignupDate|     MaskedEmail|               Label|CustomerAgeDays|TotalSpend|  Tier|
+----------+-----+-----------------+---------+----------+----------------+--------------------+---------------+----------+------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|  a***@gmail.com|     Ali from Mumbai|           1126|  100000.0|  Gold|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|  n***@yahoo.com|     Neha from Delhi|            876|   50000.0|Silver|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|r***@hotmail.com| Ravi from Bangalore|           1316|      NULL|  None|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|s***@outlook.com|Sneha from Hyderabad|           1783|      NULL|  None|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|  a***@gmail.com|   Amit from Che

**Parquet & Views**

In [46]:
# Save as Parquet
joined_df.write.mode("overwrite").parquet("/tmp/joined_orders")
# Read back
parquet_df = spark.read.parquet("/tmp/joined_orders")
parquet_df.printSchema()
# Global temp view
parquet_df.createOrReplaceGlobalTempView("global_joined")
spark.sql("SELECT * FROM global_temp.global_joined").show()
# Compare performance between CSV read and Parquet read
parquet_df.explain()
customers.explain()
import time
start_csv = time.time()
_ = spark.read.option("header", True).csv("orders.csv").count()
end_csv = time.time()
start_parquet = time.time()
_ = spark.read.parquet("/tmp/joined_orders").count()
end_parquet = time.time()
print("CSV Read Time:", end_csv - start_csv)
print("Parquet Read Time:", end_parquet - start_parquet)

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: timestamp (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: timestamp (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- OrderYear: integer (nullable = true)

+----------+----+------+-------------------+------------+-------+-------+-----------+--------+-------+-------------------+-----------+---------+---------+
|CustomerID|Name|  City|         SignupDate|CustomerType|OrderID|Product|   Category|Quantity|  Price|          OrderDate|TotalAmount|OrderType|OrderYear|
+----------+----+------+-------------------+------------+-------+-------+-----------+--------+-------+-----------