In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# if 'spark' in locals() and isinstance(spark, SparkSession):
#     print("Stopping existing SparkSession...")
#     spark.stop()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark

1. Data Ingestion & Exploration

In [31]:
# Load both CSV files with schema inference.
customers_df=spark.read.csv("/content/drive/MyDrive/customers.csv",header=True,inferSchema=True)
orders_df=spark.read.csv("/content/drive/MyDrive/orders.csv",header=True,inferSchema=True)

In [6]:
# List all columns and data types.
print("Customer Schema:")
customers_df.printSchema()
print("Order Schema:")
orders_df.printSchema()

Customer Schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

Order Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)



In [7]:
# Count the total number of customers and orders.
print("Total number of customers:",customers_df.count())
print("Total number of orders:",orders_df.count())

Total number of customers: 5
Total number of orders: 7


In [8]:
# Show distinct cities.
print("Distinct cities:")
customers_df.select("city").distinct().show()

Distinct cities:
+---------+
|     city|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



2. DataFrame Transformations

In [9]:
# Add a column TotalAmount = Price * Quantity .
from pyspark.sql.functions import col
orders_df=orders_df.withColumn("TotalAmount",col("Price")*col("Quantity"))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



In [11]:
# Create a new column OrderYear from OrderDate .
from pyspark.sql.functions import year,to_date
orders_df=orders_df.withColumn("OrderDate",to_date("OrderDate","yyyy-MM-dd"))
orders_df=orders_df.withColumn("OrderYear",year(col("OrderDate")))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+

In [12]:
# Filter orders with TotalAmount > 10,000 .
orders_df.filter(col("TotalAmount")>10000).show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+



In [13]:
# Drop the Email column from customers
customers_df=customers_df.drop("Email")
customers_df.show()

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Chennai|2023-03-10|
+----------+-----+---------+----------+



3. Handling Nulls & Conditionals

In [14]:
# Simulate a null in City and fill it with “Unknown”.
from pyspark.sql.functions import when
customers_df=customers_df.withColumn("City",when(col("CustomerID")==105,None).otherwise(col("City")))
customers_df=customers_df.fillna({"City":"Unknown"})
customers_df.show()

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Unknown|2023-03-10|
+----------+-----+---------+----------+



In [15]:
# Label customers as “Loyal” if SignupDate is before 2022, else “New”.
from pyspark.sql.functions import to_date, lit
customers_df=customers_df.withColumn("SignupDate",to_date("SignupDate","yyyy-MM-dd"))
customers_df=customers_df.withColumn("CustomerType",when(col("SignupDate")<lit("2022-01-01"),"Loyal").otherwise("New"))
customers_df.show()

+----------+-----+---------+----------+------------+
|CustomerID| Name|     City|SignupDate|CustomerType|
+----------+-----+---------+----------+------------+
|       101|  Ali|   Mumbai|2022-05-10|         New|
|       102| Neha|    Delhi|2023-01-15|         New|
|       103| Ravi|Bangalore|2021-11-01|       Loyal|
|       104|Sneha|Hyderabad|2020-07-22|       Loyal|
|       105| Amit|  Unknown|2023-03-10|         New|
+----------+-----+---------+----------+------------+



In [16]:
# Create OrderType column: "Low" if < 􀀀5,000, "High" if ≥ 􀀀5,000.
orders_df=orders_df.withColumn("OrderType",when(col("TotalAmount")<5000,"Low").otherwise("High"))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|      Low|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|     High|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|      Low|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|   

4. Joins & Aggregations

In [17]:
# Join customers and orders on CustomerID .
joined_df=customers_df.join(orders_df,on="CustomerID",how="inner")
joined_df.show()

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|       101|  Ali|   Mumbai|2022-05-10|         New|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|
|       101|  Ali|   Mumbai|2022-05-10|         New|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|
|       102| Neha|    Delhi|2023-01-15|         New|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|       103| Ravi|Bangalore|2021-11-01|       Loyal|      4|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|      Low|

In [19]:
# Get total orders and revenue per city.
from pyspark.sql.functions import count, sum
joined_df.groupBy("City").agg(count("OrderID").alias("TotalOrders"),sum("TotalAmount").alias("TotalRevenue")).show()

+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|Bangalore|          1|      3500.0|
|   Mumbai|          2|    101200.0|
|  Unknown|          1|      2500.0|
|    Delhi|          2|     50000.0|
|Hyderabad|          1|      5000.0|
+---------+-----------+------------+



In [20]:
# Show top 3 customers by total spend.
from pyspark.sql.functions import desc
joined_df.groupBy("CustomerID").agg(sum("TotalAmount").alias("TotalSpend")).orderBy(desc("TotalSpend")).limit(3).show()


+----------+----------+
|CustomerID|TotalSpend|
+----------+----------+
|       101|  101200.0|
|       102|   50000.0|
|       104|    5000.0|
+----------+----------+



In [21]:
# Count how many products each category has sold.
joined_df.groupBy("Category").agg(sum("Quantity").alias("TotalSold")).show()

+-----------+---------+
|   Category|TotalSold|
+-----------+---------+
| Stationery|        5|
|Electronics|        5|
|  Furniture|        1|
| Appliances|        1|
+-----------+---------+



5. Spark SQL Tasks

In [23]:
# Create database sales and switch to it.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

DataFrame[]

In [32]:
# Save both datasets as tables in the sales database.
customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

In [25]:
# Write SQL to:
# List all orders by customers from “Delhi”.
spark.sql("""SELECT o.*
    FROM sales.orders o
    JOIN sales.customers c ON o.CustomerID = c.CustomerID
    WHERE c.City = 'Delhi'""").show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+



In [29]:
# Write SQL to:
# Find average order value in each category.
spark.sql("""SELECT Category, AVG(TotalAmount) AS AvgOrderValue
    FROM sales.orders
    GROUP BY Category""").show()

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      37800.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+



In [28]:
# Write SQL to:
# Create a view monthly_orders with month-wise total amount.
spark.sql("""CREATE OR REPLACE VIEW sales.monthly_orders AS
    SELECT MONTH(OrderDate) AS Month, SUM(TotalAmount) AS TotalAmount
    FROM sales.orders
    GROUP BY MONTH(OrderDate)""")
spark.sql("SELECT * FROM monthly_orders").show()

+-----+-----------+
|Month|TotalAmount|
+-----+-----------+
|    1|   101200.0|
|    3|    32500.0|
|    2|    28500.0|
+-----+-----------+



6. String & Date Functions

In [40]:
# Mask emails using regex.
from pyspark.sql.functions import regexp_extract,lit,concat,col
masked_df=customers_df.withColumn("MaskedEmail",concat(regexp_extract(col("Email"),r'^(.).*@',1),lit("***"),regexp_extract(col("Email"),r'@.*',0)))
masked_df.show(truncate=False)

+----------+-----+-----------------+---------+----------+----------------+
|CustomerID|Name |Email            |City     |SignupDate|MaskedEmail     |
+----------+-----+-----------------+---------+----------+----------------+
|101       |Ali  |ali@gmail.com    |Mumbai   |2022-05-10|a***@gmail.com  |
|102       |Neha |neha@yahoo.com   |Delhi    |2023-01-15|n***@yahoo.com  |
|103       |Ravi |ravi@hotmail.com |Bangalore|2021-11-01|r***@hotmail.com|
|104       |Sneha|sneha@outlook.com|Hyderabad|2020-07-22|s***@outlook.com|
|105       |Amit |amit@gmail.com   |Chennai  |2023-03-10|a***@gmail.com  |
+----------+-----+-----------------+---------+----------+----------------+



In [41]:
# Concatenate Name and City as “Name from City”.
from pyspark.sql.functions import concat_ws
concat_df=customers_df.withColumn("NameFromCity",concat_ws(" from ",col("Name"),col("City")))
concat_df.show()

+----------+-----+-----------------+---------+----------+--------------------+
|CustomerID| Name|            Email|     City|SignupDate|        NameFromCity|
+----------+-----+-----------------+---------+----------+--------------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     Ali from Mumbai|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|     Neha from Delhi|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01| Ravi from Bangalore|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|Sneha from Hyderabad|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|   Amit from Chennai|
+----------+-----+-----------------+---------+----------+--------------------+



In [42]:
# Use datediff() to calculate customer age in days.
from pyspark.sql.functions import datediff,to_date,current_date
customers_df=customers_df.withColumn("SignupDate",to_date("SignupDate"))
customers_df=customers_df.withColumn("AgeInDays",datediff(current_date(),col("SignupDate"))).show()

+----------+-----+-----------------+---------+----------+---------+
|CustomerID| Name|            Email|     City|SignupDate|AgeInDays|
+----------+-----+-----------------+---------+----------+---------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     1126|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|      876|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|     1316|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|     1783|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|      822|
+----------+-----+-----------------+---------+----------+---------+



In [44]:
# Extract month name from OrderDate .
from pyspark.sql.functions import date_format
orders_df=orders_df.withColumn("MonthName",date_format("OrderDate","MMMM"))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|MonthName|
+-------+----------+---------+-----------+--------+-------+----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|  January|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|  January|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01| February|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10| February|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15| February|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|    March|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    March|
+-------+----------+---------+-----------+--------+-------+----------+---------+



7. UDFs and Complex Logic

In [46]:
#Write a UDF to tag customers:
# “Gold” if spend > 􀀀50K, “Silver” if 10K–50K, “Bronze” if <10K.
orders_df = orders_df.withColumn("TotalAmount", orders_df["Quantity"] * orders_df["Price"])
spend_df = orders_df.groupBy("CustomerID").sum("TotalAmount").withColumnRenamed("sum(TotalAmount)", "TotalSpend")

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def loyalty(spend):
    if spend>50000:
        return "Gold"
    elif 10000<=spend<=50000:
        return "Silver"
    else:
        return "Bronze"

loyalty_udf = udf(loyalty, StringType())
tagged=spend_df.withColumn("Loyalty",loyalty_udf(col("TotalSpend"))).show()

+----------+----------+-------+
|CustomerID|TotalSpend|Loyalty|
+----------+----------+-------+
|       101|  101200.0|   Gold|
|       103|    3500.0| Bronze|
|       102|   50000.0| Silver|
|       105|    2500.0| Bronze|
|       104|    5000.0| Bronze|
+----------+----------+-------+



In [48]:
# Write a UDF to shorten product names (first 3 letters + ...).
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def shorten_name(name):
    if len(name) > 3:
        return name[:3] + "..."
    else:
        return name

shorten_udf = udf(shorten_name, StringType())
orders_df.withColumn("ShortProduct", shorten_udf(col("Product"))).show()

+-------+----------+---------+-----------+--------+-------+----------+---------+-----------+------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|MonthName|TotalAmount|ShortProduct|
+-------+----------+---------+-----------+--------+-------+----------+---------+-----------+------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|  January|   100000.0|      Lap...|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|  January|     1200.0|      Mou...|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01| February|    20000.0|      Tab...|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10| February|     3500.0|      Boo...|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15| February|     5000.0|      Mix...|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|    March|     2500.0|      Not...|
|      7|       102|    Phone|Electronics|    

8. Parquet & Views

In [49]:
# Save the joined result as a Parquet file.
joined_df.write.mode("overwrite").parquet("/content/drive/MyDrive/joined_data.parquet")

In [51]:
# Read it back and verify schema.
parquet_df=spark.read.parquet("/content/drive/MyDrive/joined_data.parquet")
parquet_df.printSchema()
parquet_df.show()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|   

In [55]:
# Create and query a global temp view.
parquet_df.createOrReplaceGlobalTempView("global_orders")
spark.sql("SELECT * FROM global_temp.global_orders").show()

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|       101|  Ali|   Mumbai|2022-05-10|         New|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|
|       101|  Ali|   Mumbai|2022-05-10|         New|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|
|       102| Neha|    Delhi|2023-01-15|         New|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|       103| Ravi|Bangalore|2021-11-01|       Loyal|      4|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|      Low|

In [61]:
# Compare performance between CSV read and Parquet read.
import time
start_csv=time.time()
csv_df=spark.read.csv("/content/drive/MyDrive/orders.csv",header=True)
print("CSV read time:",time.time()-start_csv)
start_parquet=time.time()
parquet_df=spark.read.parquet("/content/drive/MyDrive/joined_data.parquet")
print("Parquet read time:",time.time()-start_parquet)


CSV read time: 0.27532172203063965
Parquet read time: 0.31235337257385254
