Import Statements

In [None]:
import time

from google.colab import drive
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, concat, concat_ws, count, current_date, date_format, datediff,
    lit, lower, regexp_replace, sum, to_date, udf, when, year,
)
from pyspark.sql.types import StringType

Creating PySpark Session

In [None]:
# Build (or reuse) the Spark session named "Assessment1"; later cells use `spark`.
session_builder = SparkSession.builder.appName("Assessment1")
spark = session_builder.getOrCreate()

# Bare last expression so the notebook renders the session summary.
spark

Mounting Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## TASKS

### 1. Data Ingestion & Exploration

Load both CSV files with schema inference.

In [None]:
# Source CSVs live on the mounted Google Drive.
customers_csv_path = '/content/drive/MyDrive/PysparkAssessment/customers.csv'
orders_csv_path = '/content/drive/MyDrive/PysparkAssessment/orders.csv'

# Reading customer data (first row is the header; column types are inferred)
customer_df = spark.read.csv(customers_csv_path, header=True, inferSchema=True)

# Reading order data
order_df = spark.read.csv(orders_csv_path, header=True, inferSchema=True)

# Some CSV headers carry stray whitespace (e.g. "SignupDate ", "OrderDate "),
# which would make references such as col("SignupDate") or order_df.OrderDate
# fail to resolve. Normalise the names once, right after loading.
customer_df = customer_df.toDF(*[name.strip() for name in customer_df.columns])
order_df = order_df.toDF(*[name.strip() for name in order_df.columns])

 List all columns and data types.

In [None]:
# Show column names and inferred types for both frames.
# The first label previously read "Employee Data" although the frame holds customers.
print("\n Customer Data")
customer_df.printSchema()

print("\n Order Data")
order_df.printSchema()


 Employee Data
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate : timestamp (nullable = true)


 Order Data
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate : timestamp (nullable = true)



Count the total number of customers and orders.

In [None]:
# Each count() triggers a Spark job; compute both, then report.
total_customers = customer_df.count()
total_orders = order_df.count()

print(f"Total Number of Customers: {total_customers}")
print(f"Total Number of Orders: {total_orders}")

Total Number of Customers: 5
Total Number of Orders: 7


Show distinct cities.

In [None]:
# One row per unique City value in the customer data.
distinct_cities_df = customer_df.select("City").distinct()
distinct_cities_df.show()

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



### 2. DataFrame Transformation

Add a column TotalAmount = Price * Quantity

In [None]:
# TotalAmount is the line value of each order: unit price times quantity.
line_total = col("Price") * col("Quantity")
order_df = order_df.withColumn("TotalAmount", line_total)

amount_preview_columns = ["OrderID", "CustomerID", "Product", "Quantity", "Price", "TotalAmount"]
order_df.select(*amount_preview_columns).show()

+-------+----------+---------+--------+-------+-----------+
|OrderID|CustomerID|  Product|Quantity|  Price|TotalAmount|
+-------+----------+---------+--------+-------+-----------+
|      1|       101|   Laptop|       2|50000.0|   100000.0|
|      2|       101|    Mouse|       1| 1200.0|     1200.0|
|      3|       102|   Tablet|       1|20000.0|    20000.0|
|      4|       103|Bookshelf|       1| 3500.0|     3500.0|
|      5|       104|    Mixer|       1| 5000.0|     5000.0|
|      6|       105| Notebook|       5|  500.0|     2500.0|
|      7|       102|    Phone|       1|30000.0|    30000.0|
+-------+----------+---------+--------+-------+-----------+



Create a new column OrderYear from OrderDate

In [None]:
# Derive the calendar year of each order from its OrderDate timestamp.
order_year = year(col("OrderDate"))
order_df = order_df.withColumn("OrderYear", order_year)

year_preview_columns = ["OrderID", "CustomerID", "Product", "Quantity", "TotalAmount", "OrderDate", "OrderYear"]
order_df.select(*year_preview_columns).show()

+-------+----------+---------+--------+-----------+-------------------+---------+
|OrderID|CustomerID|  Product|Quantity|TotalAmount|          OrderDate|OrderYear|
+-------+----------+---------+--------+-----------+-------------------+---------+
|      1|       101|   Laptop|       2|   100000.0|2024-01-10 00:00:00|     2024|
|      2|       101|    Mouse|       1|     1200.0|2024-01-15 00:00:00|     2024|
|      3|       102|   Tablet|       1|    20000.0|2024-02-01 00:00:00|     2024|
|      4|       103|Bookshelf|       1|     3500.0|2024-02-10 00:00:00|     2024|
|      5|       104|    Mixer|       1|     5000.0|2024-02-15 00:00:00|     2024|
|      6|       105| Notebook|       5|     2500.0|2024-03-01 00:00:00|     2024|
|      7|       102|    Phone|       1|    30000.0|2024-03-02 00:00:00|     2024|
+-------+----------+---------+--------+-----------+-------------------+---------+



Filter orders with TotalAmount > 10,000

In [None]:
# Keep only orders whose TotalAmount is strictly greater than 10,000.
high_value_orders_df = order_df.where(col("TotalAmount") > 10000)
high_value_orders_df.select(
    "OrderID", "CustomerID", "Product", "Quantity", "Price", "TotalAmount"
).show()

+-------+----------+-------+--------+-------+-----------+
|OrderID|CustomerID|Product|Quantity|  Price|TotalAmount|
+-------+----------+-------+--------+-------+-----------+
|      1|       101| Laptop|       2|50000.0|   100000.0|
|      3|       102| Tablet|       1|20000.0|    20000.0|
|      7|       102|  Phone|       1|30000.0|    30000.0|
+-------+----------+-------+--------+-------+-----------+



Drop the Email column from the customers DataFrame.

In [None]:
# Remove the Email column from the working customer frame.
columns_to_drop = ["Email"]
customer_df = customer_df.drop(*columns_to_drop)
customer_df.show()

+----------+-----+---------+-------------------+
|CustomerID| Name|     City|        SignupDate |
+----------+-----+---------+-------------------+
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|
|       102| Neha|    Delhi|2023-01-15 00:00:00|
|       103| Ravi|Bangalore|2021-11-01 00:00:00|
|       104|Sneha|Hyderabad|2020-07-22 00:00:00|
|       105| Amit|  Chennai|2023-03-10 00:00:00|
+----------+-----+---------+-------------------+



### 3. Handling Nulls & Conditionals

Simulate a null in City and fill it with “Unknown”.

In [None]:
# The task asks to *simulate* a null City first. Do that on a scratch copy so
# the working customer_df keeps its real city values for the later sections.
customer_null_demo_df = customer_df.withColumn(
    "City",
    when(col("CustomerID") == 105, lit(None).cast("string")).otherwise(col("City")),
)
customer_null_demo_df.show()

# Demonstrate the fill: the simulated null City becomes "Unknown".
customer_null_demo_df.fillna({"City": "Unknown"}).show()

# Apply the same rule to the working frame so any genuine nulls are handled too.
customer_df = customer_df.fillna({"City": "Unknown"})
customer_df.show()

+----------+-----+---------+-------------------+
|CustomerID| Name|     City|        SignupDate |
+----------+-----+---------+-------------------+
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|
|       102| Neha|    Delhi|2023-01-15 00:00:00|
|       103| Ravi|Bangalore|2021-11-01 00:00:00|
|       104|Sneha|Hyderabad|2020-07-22 00:00:00|
|       105| Amit|  Chennai|2023-03-10 00:00:00|
+----------+-----+---------+-------------------+



 Label customers as “Loyal” if SignupDate is before 2022, else “New”.

In [None]:
# Customers who signed up before 2022 are "Loyal"; everyone else is "New".
signup_year = year(col("SignupDate"))
customer_type = when(signup_year < 2022, "Loyal").otherwise("New")

customer_df = customer_df.withColumn("Customer_type", customer_type)
customer_df.select(
    "CustomerID",
    "Name",
    signup_year.alias("Signup_year"),
    "Customer_type",
).show()

+----------+-----+-----------+-------------+
|CustomerID| Name|Signup_year|Customer_type|
+----------+-----+-----------+-------------+
|       101|  Ali|       2022|          New|
|       102| Neha|       2023|          New|
|       103| Ravi|       2021|        Loyal|
|       104|Sneha|       2020|        Loyal|
|       105| Amit|       2023|          New|
+----------+-----+-----------+-------------+



Create OrderType column: "Low" if < 5,000, "High" if ≥ 5,000.

In [None]:
# "Low" for totals under 5,000; everything else falls through to "High".
order_type = when(col("TotalAmount") < 5000, "Low").otherwise("High")
order_df = order_df.withColumn("OrderType", order_type)

order_df.select("OrderID", "CustomerID", "Product", "TotalAmount", "OrderType").show()

+-------+----------+---------+-----------+---------+
|OrderID|CustomerID|  Product|TotalAmount|OrderType|
+-------+----------+---------+-----------+---------+
|      1|       101|   Laptop|   100000.0|     High|
|      2|       101|    Mouse|     1200.0|      Low|
|      3|       102|   Tablet|    20000.0|     High|
|      4|       103|Bookshelf|     3500.0|      Low|
|      5|       104|    Mixer|     5000.0|     High|
|      6|       105| Notebook|     2500.0|      Low|
|      7|       102|    Phone|    30000.0|     High|
+-------+----------+---------+-----------+---------+



### 4. Joins & Aggregation

Join customers and orders on CustomerID .


In [None]:
# Inner join on the shared key: only customers with at least one order survive.
customer_order_df = customer_df.join(order_df, "CustomerID", "inner")
customer_order_df.show()

+----------+-----+---------+-------------------+-------------+-------+---------+-----------+--------+-------+-------------------+-----------+---------+---------+
|CustomerID| Name|     City|         SignupDate|Customer_type|OrderID|  Product|   Category|Quantity|  Price|          OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+-------------------+-------------+-------+---------+-----------+--------+-------+-------------------+-----------+---------+---------+
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|          New|      1|   Laptop|Electronics|       2|50000.0|2024-01-10 00:00:00|   100000.0|     2024|     High|
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|          New|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15 00:00:00|     1200.0|     2024|      Low|
|       102| Neha|    Delhi|2023-01-15 00:00:00|          New|      3|   Tablet|Electronics|       1|20000.0|2024-02-01 00:00:00|    20000.0|     2024|     High|
|       103| Ravi|Bangalore|

Get total orders and revenue per city.

In [None]:
# Per-city order count and revenue, highest revenue first.
city_summary_df = customer_order_df.groupBy("City").agg(
    count("OrderID").alias("TotalOrders"),
    sum("TotalAmount").alias("TotalRevenue"),
)
city_summary_df.orderBy(col("TotalRevenue").desc()).show()


+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|   Mumbai|          2|    101200.0|
|    Delhi|          2|     50000.0|
|Hyderabad|          1|      5000.0|
|Bangalore|          1|      3500.0|
|  Chennai|          1|      2500.0|
+---------+-----------+------------+



Show top 3 customers by total spend.

In [None]:
# Top 3 customers by total spend.
# The aggregate alias must match the column used in orderBy; previously the
# alias was "Total_amount_spend" while the sort referenced "TotalAmountSpend",
# which cannot be resolved.
top_spenders_df = (
    customer_order_df
    .groupBy("CustomerID")
    .agg(sum("TotalAmount").alias("TotalAmountSpend"))
    .orderBy("TotalAmountSpend", ascending=False)
    .limit(3)
)
top_spenders_df.show()

+----------+----------------+
|CustomerID|TotalAmountSpend|
+----------+----------------+
|       101|        101200.0|
|       102|         50000.0|
|       104|          5000.0|
+----------+----------------+



Count how many products each category has sold.

In [None]:
# Units sold per category, largest first.
# The alias and the sort column now agree; previously the alias was
# "Total_products_sold" while orderBy referenced "TotalProductsSold".
category_sales_df = (
    customer_order_df
    .groupBy("Category")
    .agg(sum("Quantity").alias("TotalProductsSold"))
    .orderBy("TotalProductsSold", ascending=False)
)
category_sales_df.show()

+-----------+-----------------+
|   Category|TotalProductsSold|
+-----------+-----------------+
| Stationery|                5|
|Electronics|                5|
|  Furniture|                1|
| Appliances|                1|
+-----------+-----------------+



### 5.  Spark SQL Tasks

Create database sales and switch to it.

In [None]:
# Create the `sales` database if it is not there yet (safe to re-run).
spark.sql("""
          CREATE DATABASE IF NOT EXISTS sales
""")

# Make `sales` the current database for subsequent SQL statements.
spark.sql("USE sales")

DataFrame[]

Save both datasets as tables in the sales database.

In [None]:
# Persist both frames as tables in the sales database, replacing any
# previous copy so the cell can be re-run.
tables_to_save = (
    ("sales.orders", order_df),
    ("sales.customers", customer_df),
)
for table_name, frame in tables_to_save:
    frame.write.mode("overwrite").saveAsTable(table_name)

 List all orders by customers from “Delhi”.

In [None]:
# Orders placed by customers whose City is 'Delhi' (orders joined to customers).
delhi_orders_query = """
          SELECT
            c.CustomerID,
            o.OrderID,
            o.Product,
            o.Quantity,
            o.Price,
            o.TotalAmount,
            CAST(o.OrderDate AS DATE) AS OrderDate
          FROM sales.orders AS o
          JOIN sales.customers AS c
          ON o.CustomerID = c.CustomerID
          WHERE c.City = 'Delhi'
"""
delhi_orders_df = spark.sql(delhi_orders_query)
delhi_orders_df.show()

+----------+-------+-------+--------+-------+-----------+----------+
|CustomerID|OrderID|Product|Quantity|  Price|TotalAmount| OrderDate|
+----------+-------+-------+--------+-------+-----------+----------+
|       102|      3| Tablet|       1|20000.0|    20000.0|2024-02-01|
|       102|      7|  Phone|       1|30000.0|    30000.0|2024-03-02|
+----------+-------+-------+--------+-------+-----------+----------+



Find average order value in each category.

In [None]:
# Mean TotalAmount per product category, highest average first.
avg_order_value_query = """
          SELECT
            Category,
            AVG(TotalAmount) AS AvgOrderValue
          FROM sales.orders
          GROUP BY Category
          ORDER BY AvgOrderValue DESC
"""
avg_order_value_df = spark.sql(avg_order_value_query)
avg_order_value_df.show()

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
|Electronics|      37800.0|
| Appliances|       5000.0|
|  Furniture|       3500.0|
| Stationery|       2500.0|
+-----------+-------------+



Create a view monthly_orders with month-wise total amount.

In [None]:
# (Re)define the sales.monthly_orders view: total order amount per month number.
monthly_orders_ddl = """
          CREATE OR REPLACE VIEW sales.monthly_orders AS
          SELECT
            MONTH(OrderDate) AS Month,
            SUM(TotalAmount) AS TotalAmount
          FROM sales.orders
          GROUP BY Month
          ORDER BY Month
"""
spark.sql(monthly_orders_ddl)

# Query the view to confirm it was created and returns the monthly totals.
monthly_orders_df = spark.sql("SELECT * FROM sales.monthly_orders")
monthly_orders_df.show()

+-----+-----------+
|Month|TotalAmount|
+-----+-----------+
|    1|   101200.0|
|    2|    28500.0|
|    3|    32500.0|
+-----+-----------+



### 6. String & Date Functions

Mask emails using regex (e.g., a***@gmail.com)

In [None]:
# Mask the *real* email addresses with a regex instead of fabricating
# "<first letter of Name>***@gmail.com", which ignored the actual address and
# hard-coded the domain.
if "Email" in customer_df.columns:
    customers_with_email_df = customer_df
else:
    # Email was dropped from the working frame earlier in the notebook, so
    # bring it back from the source CSV, keyed by CustomerID.
    email_lookup_df = spark.read.csv(
        '/content/drive/MyDrive/PysparkAssessment/customers.csv',
        header=True,
        inferSchema=True,
    ).select("CustomerID", "Email")
    customers_with_email_df = customer_df.join(email_lookup_df, on="CustomerID", how="left")

# Keep the first character of the local part, replace the rest up to "@" with
# "***", and leave the domain untouched: ali@gmail.com -> a***@gmail.com.
masked_email = regexp_replace(lower(col("Email")), r"^(.)[^@]*", "$1***")

# Drop the raw address again so only the masked form stays on the frame.
customer_df = customers_with_email_df.withColumn("MaskedEmail", masked_email).drop("Email")

customer_df.select("CustomerID", "Name", "MaskedEmail").show()

+----------+-----+--------------+
|CustomerID| Name|   MaskedEmail|
+----------+-----+--------------+
|       101|  Ali|a***@gmail.com|
|       102| Neha|n***@gmail.com|
|       103| Ravi|r***@gmail.com|
|       104|Sneha|s***@gmail.com|
|       105| Amit|a***@gmail.com|
+----------+-----+--------------+



Concatenate Name and City as "Name from City"

In [None]:
# Build "<Name> from <City>" by joining the two columns with " from ".
city_info = concat_ws(" from ", col("Name"), col("City"))
customer_df = customer_df.withColumn("CityInfo", city_info)

customer_df.select("CustomerID", "Name", "City", "CityInfo").show()

+----------+-----+---------+--------------------+
|CustomerID| Name|     City|            CityInfo|
+----------+-----+---------+--------------------+
|       101|  Ali|   Mumbai|     Ali from Mumbai|
|       102| Neha|    Delhi|     Neha from Delhi|
|       103| Ravi|Bangalore| Ravi from Bangalore|
|       104|Sneha|Hyderabad|Sneha from Hyderabad|
|       105| Amit|  Chennai|   Amit from Chennai|
+----------+-----+---------+--------------------+



Use datediff() to calculate customer age in days.

In [None]:
# Days elapsed between the signup date and today (value changes with the run date).
signup_day = to_date(col("SignupDate"))
age_in_days = datediff(current_date(), signup_day)
customer_df = customer_df.withColumn("CustomerAgeInDays", age_in_days)

customer_df.select("CustomerID", "Name", "SignupDate", "CustomerAgeInDays").show()

+----------+-----+-------------------+-----------------+
|CustomerID| Name|         SignupDate|CustomerAgeInDays|
+----------+-----+-------------------+-----------------+
|       101|  Ali|2022-05-10 00:00:00|             1126|
|       102| Neha|2023-01-15 00:00:00|              876|
|       103| Ravi|2021-11-01 00:00:00|             1316|
|       104|Sneha|2020-07-22 00:00:00|             1783|
|       105| Amit|2023-03-10 00:00:00|              822|
+----------+-----+-------------------+-----------------+



Extract month name from OrderDate .

In [None]:
# "MMMM" formats the date as the full month name (January, February, ...).
# Note: the result is stored in `orders_df`, leaving `order_df` unchanged.
order_month_name = date_format(col("OrderDate"), "MMMM")
orders_df = order_df.withColumn("OrderMonthName", order_month_name)

orders_df.select("OrderID", "OrderDate", "OrderMonthName").show()

+-------+-------------------+--------------+
|OrderID|          OrderDate|OrderMonthName|
+-------+-------------------+--------------+
|      1|2024-01-10 00:00:00|       January|
|      2|2024-01-15 00:00:00|       January|
|      3|2024-02-01 00:00:00|      February|
|      4|2024-02-10 00:00:00|      February|
|      5|2024-02-15 00:00:00|      February|
|      6|2024-03-01 00:00:00|         March|
|      7|2024-03-02 00:00:00|         March|
+-------+-------------------+--------------+



### 7. UDFs and Complex Logic

Write a UDF to tag customers: \
 “Gold” if spend > 50K, “Silver” if 10K–50K, “Bronze” if <10K.

In [None]:
def customer_tags(spend):
    """Map a customer's total spend to a loyalty tier.

    Args:
        spend: Total amount spent, or None when the aggregate is null.

    Returns:
        "Gold" for spend above 50,000, "Silver" for 10,000 up to and
        including 50,000, "Bronze" below 10,000, and None when spend is None.
    """
    # A null aggregate arrives here as None; comparing None with a number
    # raises TypeError, so pass the null through instead.
    if spend is None:
        return None
    if spend > 50000:
        return "Gold"
    if spend >= 10000:
        return "Silver"
    return "Bronze"

# Wrap the Python tagging function as a Spark UDF that returns a string.
tag_udf = udf(customer_tags, StringType())

# Total spend per customer. Aggregate from `order_df`, the frame every other
# cell works on, rather than the one-off `orders_df` created only for the
# month-name example; TotalAmount is all this aggregation needs.
customer_spend_df = (
    order_df
    .groupBy("CustomerID")
    .agg(sum("TotalAmount").alias("TotalSpend"))
)

tagged_customer_df = customer_spend_df.withColumn("CustomerTag", tag_udf("TotalSpend"))
tagged_customer_df.show()

+----------+----------+-----------+
|CustomerID|TotalSpend|CustomerTag|
+----------+----------+-----------+
|       101|  101200.0|       Gold|
|       103|    3500.0|     Bronze|
|       102|   50000.0|     Silver|
|       105|    2500.0|     Bronze|
|       104|    5000.0|     Bronze|
+----------+----------+-----------+



 Write a UDF to shorten product names (first 3 letters + ...).

In [None]:
def shorten_product_names(name):
    """Abbreviate a product name to its first three characters plus "...".

    Returns None unchanged when the input is None.
    """
    if name is not None:
        prefix = name[:3]
        return prefix + "..."
    return None

# Register the shortening function as a string-returning UDF and apply it to Product.
shorten_udf = udf(shorten_product_names, StringType())

short_product_name = shorten_udf(col("Product"))
order_df = order_df.withColumn("Product_name_in_short", short_product_name)

order_df.select("OrderID", "CustomerID", "Product", "Product_name_in_short").show()

+-------+----------+---------+---------------------+
|OrderID|CustomerID|  Product|Product_name_in_short|
+-------+----------+---------+---------------------+
|      1|       101|   Laptop|               Lap...|
|      2|       101|    Mouse|               Mou...|
|      3|       102|   Tablet|               Tab...|
|      4|       103|Bookshelf|               Boo...|
|      5|       104|    Mixer|               Mix...|
|      6|       105| Notebook|               Not...|
|      7|       102|    Phone|               Pho...|
+-------+----------+---------+---------------------+



### 8. Parquet & Views

Save the joined result as a Parquet file.

In [None]:
# Write the joined customer/order frame as Parquet, replacing any previous output at this path.
customer_order_df.write.mode("overwrite").parquet("/mnt/data/customer_orders.parquet")

Read it back and verify schema.

In [None]:
# Load the Parquet output back and inspect its schema and rows.
parquet_path = "/mnt/data/customer_orders.parquet"
parquet_df = spark.read.parquet(parquet_path)

parquet_df.printSchema()
parquet_df.show()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: timestamp (nullable = true)
 |-- Customer_type: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: timestamp (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)

+----------+-----+---------+-------------------+-------------+-------+---------+-----------+--------+-------+-------------------+-----------+---------+---------+
|CustomerID| Name|     City|         SignupDate|Customer_type|OrderID|  Product|   Category|Quantity|  Price|          OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+-------------------+-------------+-------+---------+-----------+------

 Create and query a global temp view.

In [None]:
# Register the Parquet-backed frame as a global temp view.
# createOrReplaceGlobalTempView keeps the cell re-runnable; plain
# createGlobalTempView raises if the view already exists in this application.
parquet_df.createOrReplaceGlobalTempView("global_customer_orders")

# Global temp views are addressed through the `global_temp` database.
spark.sql("SELECT * FROM global_temp.global_customer_orders").show()

+----------+-----+---------+-------------------+-------------+-------+---------+-----------+--------+-------+-------------------+-----------+---------+---------+
|CustomerID| Name|     City|         SignupDate|Customer_type|OrderID|  Product|   Category|Quantity|  Price|          OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+-------------------+-------------+-------+---------+-----------+--------+-------+-------------------+-----------+---------+---------+
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|          New|      1|   Laptop|Electronics|       2|50000.0|2024-01-10 00:00:00|   100000.0|     2024|     High|
|       101|  Ali|   Mumbai|2022-05-10 00:00:00|          New|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15 00:00:00|     1200.0|     2024|      Low|
|       102| Neha|    Delhi|2023-01-15 00:00:00|          New|      3|   Tablet|Electronics|       1|20000.0|2024-02-01 00:00:00|    20000.0|     2024|     High|
|       103| Ravi|Bangalore|

Compare performance between CSV read and Parquet read.

In [None]:
# Compare CSV and Parquet read times on the SAME data. Previously the CSV side
# read customers.csv while the Parquet side read the joined customer/order
# output, so the two numbers were not comparable.
parquet_path = "/mnt/data/customer_orders.parquet"
csv_path = "/mnt/data/customer_orders_csv"

# Materialise a CSV copy of the Parquet dataset so both reads are like-for-like.
# Running this job before the timers start also means neither timed read is
# the first Spark job issued by this cell.
spark.read.parquet(parquet_path) \
     .write.mode("overwrite") \
     .option("header", "true") \
     .csv(csv_path)

# Reading CSV File (perf_counter is a monotonic clock intended for measuring intervals)
start_csv = time.perf_counter()
csv_df = spark.read.option("header", "true").csv(csv_path)
csv_df.count()  # count() is an action, so it forces the read to run

# Calculating time for csv
end_csv = time.perf_counter()

# Reading Parquet File
start_parquet = time.perf_counter()
parquet_df = spark.read.parquet(parquet_path)
parquet_df.count()

# Calculating Time for Parquet
end_parquet = time.perf_counter()

# Displaying the result
print(f"CSV read time: {end_csv - start_csv:.2f} seconds")
print(f"Parquet read time: {end_parquet - start_parquet:.2f} seconds")

CSV read time: 0.80 seconds
Parquet read time: 0.61 seconds
