In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("PySparkAssessment1")
    .enableHiveSupport()
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [4]:

import os

# Sanity check: report whether the dataset file is visible on the mounted Drive.
file_path = "/content/drive/MyDrive/PysparkDemo/large_employee_dataset.csv"
print(os.path.exists(file_path))

True


In [56]:
# 1. Data Ingestion & Exploration
# Load both CSV files with schema inference.
# List all columns and data types.
# Count the total number of customers and orders.
# Show distinct cities.

from google.colab import drive

# (Re)mount Google Drive so the CSV files under /content/drive are reachable.
drive.mount('/content/drive', force_remount=True)

# NOTE: `max` and `sum` imported here shadow the Python builtins for the rest
# of the notebook; later cells rely on the Spark versions.
from pyspark.sql.functions import avg, max, sum, count, year, current_date, datediff, col

# Read both CSV files with a header row and let Spark infer the column types.
customers = spark.read.options(header=True, inferSchema=True).csv(
    '/content/drive/MyDrive/PysparkDemo/customer.csv'
)
orders = spark.read.options(header=True, inferSchema=True).csv(
    '/content/drive/MyDrive/PysparkDemo/orders.csv'
)

# Preview the rows of each DataFrame.
customers.show()
orders.show()

# List the columns and inferred data types.
customers.printSchema()
orders.printSchema()

# Row counts for each dataset.
print(f"Total Customers: {customers.count()}")
print(f"Total Orders: {orders.count()}")

# Unique cities present in the customer data.
customers.select(col("City")).distinct().show()



Mounted at /content/drive
+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

+-------+----------+---------+-----------+--------+-------+----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+---------+-----------+--------+-------+----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|
|      4|       103|Books

In [38]:
# 2. DataFrame Transformations
# Add a column TotalAmount = Price * Quantity .
# Create a new column OrderYear from OrderDate .
# Filter orders with TotalAmount > 10,000 .
# Drop the Email column from customers .


# Line total for each order: unit price multiplied by quantity.
orders = orders.withColumn("TotalAmount", col("Price") * col("Quantity"))
orders.show()

# Calendar year extracted from OrderDate.
orders = orders.withColumn("OrderYear", year(col("OrderDate")))
orders.show()

# filter() returns a NEW DataFrame; the result must be captured, otherwise the
# unfiltered `orders` is what gets displayed. It is kept under a separate name
# so that `orders` still holds every row for the later cells.
high_value_orders = orders.filter(col("TotalAmount") > 10000)
high_value_orders.show()

# Customers without the Email column; `customers` itself is left untouched.
emaildrop = customers.drop("Email")
emaildrop.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+

+-------+----------+---------+-----------+--------+-------+----------+-----------+----

In [39]:
# 3. Handling Nulls & Conditionals
# Simulate a null in City and fill it with “Unknown”.
# Label customers as “Loyal” if SignupDate is before 2022, else “New”.
# Create OrderType column: "Low" if <5,000, "High" if ≥5,000.
from pyspark.sql.functions import when, lit, concat_ws

# Simulate a missing City (every "Bangalore" becomes null), then replace the
# nulls with the placeholder "Unknown". `customers` itself is not modified here.
customers_null = (
    customers
    .withColumn(
        "City",
        when(col("City") == "Bangalore", None).otherwise(col("City")),
    )
    .withColumn(
        "City",
        when(col("City").isNull(), lit("Unknown")).otherwise(col("City")),
    )
)
customers_null.show()

# Add "<Name> from <City>" and a loyalty label: signup year before 2022 is
# "Loyal", anything else is "New".
customers = (
    customers
    .withColumn("NameCity", concat_ws(" from ", col("Name"), col("City")))
    .withColumn(
        "CustomerType",
        when(year(col("SignupDate")) < 2022, "Loyal").otherwise("New"),
    )
)
customers.show()

# Orders below 5,000 are "Low"; every other row falls through to "High".
orders = orders.withColumn(
    "OrderType",
    when(col("TotalAmount") < 5000, "Low").otherwise("High"),
)
orders.show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|  Unknown|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

+----------+-----+-----------------+---------+----------+--------------------+------------+
|CustomerID| Name|            Email|     City|SignupDate|            NameCity|CustomerType|
+----------+-----+-----------------+---------+----------+--------------------+------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     Ali from Mumbai|         New|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|     Neha from Delhi|         New|
|       103| Ravi

In [40]:
# 4. Joins & Aggregations
# Join customers and orders on CustomerID .
# Get total orders and revenue per city.
# Show top 3 customers by total spend.
# Count how many products each category has sold.

# Join orders to customers on the shared CustomerID key (default join type).
joined = orders.join(customers, "CustomerID")
joined.show()

# Total number of orders and total revenue per city.
# The alias has to be applied to the aggregate COLUMN inside agg(); calling
# .alias() on the grouped result only renames the DataFrame and leaves the
# column named "sum(TotalAmount)".
# `sum` / `count` here are the pyspark.sql.functions versions imported earlier.
city_revenue = joined.groupBy("City").agg(
    count("OrderID").alias("TotalOrders"),
    sum("TotalAmount").alias("TotalRevenue"),
)
city_revenue.show()

# Top 3 customers by total spend. show() returns None, so the DataFrame is
# assigned first and displayed in a separate statement.
top = (
    joined.groupBy("CustomerID")
    .agg(sum("TotalAmount").alias("TotalSpent"))
    .orderBy(col("TotalSpent").desc())
    .limit(3)
)
top.show()

# Number of order rows per product category.
category_count = joined.groupBy("Category").count()
category_count.show()

+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+-----------------+---------+----------+--------------------+------------+
|CustomerID|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType| Name|            Email|     City|SignupDate|            NameCity|CustomerType|
+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+-----------------+---------+----------+--------------------+------------+
|       101|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     Ali from Mumbai|         New|
|       101|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     Ali from Mumbai|         New|
|       102|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     202

In [41]:
# 5. Spark SQL Tasks
# Create database sales and switch to it.
# Save both datasets as tables in the sales database.
# Write SQL to:
# List all orders by customers from “Delhi”.
# Find average order value in each category.
# Create a view monthly_orders with month-wise total amount.
# Create the `sales` database if it does not exist yet and make it current.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

# Expose both DataFrames to SQL under the names `customers` and `orders`.
# NOTE(review): the task asks for tables saved in the `sales` database, but
# these are temporary views and are not stored in it — confirm whether
# write.saveAsTable("sales.<name>") was intended.
customers.createOrReplaceTempView("customers")
orders.createOrReplaceTempView("orders")

# Quick check that both views are queryable.
spark.sql("SELECT * FROM customers").show()
spark.sql("SELECT * FROM orders").show()

# All orders placed by customers whose City is 'Delhi'.
spark.sql("SELECT * FROM orders o join customers c on o.CustomerID=c.CustomerID WHERE City='Delhi' ").show()

# Average TotalAmount per product category.
spark.sql("SELECT Category, AVG(TotalAmount) AS AvgOrderValue FROM orders GROUP BY Category").show()

# Temporary view with the summed TotalAmount per month number of OrderDate.
spark.sql("CREATE OR REPLACE TEMPORARY VIEW monthly_orders AS SELECT MONTH(OrderDate) AS Month, SUM(TotalAmount) AS TotalAmount FROM orders GROUP BY MONTH(OrderDate)")
spark.sql("SELECT * FROM monthly_orders").show()





+----------+-----+-----------------+---------+----------+--------------------+------------+
|CustomerID| Name|            Email|     City|SignupDate|            NameCity|CustomerType|
+----------+-----+-----------------+---------+----------+--------------------+------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     Ali from Mumbai|         New|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|     Neha from Delhi|         New|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01| Ravi from Bangalore|       Loyal|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|Sneha from Hyderabad|       Loyal|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|   Amit from Chennai|         New|
+----------+-----+-----------------+---------+----------+--------------------+------------+

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDat

In [42]:
# 6. String & Date Functions
# Mask emails using regex (e.g., a***@gmail.com ).
# Concatenate Name and City as “Name from City”.
# Use datediff() to calculate customer age in days.
# Extract month name from OrderDate .

# Mask email (in original df)
from pyspark.sql.functions import regexp_replace,concat_ws,date_format,to_date,datediff

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def mask_email(email):
    """Mask the local part of an e-mail address, keeping only its first character.

    Args:
        email: Address of the form ``local@domain``. May be ``None`` or any
            non-string value (e.g. a null coming from a Spark column).

    Returns:
        ``"<first char>***@<domain>"`` for a well-formed address
        (``"***@<domain>"`` when the local part is empty), or ``"Invalid"``
        when the value is not a string or does not contain exactly one ``@``.
    """
    # Validate explicitly instead of relying on a bare `except:`, which would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    if not isinstance(email, str) or email.count("@") != 1:
        return "Invalid"

    local, domain = email.split("@")
    # local[:1] is the first character, or "" when the local part is empty.
    return local[:1] + "***@" + domain

# Wrap the Python masking helper as a Spark UDF that returns strings.
mask_email_udf = udf(mask_email, returnType=StringType())

# Customers with a masked e-mail column and the "<Name> from <City>" column.
customers_masked = (
    customers
    .withColumn("MaskedEmail", mask_email_udf(col("Email")))
    .withColumn("NameCity", concat_ws(" from ", col("Name"), col("City")))
)
customers_masked.show()

# Parse SignupDate as a date, then compute the number of days between the
# current date and the signup date.
customers = (
    customers
    .withColumn("SignupDate", to_date(col("SignupDate"), "yyyy-MM-dd"))
    .withColumn("CustomerAgeDays", datediff(current_date(), col("SignupDate")))
)

customers.show()

# Full month name of each order date ("MMMM" date pattern).
orders = orders.withColumn("MonthName", date_format(col("OrderDate"), "MMMM"))
orders.show()


+----------+-----+-----------------+---------+----------+--------------------+------------+----------------+
|CustomerID| Name|            Email|     City|SignupDate|            NameCity|CustomerType|     MaskedEmail|
+----------+-----+-----------------+---------+----------+--------------------+------------+----------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     Ali from Mumbai|         New|  a***@gmail.com|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|     Neha from Delhi|         New|  n***@yahoo.com|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01| Ravi from Bangalore|       Loyal|r***@hotmail.com|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|Sneha from Hyderabad|       Loyal|s***@outlook.com|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|   Amit from Chennai|         New|  a***@gmail.com|
+----------+-----+-----------------+---------+----------+--------------------+------------+----------------+

+----------+-----+

In [57]:
# 7. UDFs and Complex Logic
# Write a UDF to tag customers:
# “Gold” if spend > 50K, “Silver” if 10K–50K, “Bronze” if <10K.
# Write a UDF to shorten product names (first 3 letters + ...).
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def customertag(spend):
    """Classify a customer by total spend.

    Args:
        spend: Total amount spent, or ``None`` when no total is available
            (e.g. a null aggregate coming from Spark).

    Returns:
        ``"Gold"`` if spend > 50,000, ``"Silver"`` if 10,000 <= spend <= 50,000,
        ``"Bronze"`` if spend < 10,000, and ``None`` when ``spend`` is ``None``.
    """
    # Comparing None with a number raises TypeError in Python 3, which would
    # fail the whole Spark job when this runs as a UDF; propagate the null.
    if spend is None:
        return None
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"

# Wrap the tagging helper as a Spark UDF that returns strings.
tag_udf = udf(customertag, StringType())

# Total spend per customer (`sum` is the pyspark.sql.functions version).
customer_spending = joined.groupBy("CustomerID").agg(sum("TotalAmount").alias("TotalSpend"))

# show() returns None, so keep the DataFrame in `tagged_customers` and
# display it in a separate statement.
tagged_customers = customer_spending.withColumn("Tag", tag_udf("TotalSpend"))
tagged_customers.show()




def shortname(name):
    """Return the first three characters of ``name`` followed by ``"..."``.

    Falsy input (``None`` or an empty string) yields ``None``.
    """
    if not name:
        return None
    prefix = name[:3]
    return prefix + "..."

# Wrap the name-shortening helper as a Spark UDF that returns strings.
shorten_udf = udf(shortname, returnType=StringType())

# Add the abbreviated product label to every order and display the result.
orders = orders.withColumn("ShortProduct", shorten_udf(col("Product")))
orders.show()



+----------+----------+------+
|CustomerID|TotalSpend|   Tag|
+----------+----------+------+
|       101|  101200.0|  Gold|
|       103|    3500.0|Bronze|
|       102|   50000.0|Silver|
|       105|    2500.0|Bronze|
|       104|    5000.0|Bronze|
+----------+----------+------+

+-------+----------+---------+-----------+--------+-------+----------+------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|ShortProduct|
+-------+----------+---------+-----------+--------+-------+----------+------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|      Lap...|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|      Mou...|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|      Tab...|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|      Boo...|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|      Mix...|
|      6|       105| Notebook| Stationery|      

In [60]:
# 8. Parquet & Views
# Save the joined result as a Parquet file.
# Read it back and verify schema.
# Create and query a global temp view.
# Compare performance between CSV read and Parquet read.

# Persist the joined result as Parquet, replacing any previous output.
joined.write.mode("overwrite").parquet("joined_data.parquet")

# Read it back and confirm the schema survived the round trip.
joined_parquet = spark.read.parquet("joined_data.parquet")
joined_parquet.printSchema()

# Global temp views are addressed through the `global_temp` database.
joined_parquet.createOrReplaceGlobalTempView("global_joined")

spark.sql("SELECT * FROM global_temp.global_joined WHERE City='Delhi' ").show()


from time import time

# Time both reads with the SAME action (count()). Without an action the CSV
# side would be timed differently from the Parquet side, making the two
# numbers incomparable.
# NOTE(review): the two timings still read different datasets (orders.csv vs.
# the joined Parquet output), so treat the figures as indicative only.
start = time()
spark.read.csv('/content/drive/MyDrive/PysparkDemo/orders.csv',header=True,inferSchema=True).count()
print("CSV read time:", time() - start)

start = time()
spark.read.parquet("joined_data.parquet").count()
print("Parquet read time:", time() - start)


root
 |-- CustomerID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- NameCity: string (nullable = true)
 |-- CustomerType: string (nullable = true)

+----------+-------+-------+-----------+--------+-------+----------+-----------+---------+---------+----+--------------+-----+----------+---------------+------------+
|CustomerID|OrderID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|Name|         Email| City|SignupDate|       NameCity|CustomerType|
+----------+-------+--