In [3]:
from google.colab import files
files.upload()


Saving orders.csv to orders (1).csv
Saving customers.csv to customers (1).csv


{'orders (1).csv': b'OrderID,CustomerID,Product,Category,Quantity,Price,OrderDate\r\n1,101,Laptop,Electronics,2,50000,10-01-2024\r\n2,101,Mouse,Electronics,1,1200,15-01-2024\r\n3,102,Tablet,Electronics,1,20000,01-02-2024\r\n4,103,Bookshelf,Furniture,1,3500,10-02-2024\r\n5,104,Mixer,Appliances,1,5000,15-02-2024\r\n6,105,Notebook,Stationery,5,500,01-03-2024\r\n7,102,Phone,Electronics,1,30000,02-03-2024\r\n',
 'customers (1).csv': b'CustomerID,Name,Email,City,SignupDate\r\n101,Ali,ali@gmail.com,Mumbai,10-05-2022\r\n102,Neha,neha@yahoo.com,Delhi,15-01-2023\r\n103,Ravi,ravi@hotmail.com,Bangalore,01-11-2021\r\n104,Sneha,sneha@outlook.com,Hyderabad,22-07-2020\r\n105,Amit,amit@gmail.com,Chennai,10-03-2023\r\n'}

In [4]:
customers_df = spark.read.csv("customers.csv", header=True, inferSchema=True)
orders_df = spark.read.csv("orders.csv", header=True, inferSchema=True)


In [6]:
# Setup Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("PracticeProject").getOrCreate()

# Task 1: Data Ingestion & Exploration (Updated for Colab)
customers_df = spark.read.csv("customers.csv", header=True, inferSchema=True)
orders_df = spark.read.csv("orders.csv", header=True, inferSchema=True)

customers_df.printSchema()
orders_df.printSchema()

print("Total Customers:", customers_df.count())
print("Total Orders:", orders_df.count())
customers_df.select("City").distinct().show()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: string (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: string (nullable = true)

Total Customers: 5
Total Orders: 7
+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [7]:
# Task 2: DataFrame Transformations

orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))

orders_df = orders_df.withColumn("OrderYear", year("OrderDate"))

orders_df.filter(col("TotalAmount") > 10000).show()

customers_df = customers_df.drop("Email")

customers_df.show()
orders_df.show()


+-------+----------+-------+-----------+--------+-----+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-----+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000|10-01-2024|     100000|     NULL|
|      3|       102| Tablet|Electronics|       1|20000|01-02-2024|      20000|     NULL|
|      7|       102|  Phone|Electronics|       1|30000|02-03-2024|      30000|     NULL|
+-------+----------+-------+-----------+--------+-----+----------+-----------+---------+

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|10-05-2022|
|       102| Neha|    Delhi|15-01-2023|
|       103| Ravi|Bangalore|01-11-2021|
|       104|Sneha|Hyderabad|22-07-2020|
|       105| Amit|  Chennai|10-03-2023|
+----------+-----+---------+----------+

+-------+------

In [8]:
# Task 3: Handling Nulls & Conditionals

customers_df = customers_df.withColumn("City", when(col("City") == "Chennai", None).otherwise(col("City")))
customers_df = customers_df.fillna({"City": "Unknown"})

customers_df = customers_df.withColumn(
    "CustomerStatus",
    when(col("SignupDate") < "2022-01-01", "Loyal").otherwise("New")
)

orders_df = orders_df.withColumn(
    "OrderType",
    when(col("TotalAmount") < 5000, "Low").otherwise("High")
)

customers_df.show()
orders_df.select("OrderID", "Product", "TotalAmount", "OrderType").show()


+----------+-----+---------+----------+--------------+
|CustomerID| Name|     City|SignupDate|CustomerStatus|
+----------+-----+---------+----------+--------------+
|       101|  Ali|   Mumbai|10-05-2022|         Loyal|
|       102| Neha|    Delhi|15-01-2023|         Loyal|
|       103| Ravi|Bangalore|01-11-2021|         Loyal|
|       104|Sneha|Hyderabad|22-07-2020|           New|
|       105| Amit|  Unknown|10-03-2023|         Loyal|
+----------+-----+---------+----------+--------------+

+-------+---------+-----------+---------+
|OrderID|  Product|TotalAmount|OrderType|
+-------+---------+-----------+---------+
|      1|   Laptop|     100000|     High|
|      2|    Mouse|       1200|      Low|
|      3|   Tablet|      20000|     High|
|      4|Bookshelf|       3500|      Low|
|      5|    Mixer|       5000|     High|
|      6| Notebook|       2500|      Low|
|      7|    Phone|      30000|     High|
+-------+---------+-----------+---------+



In [9]:
# Task 4: Joins & Aggregations


joined_df = customers_df.join(orders_df, on="CustomerID", how="inner")

joined_df.groupBy("City").agg(
    count("OrderID").alias("TotalOrders"),
    sum("TotalAmount").alias("TotalRevenue")
).show()

joined_df.groupBy("Name").agg(
    sum("TotalAmount").alias("TotalSpend")
).orderBy(desc("TotalSpend")).show(3)

orders_df.groupBy("Category").agg(
    count("Product").alias("ProductsSold")
).show()


+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|Bangalore|          1|        3500|
|   Mumbai|          2|      101200|
|  Unknown|          1|        2500|
|    Delhi|          2|       50000|
|Hyderabad|          1|        5000|
+---------+-----------+------------+

+-----+----------+
| Name|TotalSpend|
+-----+----------+
|  Ali|    101200|
| Neha|     50000|
|Sneha|      5000|
+-----+----------+
only showing top 3 rows

+-----------+------------+
|   Category|ProductsSold|
+-----------+------------+
| Stationery|           1|
|Electronics|           4|
|  Furniture|           1|
| Appliances|           1|
+-----------+------------+



In [10]:
# Task 5: Spark SQL Tasks


customers_df.createOrReplaceTempView("customers")
orders_df.createOrReplaceTempView("orders")

spark.sql("""
    SELECT o.* FROM orders o
    JOIN customers c ON o.CustomerID = c.CustomerID
    WHERE c.City = 'Delhi'
""").show()

spark.sql("""
    SELECT Category, AVG(TotalAmount) as AverageOrderValue
    FROM orders
    GROUP BY Category
""").show()

monthly_orders_df = orders_df.withColumn("OrderMonth", date_format("OrderDate", "MMMM"))
monthly_orders_df.createOrReplaceTempView("monthly_orders")

spark.sql("""
    SELECT OrderMonth, SUM(TotalAmount) as MonthlyTotal
    FROM monthly_orders
    GROUP BY OrderMonth
""").show()


+-------+----------+-------+-----------+--------+-----+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+-----+----------+-----------+---------+---------+
|      3|       102| Tablet|Electronics|       1|20000|01-02-2024|      20000|     NULL|     High|
|      7|       102|  Phone|Electronics|       1|30000|02-03-2024|      30000|     NULL|     High|
+-------+----------+-------+-----------+--------+-----+----------+-----------+---------+---------+

+-----------+-----------------+
|   Category|AverageOrderValue|
+-----------+-----------------+
| Stationery|           2500.0|
|Electronics|          37800.0|
|  Furniture|           3500.0|
| Appliances|           5000.0|
+-----------+-----------------+

+----------+------------+
|OrderMonth|MonthlyTotal|
+----------+------------+
|      NULL|      162200|
+----------+------------+



In [11]:
# Task 6: String & Date Functions

masked_df = customers_df.withColumn(
    "MaskedEmail",
    regexp_replace("Name", r"(.).*", r"\1***@example.com")
)

masked_df = masked_df.withColumn("NameCity", concat_ws(" from ", col("Name"), col("City")))

masked_df = masked_df.withColumn("CustomerAgeInDays", datediff(current_date(), col("SignupDate")))

orders_df = orders_df.withColumn("MonthName", date_format("OrderDate", "MMMM"))

masked_df.select("CustomerID", "NameCity", "CustomerAgeInDays").show()
orders_df.select("OrderID", "OrderDate", "MonthName").show()


+----------+--------------------+-----------------+
|CustomerID|            NameCity|CustomerAgeInDays|
+----------+--------------------+-----------------+
|       101|     Ali from Mumbai|             NULL|
|       102|     Neha from Delhi|             NULL|
|       103| Ravi from Bangalore|             NULL|
|       104|Sneha from Hyderabad|             NULL|
|       105|   Amit from Unknown|             NULL|
+----------+--------------------+-----------------+

+-------+----------+---------+
|OrderID| OrderDate|MonthName|
+-------+----------+---------+
|      1|10-01-2024|     NULL|
|      2|15-01-2024|     NULL|
|      3|01-02-2024|     NULL|
|      4|10-02-2024|     NULL|
|      5|15-02-2024|     NULL|
|      6|01-03-2024|     NULL|
|      7|02-03-2024|     NULL|
+-------+----------+---------+



In [12]:
# Task 7: UDFs and Complex Logic


def tag_customer(spend):
    return "Gold" if spend > 50000 else "Silver" if spend >= 10000 else "Bronze"

def shorten_product(name):
    return name[:3] + "..." if len(name) > 3 else name

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

tag_udf = udf(tag_customer, StringType())
shorten_udf = udf(shorten_product, StringType())

spend_df = joined_df.groupBy("CustomerID").agg(sum("TotalAmount").alias("TotalSpend"))
spend_df = spend_df.withColumn("CustomerTag", tag_udf("TotalSpend"))
spend_df.show()

orders_df = orders_df.withColumn("ShortProduct", shorten_udf("Product"))
orders_df.select("Product", "ShortProduct").show()


+----------+----------+-----------+
|CustomerID|TotalSpend|CustomerTag|
+----------+----------+-----------+
|       101|    101200|       Gold|
|       103|      3500|     Bronze|
|       102|     50000|     Silver|
|       105|      2500|     Bronze|
|       104|      5000|     Bronze|
+----------+----------+-----------+

+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



In [13]:
# Task 8: Parquet & Views

joined_df.write.mode("overwrite").parquet("joined_result.parquet")

parquet_df = spark.read.parquet("joined_result.parquet")
parquet_df.printSchema()

parquet_df.createOrReplaceGlobalTempView("global_orders")

spark.sql("SELECT * FROM global_temp.global_orders LIMIT 5").show()

import time

start_csv = time.time()
_ = spark.read.csv("orders.csv", header=True, inferSchema=True)
print("CSV read time:", time.time() - start_csv, "seconds")

start_parquet = time.time()
_ = spark.read.parquet("joined_result.parquet")
print("Parquet read time:", time.time() - start_parquet, "seconds")


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: string (nullable = true)
 |-- CustomerStatus: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- TotalAmount: integer (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)

+----------+-----+---------+----------+--------------+-------+---------+-----------+--------+-----+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerStatus|OrderID|  Product|   Category|Quantity|Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+--------------+-------+---------+-----------+--------+-----+----------+-----------+---------+-------