In [1]:
# Creating a spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySparkBasics").getOrCreate()

In [2]:
spark

In [3]:
# Creating csv
data = """
product_id, product_name, category, price, quantity

101, Laptop, Electronics, 55000, 10
102, Smartphone, Electronics, 30000, 25
103, Chair, Furniture, 2500, 50
104, Book, Stationery, 400, 200
105, Headphones, Electronics, 1500, 100
106, Table, Furniture, 3200, 40
107, Pen, Stationery, 20, 500
108, Monitor, Electronics, 12000, 15
109, Notebook, Stationery, 60, 300
110, Sofa, Furniture, 45000, 5
"""

with open('employees.csv','w')as file:
  file.write(data)

In [6]:
# Read the above data from CSV into a DataFrame and print the schema.
df = spark.read.csv('employees.csv', header=True, inferSchema=True)
df.show()
df.printSchema()

+----------+-------------+------------+-------+---------+
|product_id| product_name|    category|  price| quantity|
+----------+-------------+------------+-------+---------+
|       101|       Laptop| Electronics|55000.0|     10.0|
|       102|   Smartphone| Electronics|30000.0|     25.0|
|       103|        Chair|   Furniture| 2500.0|     50.0|
|       104|         Book|  Stationery|  400.0|    200.0|
|       105|   Headphones| Electronics| 1500.0|    100.0|
|       106|        Table|   Furniture| 3200.0|     40.0|
|       107|          Pen|  Stationery|   20.0|    500.0|
|       108|      Monitor| Electronics|12000.0|     15.0|
|       109|     Notebook|  Stationery|   60.0|    300.0|
|       110|         Sofa|   Furniture|45000.0|      5.0|
+----------+-------------+------------+-------+---------+

root
 |-- product_id: integer (nullable = true)
 |--  product_name: string (nullable = true)
 |--  category: string (nullable = true)
 |--  price: double (nullable = true)
 |--  quantity:

In [8]:
# Read the same data from JSON and compare with the CSV schema
# Converting csv to json

import pandas as pd
from io import StringIO
df_pd = pd.read_csv(StringIO(data.strip()))
df_pd.to_json("employees.json", orient="records", lines=True)

# Reading json
import json
df_json = spark.read.json("employees.json")
df_json.show()
df_json.printSchema()

+------------+------+-------------+---------+----------+
|    category| price| product_name| quantity|product_id|
+------------+------+-------------+---------+----------+
| Electronics| 55000|       Laptop|       10|       101|
| Electronics| 30000|   Smartphone|       25|       102|
|   Furniture|  2500|        Chair|       50|       103|
|  Stationery|   400|         Book|      200|       104|
| Electronics|  1500|   Headphones|      100|       105|
|   Furniture|  3200|        Table|       40|       106|
|  Stationery|    20|          Pen|      500|       107|
| Electronics| 12000|      Monitor|       15|       108|
|  Stationery|    60|     Notebook|      300|       109|
|   Furniture| 45000|         Sofa|        5|       110|
+------------+------+-------------+---------+----------+

root
 |--  category: string (nullable = true)
 |--  price: long (nullable = true)
 |--  product_name: string (nullable = true)
 |--  quantity: long (nullable = true)
 |-- product_id: long (nullable = t

In [9]:
# Converting to csv parquet and saving it to disk
df.write.mode("overwrite").parquet("employees_parquet")

In [10]:
import os
os.listdir('/content/employees_parquet')

['part-00000-c46f70c6-6602-410c-b574-50cf3316350f-c000.snappy.parquet',
 '._SUCCESS.crc',
 '_SUCCESS',
 '.part-00000-c46f70c6-6602-410c-b574-50cf3316350f-c000.snappy.parquet.crc']

In [11]:
# Measure the size of CSV vs JSON vs Parquet on disk. Which one is smallest?
# saved files in csv, json and parquet formats

# csv size
!du -sh employees.csv
# json size
!du -sh employees.json
# parquet size
!du -sh employees_parquet/

4.0K	employees.csv
4.0K	employees.json
16K	employees_parquet/


In [12]:
# Actual size of parquet
!ls -lh employees_parquet/
# It shows 16k due to metadata

total 4.0K
-rw-r--r-- 1 root root 1.8K Aug  7 05:41 part-00000-c46f70c6-6602-410c-b574-50cf3316350f-c000.snappy.parquet
-rw-r--r-- 1 root root    0 Aug  7 05:41 _SUCCESS


In [15]:
# Add a column total_revenue = price * quantity for each record
from pyspark.sql.functions import col
df = df.withColumn("total_revenue", col(" price")*col(" quantity"))
df.show()

+----------+-------------+------------+-------+---------+-------------+
|product_id| product_name|    category|  price| quantity|total_revenue|
+----------+-------------+------------+-------+---------+-------------+
|       101|       Laptop| Electronics|55000.0|     10.0|     550000.0|
|       102|   Smartphone| Electronics|30000.0|     25.0|     750000.0|
|       103|        Chair|   Furniture| 2500.0|     50.0|     125000.0|
|       104|         Book|  Stationery|  400.0|    200.0|      80000.0|
|       105|   Headphones| Electronics| 1500.0|    100.0|     150000.0|
|       106|        Table|   Furniture| 3200.0|     40.0|     128000.0|
|       107|          Pen|  Stationery|   20.0|    500.0|      10000.0|
|       108|      Monitor| Electronics|12000.0|     15.0|     180000.0|
|       109|     Notebook|  Stationery|   60.0|    300.0|      18000.0|
|       110|         Sofa|   Furniture|45000.0|      5.0|     225000.0|
+----------+-------------+------------+-------+---------+-------

In [16]:
# Find the top 3 products with the highest total revenue.
df.orderBy(col("total_revenue").desc()).limit(3).show()

+----------+-------------+------------+-------+---------+-------------+
|product_id| product_name|    category|  price| quantity|total_revenue|
+----------+-------------+------------+-------+---------+-------------+
|       102|   Smartphone| Electronics|30000.0|     25.0|     750000.0|
|       101|       Laptop| Electronics|55000.0|     10.0|     550000.0|
|       110|         Sofa|   Furniture|45000.0|      5.0|     225000.0|
+----------+-------------+------------+-------+---------+-------------+



In [20]:
# Filter and display only Furniture products with price > 3000
df_furniture = df.filter((col(" category")== " Furniture") & (col(" price") > 3000))
df_furniture.show()

+----------+-------------+----------+-------+---------+-------------+
|product_id| product_name|  category|  price| quantity|total_revenue|
+----------+-------------+----------+-------+---------+-------------+
|       106|        Table| Furniture| 3200.0|     40.0|     128000.0|
|       110|         Sofa| Furniture|45000.0|      5.0|     225000.0|
+----------+-------------+----------+-------+---------+-------------+



In [21]:
# Create a new column price_band with values:'High' if price > 10000 'Medium' if 3000 < price <= 10000 'Low' if price ≤ 3000
from pyspark.sql.functions import avg, col, when

df_band = df.withColumn("price_band",
                        when(col(" price") > 10000, "High")
                        .when((col(" price") >3000) & (col(" price") <= 10000), "Medium")
                        .otherwise("Low")
                        )
df_band.show()



+----------+-------------+------------+-------+---------+-------------+----------+
|product_id| product_name|    category|  price| quantity|total_revenue|price_band|
+----------+-------------+------------+-------+---------+-------------+----------+
|       101|       Laptop| Electronics|55000.0|     10.0|     550000.0|      High|
|       102|   Smartphone| Electronics|30000.0|     25.0|     750000.0|      High|
|       103|        Chair|   Furniture| 2500.0|     50.0|     125000.0|       Low|
|       104|         Book|  Stationery|  400.0|    200.0|      80000.0|       Low|
|       105|   Headphones| Electronics| 1500.0|    100.0|     150000.0|       Low|
|       106|        Table|   Furniture| 3200.0|     40.0|     128000.0|    Medium|
|       107|          Pen|  Stationery|   20.0|    500.0|      10000.0|       Low|
|       108|      Monitor| Electronics|12000.0|     15.0|     180000.0|      High|
|       109|     Notebook|  Stationery|   60.0|    300.0|      18000.0|       Low|
|   

In [23]:
# Group by category and calculate total quantity sold.
from pyspark.sql.functions import sum

df_grouped = df.groupBy(" category").agg(sum(" quantity").alias("total_quantity"))
df_grouped.show()

+------------+--------------+
|    category|total_quantity|
+------------+--------------+
|  Stationery|        1000.0|
|   Furniture|          95.0|
| Electronics|         150.0|
+------------+--------------+



In [24]:
# Calculate average price of products for each category
df_avg = df.groupBy(" category").agg(avg(" price").alias("avg_price"))
df_avg.show()


+------------+---------+
|    category|avg_price|
+------------+---------+
|  Stationery|    160.0|
|   Furniture|  16900.0|
| Electronics|  24625.0|
+------------+---------+



In [25]:
# Count how many products fall in each price_band
df_bnd_count = df_band.groupBy("price_band").count()
df_bnd_count.show()


+----------+-----+
|price_band|count|
+----------+-----+
|      High|    4|
|       Low|    5|
|    Medium|    1|
+----------+-----+



In [30]:
# Write the filtered Electronics products (price > 5000) into a Parquet file
df_elctronics = df.filter((col(" category") == " Electronics") & (col(" price") > 5000))
#df_elctronics.show()

df_elctronics.write.mode("overwrite").parquet("/tmp/electronics_parquet")


In [32]:
# Write the Stationery products into a JSON file

df_stationary = df.filter(col(" category") == " Stationery")
#df_stationary.show()
df_stationary.write.mode("overwrite").json("/tmp/stationery_json")



In [34]:
# Load Parquet back and run a query to find which category has highest total revenue

df_par = spark.read.parquet("/content/employees_parquet")


In [37]:
#df_par.show()
df_par = df_par.withColumn("total_revenue", col(" price") * col(" quantity"))
df_rev = df_par.groupBy(" category").sum("total_revenue")



In [39]:
df_rev.show()

+------------+------------------+
|    category|sum(total_revenue)|
+------------+------------------+
|  Stationery|          108000.0|
|   Furniture|          478000.0|
| Electronics|         1630000.0|
+------------+------------------+



In [40]:
df_top = df_rev.orderBy(col("sum(total_revenue)").desc()).limit(1)
df_top.show()

+------------+------------------+
|    category|sum(total_revenue)|
+------------+------------------+
| Electronics|         1630000.0|
+------------+------------------+



In [47]:
# BONUS: Create a temporary view from the DataFrame and run Spark SQL to find all
# products with quantity > 100 and price < 1000
df_cleaned = df.toDF(*[col_name.strip() for col_name in df.columns])
df_cleaned.createOrReplaceTempView("product_view")

In [48]:
spark.sql("select * from product_view where  quantity > 100 and  price < 1000").show()

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       104|        Book| Stationery|400.0|   200.0|      80000.0|
|       107|         Pen| Stationery| 20.0|   500.0|      10000.0|
|       109|    Notebook| Stationery| 60.0|   300.0|      18000.0|
+----------+------------+-----------+-----+--------+-------------+

