In [4]:
!pip install -q pyspark==3.5.1 delta-spark==3.1.0

# Delta supported spark session
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

builder = SparkSession.builder \
    .appName("DeltaLakeColab") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark

In [5]:
# Sample dataset: one header line followed by ten student records.
csv_rows = [
    "student_id,name,subject,score,grade",
    "1,Ankit,Math,85,A",
    "2,Divya,Science,92,A",
    "3,Rahul,English,78,B",
    "4,Sneha,Math,65,C",
    "5,Aryan,Science,55,D",
    "6,Isha,English,88,A",
    "7,Tanvi,Math,91,A",
    "8,Kunal,Science,72,B",
    "9,Megha,English,60,C",
    "10,Rohan,Math,40,F",
]
# Every row, including the last one, is terminated by a newline.
csv_data = "\n".join(csv_rows) + "\n"

# Materialise the dataset on disk so Spark can read it as a regular CSV file.
with open("student_scores.csv", "w") as csv_file:
    csv_file.write(csv_data)

### Step 1: Read into a DataFrame

In [6]:
# Load the CSV: the first line supplies the column names and Spark infers the
# column types from the data.
df = spark.read.csv("student_scores.csv", header=True, inferSchema=True)
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



### Step 2: Write to Delta

In [7]:
# Store the DataFrame in Delta format at the relative path "student_scores",
# replacing whatever was previously written there.
df.write.save("student_scores", format="delta", mode="overwrite")

In [11]:
# The Delta table is not registered in the catalog here; instead the data is
# read back from its path and exposed to SQL through a temporary view.
df = spark.read.load("student_scores", format="delta")
df.createOrReplaceTempView("student_scores")

# Sanity check: the view returns the rows that were just written.
view_contents = spark.sql("select * from student_scores")
view_contents.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



## Basic Tasks

In [12]:
# Task: list every student together with their score.
names_and_scores = spark.sql("select name, score from student_scores")
names_and_scores.show()

+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
|Rahul|   78|
|Sneha|   65|
|Aryan|   55|
| Isha|   88|
|Tanvi|   91|
|Kunal|   72|
|Megha|   60|
|Rohan|   40|
+-----+-----+



In [13]:
# Task: how many students are enrolled in each subject.
subject_count_query = "select subject, count(*)from student_scores group by subject"
spark.sql(subject_count_query).show()

+-------+--------+
|subject|count(1)|
+-------+--------+
|Science|       3|
|   Math|       4|
|English|       3|
+-------+--------+



In [14]:
# Task: average score per subject.
# NOTE(review): `avg` is imported but the SQL below uses the SQL-level avg();
# the import is kept in case other cells rely on it.
from pyspark.sql.functions import avg

subject_average_query = "select subject, avg(score) from student_scores group by subject"
spark.sql(subject_average_query).show()

+-------+-----------------+
|subject|       avg(score)|
+-------+-----------------+
|Science|             73.0|
|   Math|            70.25|
|English|75.33333333333333|
+-------+-----------------+



In [15]:
# Task: students whose score is strictly greater than 80.
high_scorers = spark.sql("select * from student_scores where score > 80")
high_scorers.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
+----------+-----+-------+-----+-----+



### Advanced Tasks

In [17]:
# Task: top scorer(s) per subject. RANK() assigns 1 to every student tied for
# the highest score within a subject, so ties are all returned.
top_per_subject_query = "select subject, name, score from (select *, RANK() over (partition by subject order by score desc)as rank from student_scores)where rank = 1 "
spark.sql(top_per_subject_query).show()

+-------+-----+-----+
|subject| name|score|
+-------+-----+-----+
|English| Isha|   88|
|   Math|Tanvi|   91|
|Science|Divya|   92|
+-------+-----+-----+



In [18]:
# Task: number of students holding each grade.
grade_distribution = spark.sql("select grade, count(*)as students from student_scores group by grade")
grade_distribution.show()

+-----+--------+
|grade|students|
+-----+--------+
|    F|       1|
|    B|       2|
|    D|       1|
|    C|       2|
|    A|       4|
+-----+--------+



In [19]:
# Task: students who failed (grade F). The query returns the full rows, not
# only the name column.
failed_students_query = "select * from student_scores where grade = 'F'"
spark.sql(failed_students_query).show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [20]:
# Task: students scoring from 60 to 90; BETWEEN includes both endpoints.
mid_range_students = spark.sql("select * from student_scores where score between 60 and 90")
mid_range_students.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         6| Isha|English|   88|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
+----------+-----+-------+-----+-----+



In [22]:
# Task: rank students inside each subject, highest score first.
ranking_query = "select *,RANK() over (partition by subject order by score desc)as rank from student_scores"
spark.sql(ranking_query).show()

+----------+-----+-------+-----+-----+----+
|student_id| name|subject|score|grade|rank|
+----------+-----+-------+-----+-----+----+
|         6| Isha|English|   88|    A|   1|
|         3|Rahul|English|   78|    B|   2|
|         9|Megha|English|   60|    C|   3|
|         7|Tanvi|   Math|   91|    A|   1|
|         1|Ankit|   Math|   85|    A|   2|
|         4|Sneha|   Math|   65|    C|   3|
|        10|Rohan|   Math|   40|    F|   4|
|         2|Divya|Science|   92|    A|   1|
|         8|Kunal|Science|   72|    B|   2|
|         5|Aryan|Science|   55|    D|   3|
+----------+-----+-------+-----+-----+----+



### Update & Delete Tasks

In [24]:
# Task: give every English student 5 extra points.
# Only `score` is modified (grades stay as they were), and each execution of
# this cell adds another 5 points.
english_bonus_sql = "update student_scores set score = score + 5 where subject = 'English'"
spark.sql(english_bonus_sql)

# Show the data after the update.
spark.sql("select * from student_scores").show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   88|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   98|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   70|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [25]:
# Task: remove every record with a score below 50.
remove_low_scores_sql = "delete from student_scores where score < 50"
spark.sql(remove_low_scores_sql)

# Show the remaining rows.
spark.sql("select * from student_scores").show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   88|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   98|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   70|    C|
+----------+-----+-------+-----+-----+



In [28]:
# Task: derive pass_status (PASS when score >= 50, otherwise FAIL).
# The column exists only in this query's result; the stored data is unchanged.
pass_status_query = "select *, case when score >= 50 then 'PASS' else 'FAIL' end as pass_status from student_scores; "
spark.sql(pass_status_query).show()

+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         1|Ankit|   Math|   85|    A|       PASS|
|         2|Divya|Science|   92|    A|       PASS|
|         3|Rahul|English|   88|    B|       PASS|
|         4|Sneha|   Math|   65|    C|       PASS|
|         5|Aryan|Science|   55|    D|       PASS|
|         6| Isha|English|   98|    A|       PASS|
|         7|Tanvi|   Math|   91|    A|       PASS|
|         8|Kunal|Science|   72|    B|       PASS|
|         9|Megha|English|   70|    C|       PASS|
+----------+-----+-------+-----+-----+-----------+



### Data Transformation & View

In [30]:
# Task: average score per subject, rounded to 3 decimals, queried through the
# `student_scores` name (no new view is created in this cell).
rounded_averages = spark.sql("select subject, round(avg(score),3) as AvgScore from student_scores group by subject")
rounded_averages.show()

+-------+--------+
|subject|AvgScore|
+-------+--------+
|Science|    73.0|
|   Math|  80.333|
|English|  85.333|
+-------+--------+



In [31]:
# Task: snapshot the current contents of `student_scores` into a new Delta
# table named student_scores_v2 (created, or replaced if it already exists).
create_v2_sql = "CREATE OR REPLACE TABLE student_scores_v2 USING DELTA AS SELECT * FROM student_scores"
spark.sql(create_v2_sql)


DataFrame[]

In [32]:
# Task: export the final data in Parquet and JSON formats.
# spark.table resolves whatever is registered under the name `student_scores`.
df_final = spark.table("student_scores")

# Both exports replace any existing output at their target paths.
df_final.write.parquet("student_parquet", mode="overwrite")
df_final.write.json("students_json", mode="overwrite")