In [0]:

# Load the student scores CSV: the first row is the header and column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/tables/student_scores.csv")
)
df.show()

# Persist a path-based Delta copy of the data. DataFrameWriter.save() returns
# None, so its result is deliberately not bound to a variable.
df.write.format("delta").mode("overwrite").save("/tmp/delta/student_scores")

# Drop any previously registered student_scores table before it is re-created.
spark.sql("DROP TABLE IF EXISTS student_scores")

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



DataFrame[]

In [0]:
# Create and register a Delta table called student_scores (overwriting any
# existing contents), then display every row of it.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("student_scores")
)
spark.sql("select * from student_scores").show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



# Basic Tasks

In [0]:
# 1. Show all students and their scores.
spark.sql(
    "select name, score "
    "from student_scores"
).show()

+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
|Rahul|   78|
|Sneha|   65|
|Aryan|   55|
| Isha|   88|
|Tanvi|   91|
|Kunal|   72|
|Megha|   60|
|Rohan|   40|
+-----+-----+



In [0]:
# 2. Count number of students in each subject.
spark.sql(
    "select subject, count(*) "
    "from student_scores "
    "group by subject"
).show()

+-------+--------+
|subject|count(1)|
+-------+--------+
|Science|       3|
|   Math|       4|
|English|       3|
+-------+--------+



In [0]:
# 3. Find average score per subject (rounded to 2 decimal places).
spark.sql(
    "select subject, round(avg(score),2) as avg_score "
    "from student_scores "
    "group by subject"
).show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     73.0|
|   Math|    70.25|
|English|    75.33|
+-------+---------+



In [0]:
# 4. List all students who scored more than 80.
spark.sql(
    "select name, score "
    "from student_scores "
    "where score > 80"
).show()

+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
| Isha|   88|
|Tanvi|   91|
+-----+-----+



# Advanced Queries

In [0]:
# 5. Show student(s) with the highest score in each subject.
# rank() assigns 1 to every student tied for the top score within a subject,
# so ties are all returned.
spark.sql(
    "select subject, name, score "
    "from ("
    "select subject, name, score, "
    "rank() over (partition by subject order by score desc) as rnk "
    "from student_scores"
    ") "
    "where rnk = 1"
).show()

+-------+-----+-----+
|subject| name|score|
+-------+-----+-----+
|English| Isha|   88|
|   Math|Tanvi|   91|
|Science|Divya|   92|
+-------+-----+-----+



In [0]:
# 6. Display grades with count of students in each.
spark.sql(
    "select grade, count(*) as students_count "
    "from student_scores "
    "group by grade"
).show()

+-----+--------------+
|grade|students_count|
+-----+--------------+
|    F|             1|
|    B|             2|
|    D|             1|
|    C|             2|
|    A|             4|
+-----+--------------+



In [0]:
# 7. Show names of students who failed (grade F).
spark.sql(
    "select name "
    "from student_scores "
    "where grade = 'F'"
).show()

+-----+
| name|
+-----+
|Rohan|
+-----+



In [0]:
# 8. List students with score between 60 and 90 (both bounds inclusive).
spark.sql(
    "select name, score "
    "from student_scores "
    "where score between 60 and 90"
).show()

+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Rahul|   78|
|Sneha|   65|
| Isha|   88|
|Kunal|   72|
|Megha|   60|
+-----+-----+



In [0]:
# 9. Rank students within each subject based on scores (highest score = rank 1).
spark.sql(
    "select subject, name, score, "
    "rank() over (partition by subject order by score desc) as rank "
    "from student_scores"
).show()

+-------+-----+-----+----+
|subject| name|score|rank|
+-------+-----+-----+----+
|English| Isha|   88|   1|
|English|Rahul|   78|   2|
|English|Megha|   60|   3|
|   Math|Tanvi|   91|   1|
|   Math|Ankit|   85|   2|
|   Math|Sneha|   65|   3|
|   Math|Rohan|   40|   4|
|Science|Divya|   92|   1|
|Science|Kunal|   72|   2|
|Science|Aryan|   55|   3|
+-------+-----+-----+----+



# Update & Delete Tasks

In [0]:
# 10. Increase score of all English subject students by 5.
from delta.tables import DeltaTable

# Handle to the registered Delta table; the delete cell below reuses `delta_table`.
delta_table = DeltaTable.forName(spark, "student_scores")

# NOTE: the update is applied in place, so every execution of this cell adds
# another 5 to the English scores.
delta_table.update(condition="subject = 'English'", set={"score": "score + 5"})
delta_table.toDF().show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|        10|Rohan|   Math|   40|    F|
|         3|Rahul|English|   83|    B|
|         6| Isha|English|   93|    A|
|         9|Megha|English|   65|    C|
+----------+-----+-------+-----+-----+



In [0]:
# 11. Delete all records where score is less than 50.
delta_table.delete(condition="score < 50")

# Display the table contents after the deletion.
delta_table.toDF().show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         3|Rahul|English|   83|    B|
|         6| Isha|English|   93|    A|
|         9|Megha|English|   65|    C|
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
+----------+-----+-------+-----+-----+



In [0]:
# 12. Add a new column pass_status (PASS if score >= 50 else FAIL).
# The column is computed in this query's result only; the student_scores table
# itself is not altered by a select.
spark.sql(
    "select *, "
    "case when score >= 50 then 'PASS' else 'FAIL' end as pass_status "
    "from student_scores"
).show()

+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         3|Rahul|English|   83|    B|       PASS|
|         6| Isha|English|   93|    A|       PASS|
|         9|Megha|English|   65|    C|       PASS|
|         1|Ankit|   Math|   85|    A|       PASS|
|         2|Divya|Science|   92|    A|       PASS|
|         4|Sneha|   Math|   65|    C|       PASS|
|         5|Aryan|Science|   55|    D|       PASS|
|         7|Tanvi|   Math|   91|    A|       PASS|
|         8|Kunal|Science|   72|    B|       PASS|
+----------+-----+-------+-----+-----+-----------+



# Data Transformation & Views

In [0]:
# 13. Create a temporary view and run SQL to get average scores.
# The view is registered over `df`, the DataFrame read from the CSV, so it does
# not reflect the update/delete applied to the student_scores Delta table.
df.createOrReplaceTempView("student_scores_temp")

In [0]:
# Average score per subject, computed from the student_scores_temp view.
spark.sql(
    "select subject, round(avg(score),2) as avg_score "
    "from student_scores_temp "
    "group by subject"
).show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     73.0|
|   Math|    70.25|
|English|    75.33|
+-------+---------+



In [0]:
# 14. Convert updated DataFrame into a new Delta table called student_scores_v2.
# `df` still holds the rows exactly as read from the CSV, so writing it would
# discard the score update and the deletion. Read the current contents of the
# student_scores table instead and save those.
(
    spark.table("student_scores")
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("student_scores_v2")
)

In [0]:
# 15. Write the final data to Parquet and JSON formats.
# Export the current contents of the student_scores table (after the update and
# delete) rather than `df`, which still holds the rows as read from the CSV.
final_df = spark.table("student_scores")

# mode("overwrite") makes the cell re-runnable: the default save mode raises an
# error when the target path already exists.
final_df.write.mode("overwrite").parquet("/tmp/student_scores.parquet")
final_df.write.mode("overwrite").json("/tmp/student_scores.json")