# Dataset: Inline CSV – student_scores.csv

In [None]:
# Inline CSV fixture: one header row followed by ten student records
# (student_id, name, subject, score, grade).
csv_data = """student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
"""

# Stage the fixture on DBFS; overwrite=True replaces a copy left by an earlier run.
dbutils.fs.put(
    "dbfs:/tmp/student_scores.csv",
    csv_data,
    overwrite=True,
)

Wrote 234 bytes.


True

Step 1: Read into DataFrame

In [None]:
# Load the staged CSV: the first line is treated as the header and
# column types are inferred from the data.
df = spark.read.csv(
    "dbfs:/tmp/student_scores.csv",
    header=True,
    inferSchema=True,
)
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



Step 2: Write to Delta

In [None]:
# Persist the DataFrame in Delta format at a fixed path, replacing whatever
# data is already stored there.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/tmp/delta/student_scores")
)

Step 3: Register Delta Table

In [None]:
# Remove any earlier registration of the catalog table so the CREATE TABLE
# cell that follows can run again.
# The name is schema-qualified to match that CREATE TABLE statement: this
# notebook later registers a temp view that is also called `student_scores`,
# and an unqualified DROP issued in the same session can act on that view
# instead of the catalog table, leaving the table in place.
spark.sql("DROP TABLE IF EXISTS default.student_scores")

DataFrame[]

In [None]:
# Register the Delta files as the catalog table `default.student_scores`.
# IF NOT EXISTS keeps this cell re-runnable when the table is still
# registered from an earlier run.
# NOTE(review): LOCATION uses the `dbfs:` scheme while the write cell uses the
# bare path `/tmp/delta/student_scores`; presumably both point at the same
# DBFS directory — confirm for the target workspace.
spark.sql(
    "CREATE TABLE IF NOT EXISTS default.student_scores "
    "USING DELTA LOCATION 'dbfs:/tmp/delta/student_scores'"
)

In [None]:
from delta.tables import DeltaTable

# Obtain a DeltaTable handle for the data stored at the Delta path.
delta_table = DeltaTable.forPath(
    spark,
    "/tmp/delta/student_scores",
)

# Re-bind `df` to the table's contents as a DataFrame and print them.
df = delta_table.toDF()
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



Basic Tasks

1. Show all students and their scores.

In [None]:
# Read the Delta data by path and expose it to SQL as a session temp view.
# NOTE(review): the view shares its name with the catalog table registered
# earlier; unqualified `student_scores` in later queries presumably resolves
# to this view rather than the table — confirm this is intended.
df = spark.read.load("/tmp/delta/student_scores", format="delta")
df.createOrReplaceTempView("student_scores")

In [None]:
# Task 1: list every student's name next to their score.
name_score_sql = "SELECT name, score FROM student_scores"
spark.sql(name_score_sql).show()

+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
|Rahul|   78|
|Sneha|   65|
|Aryan|   55|
| Isha|   88|
|Tanvi|   91|
|Kunal|   72|
|Megha|   60|
|Rohan|   40|
+-----+-----+



2. Count number of students in each subject.

In [None]:
# Task 2: number of student rows per subject.
students_per_subject = spark.sql(
    "SELECT subject, COUNT(*) AS student_count "
    "FROM student_scores GROUP BY subject"
)
students_per_subject.show()

+-------+-------------+
|subject|student_count|
+-------+-------------+
|Science|            3|
|   Math|            4|
|English|            3|
+-------+-------------+



3. Find average score per subject.

In [None]:
# Task 3: mean score per subject, rounded to two decimals.
avg_score_sql = (
    "SELECT subject, ROUND(AVG(score), 2) AS avg_score "
    "FROM student_scores GROUP BY subject"
)
spark.sql(avg_score_sql).show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     73.0|
|   Math|    70.25|
|English|    75.33|
+-------+---------+



4. List all students who scored more than 80.

In [None]:
# Task 4: full rows for students whose score is strictly greater than 80.
high_scorers = spark.sql("SELECT * FROM student_scores WHERE score > 80")
high_scorers.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
+----------+-----+-------+-----+-----+



# Advanced Queries

5. Show student(s) with the highest score in each subject.

In [None]:
# Task 5: top scorer(s) per subject. RANK() assigns 1 to every row tied for
# the highest score within a subject, so ties are all returned.
top_per_subject_sql = (
    "SELECT subject, name, score FROM "
    "(SELECT *, RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rnk "
    "FROM student_scores) "
    "WHERE rnk = 1"
)
spark.sql(top_per_subject_sql).show()

+-------+-----+-----+
|subject| name|score|
+-------+-----+-----+
|English| Isha|   88|
|   Math|Tanvi|   91|
|Science|Divya|   92|
+-------+-----+-----+



6. Display grades with count of students in each.

In [None]:
# Task 6: number of student rows for each grade letter.
students_per_grade = spark.sql(
    "SELECT grade, COUNT(*) AS student_count "
    "FROM student_scores GROUP BY grade"
)
students_per_grade.show()

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    F|            1|
|    B|            2|
|    D|            1|
|    C|            2|
|    A|            4|
+-----+-------------+



7. Show names of students who failed (grade F).

In [None]:
# Task 7: rows for students whose grade is 'F' (all columns are returned,
# not just the name).
failed_sql = "SELECT * FROM student_scores WHERE grade = 'F'"
spark.sql(failed_sql).show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



8. List students with score between 60 and 90.

In [None]:
# Task 8: students scoring from 60 to 90; BETWEEN includes both endpoints.
mid_range_students = spark.sql(
    "SELECT * FROM student_scores where score BETWEEN 60 AND 90"
)
mid_range_students.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         6| Isha|English|   88|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
+----------+-----+-------+-----+-----+



9. Rank students within each subject based on scores.

In [None]:
# Task 9: rank each student inside their subject, highest score first.
rank_in_subject_sql = (
    "SELECT subject, name, score, "
    "RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rnk "
    "FROM student_scores"
)
spark.sql(rank_in_subject_sql).show()

+-------+-----+-----+---+
|subject| name|score|rnk|
+-------+-----+-----+---+
|English| Isha|   88|  1|
|English|Rahul|   78|  2|
|English|Megha|   60|  3|
|   Math|Tanvi|   91|  1|
|   Math|Ankit|   85|  2|
|   Math|Sneha|   65|  3|
|   Math|Rohan|   40|  4|
|Science|Divya|   92|  1|
|Science|Kunal|   72|  2|
|Science|Aryan|   55|  3|
+-------+-----+-----+---+



# Update & Delete Tasks

10. Increase score of all English subject students by 5.

In [None]:
# Task 10: add 5 to the score of every English row.
# The statement targets the schema-qualified Delta table so the write does
# not depend on how the unqualified name `student_scores` resolves (a temp
# view with that name is registered earlier in this notebook).
# Only `score` is modified; the `grade` column is left as it was.
# NOTE: the update is not idempotent; each execution of this cell adds
# another 5 points.
spark.sql(
    "UPDATE default.student_scores "
    "SET score = score + 5 "
    "WHERE subject = 'English'"
)

# Display the table contents after the update.
spark.sql("SELECT * FROM default.student_scores").show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|        10|Rohan|   Math|   40|    F|
|         3|Rahul|English|   88|    B|
|         6| Isha|English|   98|    A|
|         9|Megha|English|   70|    C|
+----------+-----+-------+-----+-----+



11. Delete all records where score is less than 50.

In [None]:
# Task 11: delete every row whose score is below 50.
# The statement targets the schema-qualified Delta table so the delete does
# not depend on how the unqualified name `student_scores` resolves (a temp
# view with that name is registered earlier in this notebook).
spark.sql("DELETE FROM default.student_scores WHERE score < 50")

DataFrame[num_affected_rows: bigint]

In [None]:
# Display every row currently visible through `student_scores`.
remaining_rows = spark.sql("SELECT * FROM student_scores")
remaining_rows.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         3|Rahul|English|   88|    B|
|         6| Isha|English|   98|    A|
|         9|Megha|English|   70|    C|
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
+----------+-----+-------+-----+-----+



12. Add a new column pass_status (PASS if score >= 50 else FAIL).

In [None]:
# Task 12: derive a pass_status column: 'PASS' when score >= 50, otherwise
# 'FAIL'. The column is computed in the query result only; the stored table
# is not altered.
# The statement terminator (`;`) is omitted: spark.sql() takes a single
# statement, and no other query in this notebook uses one.
pass_status_sql = (
    "SELECT *, "
    "CASE WHEN score >= 50 THEN 'PASS' ELSE 'FAIL' END AS pass_status "
    "FROM student_scores"
)
spark.sql(pass_status_sql).show()

+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         3|Rahul|English|   88|    B|       PASS|
|         6| Isha|English|   98|    A|       PASS|
|         9|Megha|English|   70|    C|       PASS|
|         1|Ankit|   Math|   85|    A|       PASS|
|         2|Divya|Science|   92|    A|       PASS|
|         4|Sneha|   Math|   65|    C|       PASS|
|         5|Aryan|Science|   55|    D|       PASS|
|         7|Tanvi|   Math|   91|    A|       PASS|
|         8|Kunal|Science|   72|    B|       PASS|
+----------+-----+-------+-----+-----+-----------+



# Data Transformation & Views

13. Create a temporary view and run SQL to get average scores.

In [None]:
# Task 13: create a temporary view and query it for the average score per
# subject. The view is registered here, under its own name, so this cell
# performs the step its heading describes and does not rely on a view
# created by an earlier cell.
(
    spark.read
    .format("delta")
    .load("/tmp/delta/student_scores")
    .createOrReplaceTempView("student_scores_avg_view")
)

# Mean score per subject, rounded to two decimals.
spark.sql(
    "SELECT subject, ROUND(AVG(score), 2) AS avg_score "
    "FROM student_scores_avg_view GROUP BY subject"
).show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     73.0|
|   Math|    80.33|
|English|    85.33|
+-------+---------+



14. Convert updated DataFrame into a new Delta table called student_scores_v2.

In [None]:
# Task 14: materialise the current rows of `student_scores` as the Delta
# table `student_scores_v2`, replacing that table if it already exists.
spark.sql(
    "CREATE OR REPLACE TABLE student_scores_v2 USING DELTA "
    "AS SELECT * FROM student_scores"
)

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

15. Write the final data to Parquet and JSON formats.

In [None]:
# Task 15: export the current `student_scores` rows in two file formats.
df_final = spark.table("student_scores")

# Each writer replaces any output already present at its target path.
df_final.write.parquet("/tmp/parquet/student_scores", mode="overwrite")
df_final.write.json("/tmp/json/student_scores", mode="overwrite")