In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("Exams").getOrCreate()

# Exam results: one (student_id, subject, marks) tuple per row.
data = [
    (1, 'Chemistry', 91),
    (1, 'Physics', 91),
    (2, 'Chemistry', 80),
    (2, 'Physics', 90),
    (3, 'Chemistry', 80),
    (4, 'Chemistry', 71),
    (4, 'Physics', 54),
]

# Column names, listed in the same order as the tuple fields above.
columns = ["student_id", "subject", "marks"]

# Build the DataFrame from the rows and their column names.
exams_df = spark.createDataFrame(data, schema=columns)

# Print the rows as a quick visual check.
exams_df.show()

# Register the DataFrame as the temp view "exams" so spark.sql can query it.
exams_df.createOrReplaceTempView("exams")



+----------+---------+-----+
|student_id|  subject|marks|
+----------+---------+-----+
|         1|Chemistry|   91|
|         1|  Physics|   91|
|         2|Chemistry|   80|
|         2|  Physics|   90|
|         3|Chemistry|   80|
|         4|Chemistry|   71|
|         4|  Physics|   54|
+----------+---------+-----+



In [10]:
# Students who have BOTH Chemistry and Physics and scored the same marks in
# the two subjects.
#   - count(distinct subject) = 2 : both subjects are present. Counting
#     distinct subjects (rather than rows) stops a student with a duplicated
#     row for a single subject from passing the check.
#   - count(distinct marks) = 1   : the marks across those rows are one value.
spark.sql(
"""
    select student_id, count(distinct subject) as s1 from exams
    where subject in ('Chemistry', 'Physics')
    group by student_id
    having s1 = 2 and count(distinct marks) = 1
""").show()

+----------+---+
|student_id| s1|
+----------+---+
|         1|  2|
+----------+---+



In [8]:

# DataFrame-API version of "students with the same marks in Chemistry and
# Physics".

# Keep only the two subjects being compared.
filtered_df = exams_df.filter(col("subject").isin(["Chemistry", "Physics"]))

# Per student, count how many different subjects and different marks remain.
result_df = (
    filtered_df.groupBy("student_id")
    .agg(
        countDistinct("subject").alias("distinct_subject_count"),
        countDistinct("marks").alias("distinct_marks_count")
    )
    # A student qualifies only when BOTH subjects are present (2 distinct
    # subjects) and the marks collapse to a single value. Requiring just 1
    # distinct subject would instead select students who sat only one exam.
    .filter((col("distinct_subject_count") == 2) & (col("distinct_marks_count") == 1))
    .select("student_id")
)
