In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Initialize Spark Session
spark = SparkSession.builder.appName("StudentsTable").getOrCreate()

# Define Schema for Students Table
students_schema = StructType([
    StructField("studentid", IntegerType(), True),
    StructField("studentname", StringType(), True),
    StructField("subject", StringType(), True),
    StructField("marks", IntegerType(), True),
    StructField("testid", IntegerType(), True),
    StructField("testdate", DateType(), True)
])

# Define Data for Students Table
students_data = [
    (2, 'Max Ruin', 'Subject1', 63, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (3, 'Arnold', 'Subject1', 95, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (4, 'Krish Star', 'Subject1', 61, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (5, 'John Mike', 'Subject1', 91, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (4, 'Krish Star', 'Subject2', 71, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (3, 'Arnold', 'Subject2', 32, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (5, 'John Mike', 'Subject2', 61, 2, datetime.strptime('2022-11-02', '%Y-%m-%d')),
    (1, 'John Deo', 'Subject2', 60, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (2, 'Max Ruin', 'Subject2', 84, 1, datetime.strptime('2022-01-02', '%Y-%m-%d')),
    (2, 'Max Ruin', 'Subject3', 29, 3, datetime.strptime('2022-01-03', '%Y-%m-%d')),
    (5, 'John Mike', 'Subject3', 98, 2, datetime.strptime('2022-11-02', '%Y-%m-%d'))
]

# Create Students DataFrame
students_df = spark.createDataFrame(students_data, schema=students_schema)

# Show DataFrame
students_df.createOrReplaceTempView("Students")


In [2]:
students_df.show()

+---------+-----------+--------+-----+------+----------+
|studentid|studentname| subject|marks|testid|  testdate|
+---------+-----------+--------+-----+------+----------+
|        2|   Max Ruin|Subject1|   63|     1|2022-01-02|
|        3|     Arnold|Subject1|   95|     1|2022-01-02|
|        4| Krish Star|Subject1|   61|     1|2022-01-02|
|        5|  John Mike|Subject1|   91|     1|2022-01-02|
|        4| Krish Star|Subject2|   71|     1|2022-01-02|
|        3|     Arnold|Subject2|   32|     1|2022-01-02|
|        5|  John Mike|Subject2|   61|     2|2022-11-02|
|        1|   John Deo|Subject2|   60|     1|2022-01-02|
|        2|   Max Ruin|Subject2|   84|     1|2022-01-02|
|        2|   Max Ruin|Subject3|   29|     3|2022-01-03|
|        5|  John Mike|Subject3|   98|     2|2022-11-02|
+---------+-----------+--------+-----+------+----------+



In [7]:
spark.sql(
"""
    with cte as (select *,
        avg(marks) over(order by studentid rows between unbounded preceding and unbounded following) as avg_marks
    from Students)
    
    select distinct studentname from cte
    where marks > avg_marks
    
""").show()

+-----------+
|studentname|
+-----------+
|   Max Ruin|
|     Arnold|
| Krish Star|
|  John Mike|
+-----------+



In [None]:
spark.sql(
"""
    with cte as (select *,
        avg(marks) over(order by studentid rows between unbounded preceding and unbounded following) as avg_marks
    from Students)
    
    select distinct studentname from cte
    where marks > avg_marks
    
""").show()

In [19]:
spark.sql(
"""
    select 
    count(distinct case when marks > 90 then studentid else null end)/count(distinct studentid)
    from students
""").show()

+-------------------------------------------------------------------------------------------------+
|(count(DISTINCT CASE WHEN (marks > 90) THEN studentid ELSE NULL END) / count(DISTINCT studentid))|
+-------------------------------------------------------------------------------------------------+
|                                                                                              0.4|
+-------------------------------------------------------------------------------------------------+



In [30]:
spark.sql(
"""
    with cte as (select subject, marks,
    rank(marks) over(partition by subject order by marks) as rank1,
    rank(marks) over(partition by subject order by marks desc) as rank2
    from students)
    
    select subject, 
        sum(case when rank1 = 2 then marks else null end),
        sum(case when rank2 = 2 then marks else null end)
    from cte
    group by subject

""").show()

+--------+---------------------------------------------------+---------------------------------------------------+
| subject|sum(CASE WHEN (rank1 = 2) THEN marks ELSE NULL END)|sum(CASE WHEN (rank2 = 2) THEN marks ELSE NULL END)|
+--------+---------------------------------------------------+---------------------------------------------------+
|Subject1|                                                 63|                                                 91|
|Subject2|                                                 60|                                                 71|
|Subject3|                                                 98|                                                 29|
+--------+---------------------------------------------------+---------------------------------------------------+

