In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Start (or reuse) the Spark session that backs every DataFrame and SQL view below.
spark = (
    SparkSession.builder
    .appName("PySparkTables")
    .getOrCreate()
)

# Players schema: every column is a nullable integer.
players_schema = StructType(
    [StructField(column, IntegerType(), True) for column in ("player_id", "group_id")]
)

# Matches schema: every column is a nullable integer.
matches_schema = StructType(
    [
        StructField(column, IntegerType(), True)
        for column in (
            "match_id",
            "first_player",
            "second_player",
            "first_score",
            "second_score",
        )
    ]
)

# Sample rows, laid out as (player_id, group_id).
players_data = [
    (15, 1),
    (25, 1),
    (30, 1),
    (45, 1),
    (10, 2),
    (35, 2),
    (50, 2),
    (20, 3),
    (40, 3),
]

# Sample rows, laid out as
# (match_id, first_player, second_player, first_score, second_score).
matches_data = [
    (1, 15, 45, 3, 0),
    (2, 30, 25, 1, 2),
    (3, 30, 15, 2, 0),
    (4, 40, 20, 5, 2),
    (5, 35, 50, 1, 1),
]

# Materialise both tables as DataFrames with their explicit schemas.
players_df = spark.createDataFrame(players_data, players_schema)
matches_df = spark.createDataFrame(matches_data, matches_schema)

# Register the DataFrames under the names the SQL queries refer to.
players_df.createOrReplaceTempView("Players")
matches_df.createOrReplaceTempView("Matches")

print("Players and Matches tables and views created successfully.")


Players and Matches tables and views created successfully.


In [11]:
# Pick, for every group, the player with the highest total score over all
# matches they appear in (as first or second player); ties are broken by the
# smallest player_id.
#
# The aggregation starts from Players and LEFT JOINs the per-match scores, so a
# player with no recorded match counts as 0 points instead of disappearing.
# With an inner join such players are dropped, and a group made up only of
# them would produce no winner row at all.
# The outer ORDER BY makes the displayed row order deterministic.
spark.sql("""
with player_scores as (
    -- one row per (player, match): each match contributes a row for both sides
    select first_player as player_id, first_score as score from Matches
    union all
    select second_player as player_id, second_score as score from Matches
),

final_table as (
    -- total score per player; players without any match get 0
    select p.player_id, p.group_id, coalesce(sum(ps.score), 0) as score
    from Players p
    left join player_scores ps on ps.player_id = p.player_id
    group by p.player_id, p.group_id
)

select player_id, group_id, score, rn
from (
    select *,
           rank() over (partition by group_id order by score desc, player_id asc) as rn
    from final_table
) ranked
where rn = 1
order by group_id

""").show()

+---------+--------+-----+---+
|player_id|group_id|score| rn|
+---------+--------+-----+---+
|       15|       1|    3|  1|
|       35|       2|    1|  1|
|       40|       3|    5|  1|
+---------+--------+-----+---+



In [12]:
# Find the top-scoring player of every group with the DataFrame API.
# Functions are referenced through the explicit `F.` namespace so this cell
# does not depend on a wildcard import shadowing Python's builtin `sum`.

# ---------------- Step 1: One row per (player, match) ----------------
# Every match contributes two rows: one for the first player and one for the
# second player, each paired with that player's score in the match.
player_scores_df = (
    matches_df.select(
        F.col("first_player").alias("player_id"),
        F.col("first_score").alias("score"),
    )
    .unionAll(
        matches_df.select(
            F.col("second_player").alias("player_id"),
            F.col("second_score").alias("score"),
        )
    )
)

# ---------------- Step 2: Total score per player ----------------
# Start from the players and LEFT JOIN their scores so that a player with no
# recorded match is kept with a total of 0; an inner join would drop such
# players, and a group consisting only of them would end up with no winner.
final_table_df = (
    players_df.alias("p")
    .join(
        player_scores_df.alias("ps"),
        F.col("ps.player_id") == F.col("p.player_id"),
        "left",
    )
    .groupBy(F.col("p.player_id"), F.col("p.group_id"))
    # sum() over no matching rows is NULL; coalesce turns that into 0.
    .agg(F.coalesce(F.sum("score"), F.lit(0)).alias("score"))
)

# ---------------- Step 3: Rank inside each group and keep the winner ----------------
# Highest score first; equal scores are resolved in favour of the smaller player_id.
window_spec = Window.partitionBy("group_id").orderBy(
    F.col("score").desc(), F.col("player_id").asc()
)

ranked_df = (
    final_table_df
    .withColumn("rn", F.rank().over(window_spec))
    .filter(F.col("rn") == 1)
)

# ---------------- Show Result ----------------
# Sort only for display so the printed row order is deterministic.
ranked_df.orderBy("group_id").show()

+---------+--------+-----+---+
|player_id|group_id|score| rn|
+---------+--------+-----+---+
|       15|       1|    3|  1|
|       35|       2|    1|  1|
|       40|       3|    5|  1|
+---------+--------+-----+---+

