In [0]:
from pyspark.sql.functions import col, year, countDistinct, count, avg, collect_list, collect_set, when
from pyspark.sql import functions as F

# Load the validated IPL match data (Parquet) from DBFS.
df = spark.read.parquet("dbfs:/FileStore/ipl_validated.parquet/")

# 1. Number of distinct teams appearing in each year.
# A team can be listed in either team column, so both columns are stacked
# into a single (year, team) frame before counting.
first_slot = df.select("year", col("match_team1").alias("team"))
second_slot = df.select("year", col("match_team2").alias("team"))
year_team_pairs = first_slot.union(second_slot).distinct()
teams_each_year = year_team_pairs.groupBy("year").agg(
    countDistinct("team").alias("no_of_teams")
)
teams_each_year.show()

# 2. Number of matches per derived status.
# Rules are evaluated in order and the first match wins: a null venue gives
# "Abandoned", then a match name containing "Tied" gives "Tied", and every
# remaining row is "Completed".
venue_missing = F.col("match_venue").isNull()
name_mentions_tie = F.col("match_name").like("%Tied%")
status_expr = (
    F.when(venue_missing, "Abandoned")
    .when(name_mentions_tie, "Tied")
    .otherwise("Completed")
)
status_df = df.withColumn("status", status_expr)
status_counts = status_df.groupBy("status").count()
status_counts.show()

# 3. Average number of balls per innings, computed on a hand-made sample.
from pyspark.sql import Row

# One row per delivery: (match_id, inning, ball_no).
sample_ball_data = [
    Row(match_id=match_id, inning=inning, ball_no=ball_no)
    for match_id, inning, ball_no in [
        (1, 1, 1),
        (1, 1, 2),
        (1, 2, 1),
        (2, 1, 1),
        (2, 1, 2),
        (2, 1, 3),
    ]
]
ball_df = spark.createDataFrame(sample_ball_data)

# First count deliveries per (match, innings), then average those counts
# across matches for each innings number.
balls_per_match_inning = ball_df.groupBy("match_id", "inning").agg(
    count("ball_no").alias("balls")
)
avg_ball_per_inning = balls_per_match_inning.groupBy("inning").agg(
    avg("balls").alias("avg_balls")
)
avg_ball_per_inning.show()

# 4. Matches won per team per year, sorted by year ascending and then by
# win count descending within each year.
# NOTE(review): "match_winner" is just a copy of "match_team1" here, so the
# counts are really "matches listed as team 1" — TODO confirm which column
# of the dataset holds the actual winner and use that instead.
df_winners = df.withColumn("match_winner", col("match_team1"))
matches_won = (
    df_winners.groupBy("year", "match_winner")
    .count()
    .orderBy("year", "count", ascending=[True, False])
)
matches_won.show()

# 5. Batsmen per team per match: full list (repeats kept) and unique set.
sample_commentary = [
    Row(match_id=1, team=team, batsman=batsman)
    for team, batsman in [
        ("Team A", "Player 1"),
        ("Team A", "Player 2"),
        ("Team A", "Player 1"),  # duplicate
        ("Team B", "Player 3"),
        ("Team B", "Player 4"),
    ]
]
commentary_df = spark.createDataFrame(sample_commentary)

# collect_list keeps every occurrence; collect_set drops the repeats.
per_match_team = commentary_df.groupBy("match_id", "team")
batsmen_list = per_match_team.agg(
    collect_list("batsman").alias("batsmen_list"),
    collect_set("batsman").alias("batsmen_set"),
)
batsmen_list.show(truncate=False)


+----+-----------+
|year|no_of_teams|
+----+-----------+
+----+-----------+

+------+-----+
|status|count|
+------+-----+
+------+-----+

+------+---------+
|inning|avg_balls|
+------+---------+
|     1|      2.5|
|     2|      1.0|
+------+---------+

+----+------------+-----+
|year|match_winner|count|
+----+------------+-----+
+----+------------+-----+

+--------+------+------------------------------+--------------------+
|match_id|team  |batsmen_list                  |batsmen_set         |
+--------+------+------------------------------+--------------------+
|1       |Team A|[Player 1, Player 2, Player 1]|[Player 2, Player 1]|
|1       |Team B|[Player 3, Player 4]          |[Player 4, Player 3]|
+--------+------+------------------------------+--------------------+

