In [0]:

ipl_df = spark.read.parquet("dbfs:/FileStore/ipl_validated.parquet/")

ipl_df.createOrReplaceTempView("ipl")


In [0]:
%sql

SELECT 
  year, 
  COUNT(DISTINCT match_team1) + COUNT(DISTINCT match_team2) AS total_teams
FROM ipl
GROUP BY year
ORDER BY year;


year,total_teams


In [0]:
ipl_df.printSchema()
ipl_df.select("*").show(5, truncate=False)


root
 |-- year: integer (nullable = true)
 |-- series_type: string (nullable = true)
 |-- series_name: string (nullable = true)
 |-- match_no: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- match_name: string (nullable = true)
 |-- match_href: string (nullable = true)
 |-- match_team1: string (nullable = true)
 |-- match_team2: string (nullable = true)
 |-- match_datetime_start: string (nullable = true)
 |-- match_date_end: string (nullable = true)
 |-- match_venue: string (nullable = true)

+----+-----------+-----------+--------+----------+----------+----------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type|series_name|match_no|match_type|match_name|match_href|match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-----------+-----------+--------+----------+----------+----------+-----------+-----------+--------------------+--------------+-----------+
+----+-----------+-----------+--------+----

In [0]:
%sql
SELECT 
  year, 
  COUNT(DISTINCT match_team1) + COUNT(DISTINCT match_team2) AS total_teams
FROM ipl
WHERE match_team1 IS NOT NULL AND match_team2 IS NOT NULL AND year IS NOT NULL
GROUP BY year
ORDER BY year;


year,total_teams


In [0]:
%sql

SELECT 
  year, 
  team
FROM (
  SELECT year, match_team1 AS team FROM ipl
  UNION ALL
  SELECT year, match_team2 AS team FROM ipl
) AS all_teams
WHERE team IS NOT NULL
GROUP BY year, team
ORDER BY year;


year,team


In [0]:
%sql
SELECT 
  year, 
  COUNT(DISTINCT team) AS total_teams
FROM (
  SELECT year, match_team1 AS team FROM ipl
  UNION ALL
  SELECT year, match_team2 AS team FROM ipl
) AS all_teams
WHERE team IS NOT NULL
GROUP BY year
ORDER BY year;


year,total_teams


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

data = [
    (2021, "CSK", "RCB", "completed", "CSK"),
    (2021, "MI", "KKR", "completed", "MI"),
    (2021, "SRH", "DC", "tied", None),
    (2022, "RR", "PBKS", "completed", "PBKS"),
    (2022, "GT", "LSG", "abandoned", None),
    (2023, "CSK", "MI", "completed", "MI"),
    (2023, "RCB", "RR", "completed", "RCB")
]

schema = StructType([
    StructField("year", IntegerType(), True),
    StructField("match_team1", StringType(), True),
    StructField("match_team2", StringType(), True),
    StructField("result", StringType(), True),
    StructField("winner", StringType(), True),
])

ipl_df = spark.createDataFrame(data, schema)
ipl_df.createOrReplaceTempView("ipl")
ipl_df.show()


+----+-----------+-----------+---------+------+
|year|match_team1|match_team2|   result|winner|
+----+-----------+-----------+---------+------+
|2021|        CSK|        RCB|completed|   CSK|
|2021|         MI|        KKR|completed|    MI|
|2021|        SRH|         DC|     tied|  null|
|2022|         RR|       PBKS|completed|  PBKS|
|2022|         GT|        LSG|abandoned|  null|
|2023|        CSK|         MI|completed|    MI|
|2023|        RCB|         RR|completed|   RCB|
+----+-----------+-----------+---------+------+



In [0]:
%sql
SELECT 
  year, 
  COUNT(DISTINCT team) AS total_teams
FROM (
  SELECT year, match_team1 AS team FROM ipl
  UNION
  SELECT year, match_team2 AS team FROM ipl
) AS all_teams
GROUP BY year
ORDER BY year;


year,total_teams
2021,6
2022,4
2023,4


In [0]:
%sql
SELECT 
  result,
  COUNT(*) AS total_matches
FROM ipl
GROUP BY result;


result,total_matches
completed,5
tied,1
abandoned,1


In [0]:
ball_data = [
    (1, 1, 1, 1),
    (1, 1, 1, 2),
    (1, 1, 2, 1),
    (2, 2, 1, 1),
    (2, 2, 1, 2),
    (2, 2, 2, 1),
    (3, 3, 1, 1),
    (3, 3, 1, 2),
    (3, 3, 2, 1),
]

ball_schema = StructType([
    StructField("match_id", IntegerType(), True),
    StructField("ball_no", IntegerType(), True),
    StructField("inning", IntegerType(), True),
    StructField("over_no", IntegerType(), True),
])

ball_df = spark.createDataFrame(ball_data, ball_schema)
ball_df.createOrReplaceTempView("balls")
ball_df.show()


+--------+-------+------+-------+
|match_id|ball_no|inning|over_no|
+--------+-------+------+-------+
|       1|      1|     1|      1|
|       1|      1|     1|      2|
|       1|      1|     2|      1|
|       2|      2|     1|      1|
|       2|      2|     1|      2|
|       2|      2|     2|      1|
|       3|      3|     1|      1|
|       3|      3|     1|      2|
|       3|      3|     2|      1|
+--------+-------+------+-------+



In [0]:
%sql
SELECT 
  inning,
  ROUND(AVG(ball_count), 2) AS avg_balls_per_match
FROM (
  SELECT match_id, inning, COUNT(*) AS ball_count
  FROM balls
  GROUP BY match_id, inning
)
GROUP BY inning;


inning,avg_balls_per_match
1,2.0
2,1.0


In [0]:
%sql
SELECT 
  year,
  winner AS team,
  COUNT(*) AS matches_won
FROM ipl
WHERE winner IS NOT NULL
GROUP BY year, winner
ORDER BY year, matches_won DESC;


year,team,matches_won
2021,CSK,1
2021,MI,1
2022,PBKS,1
2023,MI,1
2023,RCB,1


In [0]:

df = spark.read.parquet("dbfs:/FileStore/ipl_final_cleaned.parquet/")

df.createOrReplaceTempView("ipl")

spark.sql("""
SELECT 
  year,
  COUNT(DISTINCT team) AS total_teams
FROM (
  SELECT year, match_team1 AS team FROM ipl
  UNION
  SELECT year, match_team2 AS team FROM ipl
) all_teams
GROUP BY year
ORDER BY year
""").show()

spark.sql("""
SELECT 
  match_type AS result_type,
  COUNT(*) AS match_count
FROM ipl
GROUP BY match_type
""").show()

spark.sql("""
SELECT 
  year,
  match_team1 AS team,
  COUNT(*) AS matches_played
FROM ipl
GROUP BY year, match_team1
UNION
SELECT 
  year,
  match_team2 AS team,
  COUNT(*) AS matches_played
FROM ipl
GROUP BY year, match_team2
ORDER BY team, year
""").show()


+----+-----------+
|year|total_teams|
+----+-----------+
|2021|          8|
+----+-----------+

+-----------+-----------+
|result_type|match_count|
+-----------+-----------+
|     League|         62|
+-----------+-----------+

+----+--------------------+--------------+
|year|                team|matches_played|
+----+--------------------+--------------+
|2021| CHENNAI SUPER KINGS|             8|
|2021|      DELHI CAPITALS|             9|
|2021|      DELHI CAPITALS|             7|
|2021|KOLKATA KNIGHT RI...|             8|
|2021|KOLKATA KNIGHT RI...|            10|
|2021|      MUMBAI INDIANS|             7|
|2021|      MUMBAI INDIANS|             8|
|2021|        PUNJAB KINGS|             7|
|2021|    RAJASTHAN ROYALS|             7|
|2021|ROYAL CHALLENGERS...|             8|
|2021| SUNRISERS HYDERABAD|             8|
|2021| SUNRISERS HYDERABAD|             7|
+----+--------------------+--------------+



In [0]:
%sql
SELECT 
  year,
  team,
  SUM(matches_played) AS total_matches
FROM (
  SELECT year, match_team1 AS team, COUNT(*) AS matches_played
  FROM ipl
  GROUP BY year, match_team1

  UNION ALL

  SELECT year, match_team2 AS team, COUNT(*) AS matches_played
  FROM ipl
  GROUP BY year, match_team2
) combined
GROUP BY year, team
ORDER BY year, total_matches DESC


year,team,total_matches
2021,KOLKATA KNIGHT RIDERS,18
2021,ROYAL CHALLENGERS BENGALURU,16
2021,CHENNAI SUPER KINGS,16
2021,DELHI CAPITALS,16
2021,SUNRISERS HYDERABAD,15
2021,MUMBAI INDIANS,15
2021,RAJASTHAN ROYALS,14
2021,PUNJAB KINGS,14
