In [0]:
# Load the cleaned IPL fixture table from DBFS and preview the first rows.
# NOTE(review): the `dbutils.fs.ls("/FileStore")` listing further down shows an
# `ipl_cleaned.parquet` directly under /FileStore - confirm that the copy under
# /FileStore/tables/ is the intended input.
ipl_cleaned_df = spark.read.format("parquet").load("/FileStore/tables/ipl_cleaned.parquet")
ipl_cleaned_df.show(n=5)


+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|year|        series_type|         series_name|match_no|match_type|          match_name|          match_href|match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|MUMBAI INDIANS vs...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|CHENNAI SUPER KIN...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|SUNRISERS HYDERAB...|ht

In [0]:
from pyspark.sql.functions import col

# Stack both team columns into a single column and de-duplicate it.
# Nulls are not filtered here, so an unpopulated team column shows up as a
# single null row in the result.
team1_names = ipl_cleaned_df.select("match_team1")
team2_names = ipl_cleaned_df.select("match_team2")
teams_df = team1_names.union(team2_names).distinct()

teams_df.show(truncate=False)
print("Number of unique teams:", teams_df.count())


+-----------+
|match_team1|
+-----------+
|null       |
+-----------+

Number of unique teams: 1


In [0]:
from pyspark.sql.functions import split, trim

# match_name looks like "<team1> vs <team2>, <Nth> Match". Derive both team
# columns from it, replacing whatever those columns held before:
#   team1 = text before " vs "
#   team2 = text after " vs " up to the first comma
name_halves = split("match_name", " vs ")
ipl_cleaned_df = (
    ipl_cleaned_df
    .withColumn("match_team1", trim(name_halves[0]))
    .withColumn("match_team2", trim(split(name_halves[1], ",")[0]))
)


In [0]:
# Spot-check the derived team columns against the raw match name.
team_check_cols = ["match_team1", "match_team2", "match_name"]
ipl_cleaned_df.select(*team_check_cols).show(n=5, truncate=False)


+---------------------+---------------------------+--------------------------------------------------------+
|match_team1          |match_team2                |match_name                                              |
+---------------------+---------------------------+--------------------------------------------------------+
|MUMBAI INDIANS       |ROYAL CHALLENGERS BENGALURU|MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match|
|CHENNAI SUPER KINGS  |DELHI CAPITALS             |CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match        |
|SUNRISERS HYDERABAD  |KOLKATA KNIGHT RIDERS      |SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, 3rd Match |
|RAJASTHAN ROYALS     |PUNJAB KINGS               |RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match             |
|KOLKATA KNIGHT RIDERS|MUMBAI INDIANS             |KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th Match      |
+---------------------+---------------------------+--------------------------------------------------------+
only showing top 5 

In [0]:
# Collect every team name that appears on either side of a fixture.
# Null names are dropped before counting (as the later team-count cells in this
# notebook do) so that a fixture whose team could not be parsed is not
# reported as an extra "team". `union` keeps the first frame's column name,
# hence the filter on match_team1.
teams_df = (
    ipl_cleaned_df.select("match_team1")
    .union(ipl_cleaned_df.select("match_team2"))
    .filter(col("match_team1").isNotNull())
    .distinct()
)

teams_df.show(truncate=False)
print("Number of unique teams:", teams_df.count())


+---------------------------+
|match_team1                |
+---------------------------+
|CHENNAI SUPER KINGS        |
|ROYAL CHALLENGERS BENGALURU|
|RAJASTHAN ROYALS           |
|SUNRISERS HYDERABAD        |
|DELHI CAPITALS             |
|MUMBAI INDIANS             |
|KOLKATA KNIGHT RIDERS      |
|PUNJAB KINGS               |
+---------------------------+

Number of unique teams: 8


In [0]:
# List the distinct match_type values present in the table.
match_types = ipl_cleaned_df.select("match_type").distinct()
match_types.show(truncate=False)


+----------+
|match_type|
+----------+
|League    |
+----------+



In [0]:
# Look for fixtures whose name mentions a Super Over (case-sensitive substring).
mentions_super_over = ipl_cleaned_df["match_name"].contains("Super Over")
ipl_cleaned_df.where(mentions_super_over).select("match_name").show(truncate=False)


+----------+
|match_name|
+----------+
+----------+



In [0]:
# Same Super Over search, this time also showing the match_type of any hit.
mentions_super_over = ipl_cleaned_df["match_name"].contains("Super Over")
super_over_rows = ipl_cleaned_df.where(mentions_super_over)
super_over_rows.select("match_name", "match_type").show(truncate=False)


+----------+----------+
|match_name|match_type|
+----------+----------+
+----------+----------+



In [0]:
# Keep only the fixtures that have a start timestamp recorded, then preview.
has_start_time = ipl_cleaned_df.match_datetime_start.isNotNull()
completed_matches_df = ipl_cleaned_df.where(has_start_time)
completed_matches_df.show(n=5)


+----+-----------+-----------+--------+----------+----------+----------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type|series_name|match_no|match_type|match_name|match_href|match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-----------+-----------+--------+----------+----------+----------+-----------+-----------+--------------------+--------------+-----------+
+----+-----------+-----------+--------+----------+----------+----------+-----------+-----------+--------------------+--------------+-----------+



In [0]:
# Show match names next to their start timestamps to see how many are populated.
start_time_preview = ipl_cleaned_df.select("match_name", "match_datetime_start")
start_time_preview.show(n=10, truncate=False)


+----------------------------------------------------------------+--------------------+
|match_name                                                      |match_datetime_start|
+----------------------------------------------------------------+--------------------+
|MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match        |null                |
|CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match                |null                |
|SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, 3rd Match         |null                |
|RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match                     |null                |
|KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th Match              |null                |
|SUNRISERS HYDERABAD vs ROYAL CHALLENGERS BENGALURU, 6th Match   |null                |
|RAJASTHAN ROYALS vs DELHI CAPITALS, 7th Match                   |null                |
|PUNJAB KINGS vs CHENNAI SUPER KINGS, 8th Match                  |null                |
|MUMBAI INDIANS vs SUNRISERS HYD

In [0]:
from pyspark.sql.functions import col

# Unique team names across both sides of every fixture, with nulls removed.
# `union` keeps the first frame's column name, so the combined column is
# still called match_team1.
team1_names = ipl_cleaned_df.select("match_team1")
team2_names = ipl_cleaned_df.select("match_team2")
unique_names = team1_names.union(team2_names).distinct()
teams_df = unique_names.where(col("match_team1").isNotNull())

teams_df.show(truncate=False)
print("Number of unique teams:", teams_df.count())


+---------------------------+
|match_team1                |
+---------------------------+
|CHENNAI SUPER KINGS        |
|ROYAL CHALLENGERS BENGALURU|
|RAJASTHAN ROYALS           |
|SUNRISERS HYDERABAD        |
|DELHI CAPITALS             |
|MUMBAI INDIANS             |
|KOLKATA KNIGHT RIDERS      |
|PUNJAB KINGS               |
+---------------------------+

Number of unique teams: 8


In [0]:
# Super Over search on the match_name column only (case-sensitive substring).
match_names = ipl_cleaned_df.select("match_name")
match_names.where(col("match_name").contains("Super Over")).show(truncate=False)


+----------+
|match_name|
+----------+
+----------+



In [0]:
# Broaden the search: regex match on "Super Over" or either casing of "tie".
tiebreak_pattern = "Super Over|tie|Tie"
match_names = ipl_cleaned_df.select("match_name")
match_names.where(col("match_name").rlike(tiebreak_pattern)).show(truncate=False)


+----------+
|match_name|
+----------+
+----------+



In [0]:
# Preview the match names held in ipl_df.
# NOTE(review): in the visible notebook `ipl_df` is first assigned in a later
# cell (the CSV load), so on a fresh top-to-bottom run this cell would raise
# NameError - confirm the intended cell order.
match_names = ipl_df.select("match_name")
match_names.show(n=10, truncate=False)


+----------------------------------------------------------------+
|match_name                                                      |
+----------------------------------------------------------------+
|MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match        |
|CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match                |
|SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, 3rd Match         |
|RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match                     |
|KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th Match              |
|SUNRISERS HYDERABAD vs ROYAL CHALLENGERS BENGALURU, 6th Match   |
|RAJASTHAN ROYALS vs DELHI CAPITALS, 7th Match                   |
|PUNJAB KINGS vs CHENNAI SUPER KINGS, 8th Match                  |
|MUMBAI INDIANS vs SUNRISERS HYDERABAD, 9th Match                |
|ROYAL CHALLENGERS BENGALURU vs KOLKATA KNIGHT RIDERS, 10th Match|
+----------------------------------------------------------------+
only showing top 10 rows



In [0]:
# Check for matches with "Super Over" or "Tie" in the match_name
# (both are case-sensitive substring tests).
# NOTE(review): relies on `ipl_df`, which the visible notebook only assigns in
# a later cell - confirm the intended cell order.
name_column = ipl_df["match_name"]
mentions_tiebreak = name_column.contains("Super Over") | name_column.contains("Tie")
ipl_df.where(mentions_tiebreak).select("match_name").show(truncate=False)


+----------+
|match_name|
+----------+
+----------+



In [0]:
# Keep only the fixtures that have an end date recorded, then preview them.
# NOTE(review): relies on `ipl_df`, which the visible notebook only assigns in
# a later cell - confirm the intended cell order.
has_end_date = ipl_df["match_date_end"].isNotNull()
completed_matches_df = ipl_df.where(has_end_date)
completed_matches_df.select("match_name", "match_date_end").show(n=5, truncate=False)



+----------+--------------+
|match_name|match_date_end|
+----------+--------------+
+----------+--------------+



In [0]:
# Remove the columns not wanted in the cleaned table and confirm what is left.
columns_to_drop = [
    "series_type",
    "match_href",
    "match_datetime_start",
]
cleaned_df = completed_matches_df.drop(*columns_to_drop)
cleaned_df.printSchema()


root
 |-- year: integer (nullable = true)
 |-- series_name: string (nullable = true)
 |-- match_no: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- match_name: string (nullable = true)
 |-- match_team1: string (nullable = true)
 |-- match_team2: string (nullable = true)
 |-- match_date_end: string (nullable = true)
 |-- match_venue: string (nullable = true)



In [0]:
from pyspark.sql.functions import col

# Distinct team names from each side of the fixture list.
team1 = cleaned_df.select(col("match_team1")).distinct()
team2 = cleaned_df.select(col("match_team2")).distinct()

# Combine both sides. `union` keeps the first frame's column name, so the
# combined column is match_team1. Null names are dropped before counting (as
# the null-filtered team-count cells in this notebook do) so that a missing
# team is not counted as a team of its own.
all_teams = (
    team1.union(team2)
    .filter(col("match_team1").isNotNull())
    .distinct()
)
print("Number of Unique Teams:", all_teams.count())

all_teams.show(truncate=False)


Number of Unique Teams: 0
+-----------+
|match_team1|
+-----------+
+-----------+



In [0]:
from pyspark.sql.functions import split, trim

# Derive both team columns from match_name ("<team1> vs <team2>, <Nth> Match"):
#   team1 = text before " vs "
#   team2 = text after " vs " up to the first comma
# NOTE(review): relies on `ipl_df`, which the visible notebook only assigns in
# a later cell - confirm the intended cell order.
name_halves = split("match_name", " vs ")
ipl_df = (
    ipl_df
    .withColumn("match_team1", trim(name_halves[0]))
    .withColumn("match_team2", trim(split(name_halves[1], ",")[0]))
)

team_check_cols = ["match_name", "match_team1", "match_team2"]
ipl_df.select(*team_check_cols).show(n=5, truncate=False)



+--------------------------------------------------------+---------------------+---------------------------+
|match_name                                              |match_team1          |match_team2                |
+--------------------------------------------------------+---------------------+---------------------------+
|MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match|MUMBAI INDIANS       |ROYAL CHALLENGERS BENGALURU|
|CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match        |CHENNAI SUPER KINGS  |DELHI CAPITALS             |
|SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, 3rd Match |SUNRISERS HYDERABAD  |KOLKATA KNIGHT RIDERS      |
|RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match             |RAJASTHAN ROYALS     |PUNJAB KINGS               |
|KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th Match      |KOLKATA KNIGHT RIDERS|MUMBAI INDIANS             |
+--------------------------------------------------------+---------------------+---------------------------+
only showing top 5 

In [0]:
# Rebuild the "completed" subset: fixtures that have an end date recorded.
has_end_date = ipl_df["match_date_end"].isNotNull()
completed_matches_df = ipl_df.where(has_end_date)


In [0]:
# Distinct team names from each side of the completed fixtures.
team1 = completed_matches_df.select("match_team1").distinct()
team2 = completed_matches_df.select("match_team2").distinct()

# Combine both sides. `union` keeps the first frame's column name, so the
# combined column is match_team1. Null names are dropped before counting (as
# the null-filtered team-count cells in this notebook do) so that a missing
# team is not counted as a team of its own.
all_teams = (
    team1.union(team2)
    .filter(col("match_team1").isNotNull())
    .distinct()
)

print("Number of Unique Teams:", all_teams.count())
all_teams.show(truncate=False)


Number of Unique Teams: 0
+-----------+
|match_team1|
+-----------+
+-----------+



In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import split, trim, col

# Explicit schema for the 2021 fixtures CSV; every field is nullable.
# NOTE(review): previews of the parquet tables in this notebook show
# timestamp-looking values under series_type. With an explicit schema Spark's
# CSV reader applies fields by position by default (`enforceSchema`), so
# confirm that this field order matches the file's header row.
column_types = [
    ("year", IntegerType()),
    ("series_type", TimestampType()),
    ("series_name", StringType()),
    ("match_no", StringType()),
    ("match_type", StringType()),
    ("match_name", StringType()),
    ("match_href", StringType()),
    ("match_team1", StringType()),
    ("match_team2", StringType()),
    ("match_datetime_start", StringType()),
    ("match_date_end", StringType()),
    ("match_venue", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in column_types]
)

# Read the CSV (first row is a header) using the schema above.
ipl_df = spark.read.csv(
    "/FileStore/tables/ipl_2021_matches.csv",
    schema=schema,
    header=True,
)

# Derive both team columns from match_name ("<team1> vs <team2>, <Nth> Match"):
#   team1 = text before " vs "
#   team2 = text after " vs " up to the first comma
name_halves = split("match_name", " vs ")
ipl_df = (
    ipl_df
    .withColumn("match_team1", trim(name_halves[0]))
    .withColumn("match_team2", trim(split(name_halves[1], ",")[0]))
)

team_check_cols = ["match_name", "match_team1", "match_team2"]
ipl_df.select(*team_check_cols).show(n=5, truncate=False)


+--------------------------------------------------------+---------------------+---------------------------+
|match_name                                              |match_team1          |match_team2                |
+--------------------------------------------------------+---------------------+---------------------------+
|MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match|MUMBAI INDIANS       |ROYAL CHALLENGERS BENGALURU|
|CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match        |CHENNAI SUPER KINGS  |DELHI CAPITALS             |
|SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, 3rd Match |SUNRISERS HYDERABAD  |KOLKATA KNIGHT RIDERS      |
|RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match             |RAJASTHAN ROYALS     |PUNJAB KINGS               |
|KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th Match      |KOLKATA KNIGHT RIDERS|MUMBAI INDIANS             |
+--------------------------------------------------------+---------------------+---------------------------+
only showing top 5 

In [0]:
# Unique, non-null team names across both sides of every fixture.
# `union` keeps the first frame's column name, so the null filter targets
# match_team1 on the combined column.
team1_names = ipl_df.select("match_team1")
team2_names = ipl_df.select("match_team2")
named_teams = team1_names.union(team2_names).where(col("match_team1").isNotNull())
teams_df = named_teams.distinct()

teams_df.show(truncate=False)
print("Number of unique teams:", teams_df.count())


+---------------------------+
|match_team1                |
+---------------------------+
|CHENNAI SUPER KINGS        |
|ROYAL CHALLENGERS BENGALURU|
|RAJASTHAN ROYALS           |
|SUNRISERS HYDERABAD        |
|DELHI CAPITALS             |
|MUMBAI INDIANS             |
|KOLKATA KNIGHT RIDERS      |
|PUNJAB KINGS               |
+---------------------------+

Number of unique teams: 8


In [0]:
# Case-insensitive regex search of match_name for "Super Over" or "Tie".
tiebreak_pattern = "(?i)Super Over|Tie"
match_names = ipl_df.select("match_name")
match_names.where(col("match_name").rlike(tiebreak_pattern)).show(truncate=False)


+----------+
|match_name|
+----------+
+----------+



In [0]:
whos

Variable               Type                 Data/Info
-----------------------------------------------------
ArrayType              type                 <class 'pyspark.sql.types.ArrayType'>
BinaryType             DataTypeSingleton    <class 'pyspark.sql.types.BinaryType'>
BooleanType            DataTypeSingleton    <class 'pyspark.sql.types.BooleanType'>
ByteType               DataTypeSingleton    <class 'pyspark.sql.types.ByteType'>
CharType               type                 <class 'pyspark.sql.types.CharType'>
DataType               type                 <class 'pyspark.sql.types.DataType'>
DateType               DataTypeSingleton    <class 'pyspark.sql.types.DateType'>
DayTimeIntervalType    type                 <class 'pyspark.sql.types.DayTimeIntervalType'>
DecimalType            type                 <class 'pyspark.sql.types.DecimalType'>
DoubleType             DataTypeSingleton    <class 'pyspark.sql.types.DoubleType'>
FloatType              DataTypeSingleton    <class 'pyspark.

In [0]:
# List what is stored directly under /FileStore on DBFS.
filestore_entries = dbutils.fs.ls("/FileStore")
display(filestore_entries)



path,name,size,modificationTime
dbfs:/FileStore/ipl_cleaned.parquet/,ipl_cleaned.parquet/,0,0
dbfs:/FileStore/ipl_final_cleaned.parquet/,ipl_final_cleaned.parquet/,0,0
dbfs:/FileStore/ipl_validated.parquet/,ipl_validated.parquet/,0,0
dbfs:/FileStore/tables/,tables/,0,0


In [0]:
# Load the final cleaned table from DBFS and preview the first three rows.
df = spark.read.format("parquet").load("/FileStore/ipl_final_cleaned.parquet")
df.show(n=3)


+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------+-----------+
|year|        series_type|         series_name|match_no|match_type|          match_name|          match_href|        match_team1|         match_team2|match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------+-----------+
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|MUMBAI INDIANS vs...|https://www.cricb...|     MUMBAI INDIANS|ROYAL CHALLENGERS...| 2025-04-09 20:00:00|          null|       null|
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|CHENNAI SUPER KIN...|https://www.cricb...|CHENNAI SUPER KINGS|      DELHI CAPITALS| 2025-04-09 20:00:00|          null|       null|
|2021

In [0]:
# Reload the final cleaned table and preview it (repeats the load in the
# preceding cell).
final_cleaned_path = "/FileStore/ipl_final_cleaned.parquet"
df = spark.read.format("parquet").load(final_cleaned_path)
df.show(n=3)


+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------+-----------+
|year|        series_type|         series_name|match_no|match_type|          match_name|          match_href|        match_team1|         match_team2|match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------+-----------+
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|MUMBAI INDIANS vs...|https://www.cricb...|     MUMBAI INDIANS|ROYAL CHALLENGERS...| 2025-04-09 20:00:00|          null|       null|
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|CHENNAI SUPER KIN...|https://www.cricb...|CHENNAI SUPER KINGS|      DELHI CAPITALS| 2025-04-09 20:00:00|          null|       null|
|2021

In [0]:
from pyspark.sql.functions import col

# Count the distinct teams appearing on either side of a fixture.
# `union` keeps the first frame's column name (match_team1). Null names are
# dropped before counting (as the null-filtered team-count cells in this
# notebook do) so that a missing team is not counted as a team of its own.
teams = (
    df.select(col("match_team1"))
    .union(df.select(col("match_team2")))
    .filter(col("match_team1").isNotNull())
    .distinct()
)
print("Number of unique teams:", teams.count())


Number of unique teams: 8


In [0]:
# Distinct match names that mention "Super Over" or "Tie" (case-sensitive).
name_column = col("match_name")
mentions_tiebreak = name_column.contains("Super Over") | name_column.contains("Tie")
df.select("match_name").where(mentions_tiebreak).distinct().show(truncate=False)


+----------+
|match_name|
+----------+
+----------+



In [0]:
# Distinct match names mentioning "Super Over" or "Tie" (repeats the check in
# the preceding cell; both substring tests are case-sensitive).
has_super_over = col("match_name").contains("Super Over")
has_tie = col("match_name").contains("Tie")
tiebreak_names = df.select("match_name").where(has_super_over | has_tie).distinct()
tiebreak_names.show(truncate=False)


+----------+
|match_name|
+----------+
+----------+

