In [0]:
ipl_df = spark.read.option("header", True).csv("/FileStore/tables/ipl_2021_matches.csv")



In [0]:
from pyspark.sql.types import *

schema = StructType([
    StructField("year", IntegerType(), True),
    StructField("series_type", TimestampType(), True),
    StructField("series_name", StringType(), True),
    StructField("match_no", StringType(), True),
    StructField("match_type", StringType(), True),
    StructField("match_name", StringType(), True),
    StructField("match_href", StringType(), True),
    StructField("match_team1", StringType(), True),
    StructField("match_team2", StringType(), True),
    StructField("match_datetime_start", StringType(), True),
    StructField("match_date_end", StringType(), True),
    StructField("match_venue", StringType(), True)
])

ipl_cleaned_df = spark.read.option("header", True).schema(schema).csv("/FileStore/tables/ipl_2021_matches.csv")


In [0]:
ipl_cleaned_df.write.mode("overwrite").parquet("/FileStore/tables/ipl_cleaned.parquet")


In [0]:
ipl_cleaned_df = spark.read.parquet("/FileStore/tables/ipl_cleaned.parquet")
ipl_cleaned_df.show(5)


+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|year|        series_type|         series_name|match_no|match_type|          match_name|          match_href|match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|MUMBAI INDIANS vs...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|CHENNAI SUPER KIN...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|SUNRISERS HYDERAB...|ht

In [0]:
selected_df = ipl_cleaned_df.select("match_team1", "match_team2", "match_venue")


In [0]:
selected_df.show(5)


+-----------+-----------+-----------+
|match_team1|match_team2|match_venue|
+-----------+-----------+-----------+
|       null|       null|       null|
|       null|       null|       null|
|       null|       null|       null|
|       null|       null|       null|
|       null|       null|       null|
+-----------+-----------+-----------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import split, trim

ipl_cleaned_df = ipl_cleaned_df.withColumn("match_team1", trim(split("match_name", " vs ")[0]))

ipl_cleaned_df = ipl_cleaned_df.withColumn(
    "match_team2",
    trim(split(split("match_name", " vs ")[1], ",")[0])
)


In [0]:
ipl_cleaned_df.select("match_team1", "match_team2", "match_name").show(5, truncate=False)


+---------------------+---------------------------+--------------------------------------------------------+
|match_team1          |match_team2                |match_name                                              |
+---------------------+---------------------------+--------------------------------------------------------+
|MUMBAI INDIANS       |ROYAL CHALLENGERS BENGALURU|MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match|
|CHENNAI SUPER KINGS  |DELHI CAPITALS             |CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match        |
|SUNRISERS HYDERABAD  |KOLKATA KNIGHT RIDERS      |SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, 3rd Match |
|RAJASTHAN ROYALS     |PUNJAB KINGS               |RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match             |
|KOLKATA KNIGHT RIDERS|MUMBAI INDIANS             |KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th Match      |
+---------------------+---------------------------+--------------------------------------------------------+
only showing top 5 

In [0]:
ipl_cleaned_df = ipl_cleaned_df.fillna({"match_venue": "To Be Updated"})


In [0]:
ipl_cleaned_df = ipl_cleaned_df.fillna({"match_venue": "To Be Updated"})


In [0]:
ipl_cleaned_df.select("match_team1", "match_team2", "match_venue").show(5)


+--------------------+--------------------+-------------+
|         match_team1|         match_team2|  match_venue|
+--------------------+--------------------+-------------+
|      MUMBAI INDIANS|ROYAL CHALLENGERS...|To Be Updated|
| CHENNAI SUPER KINGS|      DELHI CAPITALS|To Be Updated|
| SUNRISERS HYDERABAD|KOLKATA KNIGHT RI...|To Be Updated|
|    RAJASTHAN ROYALS|        PUNJAB KINGS|To Be Updated|
|KOLKATA KNIGHT RI...|      MUMBAI INDIANS|To Be Updated|
+--------------------+--------------------+-------------+
only showing top 5 rows



In [0]:
display(ipl_cleaned_df.select("match_team1", "match_team2", "match_venue"))


match_team1,match_team2,match_venue
MUMBAI INDIANS,ROYAL CHALLENGERS BENGALURU,To Be Updated
CHENNAI SUPER KINGS,DELHI CAPITALS,To Be Updated
SUNRISERS HYDERABAD,KOLKATA KNIGHT RIDERS,To Be Updated
RAJASTHAN ROYALS,PUNJAB KINGS,To Be Updated
KOLKATA KNIGHT RIDERS,MUMBAI INDIANS,To Be Updated
SUNRISERS HYDERABAD,ROYAL CHALLENGERS BENGALURU,To Be Updated
RAJASTHAN ROYALS,DELHI CAPITALS,To Be Updated
PUNJAB KINGS,CHENNAI SUPER KINGS,To Be Updated
MUMBAI INDIANS,SUNRISERS HYDERABAD,To Be Updated
ROYAL CHALLENGERS BENGALURU,KOLKATA KNIGHT RIDERS,To Be Updated


In [0]:
ipl_cleaned_df.explain("formatted")


== Physical Plan ==
* Project (3)
+- * ColumnarToRow (2)
   +- Scan parquet  (1)


(1) Scan parquet 
Output [10]: [year#3575, series_type#3576, series_name#3577, match_no#3578, match_type#3579, match_name#3580, match_href#3581, match_datetime_start#3584, match_date_end#3585, match_venue#3586]
Batched: true
Location: InMemoryFileIndex [dbfs:/FileStore/tables/ipl_cleaned.parquet]
ReadSchema: struct<year:int,series_type:timestamp,series_name:string,match_no:string,match_type:string,match_name:string,match_href:string,match_datetime_start:string,match_date_end:string,match_venue:string>

(2) ColumnarToRow [codegen id : 1]
Input [10]: [year#3575, series_type#3576, series_name#3577, match_no#3578, match_type#3579, match_name#3580, match_href#3581, match_datetime_start#3584, match_date_end#3585, match_venue#3586]

(3) Project [codegen id : 1]
Output [12]: [year#3575, series_type#3576, series_name#3577, match_no#3578, match_type#3579, match_name#3580, match_href#3581, trim(split(match_name#358

In [0]:
ipl_cleaned_df.select("match_team1", "match_team2").explain("extended")


== Parsed Logical Plan ==
'Project ['match_team1, 'match_team2]
+- Project [year#3575, series_type#3576, series_name#3577, match_no#3578, match_type#3579, match_name#3580, match_href#3581, match_team1#3695, match_team2#3708, match_datetime_start#3584, match_date_end#3585, coalesce(match_venue#3751, cast(To Be Updated as string)) AS match_venue#3776]
   +- Project [year#3575, series_type#3576, series_name#3577, match_no#3578, match_type#3579, match_name#3580, match_href#3581, match_team1#3695, match_team2#3708, match_datetime_start#3584, match_date_end#3585, coalesce(match_venue#3586, cast(To Be Updated as string)) AS match_venue#3751]
      +- Project [year#3575, series_type#3576, series_name#3577, match_no#3578, match_type#3579, match_name#3580, match_href#3581, match_team1#3695, trim(split(split(match_name#3580,  vs , -1)[1], ,, -1)[0], None) AS match_team2#3708, match_datetime_start#3584, match_date_end#3585, match_venue#3586]
         +- Project [year#3575, series_type#3576, series