In [0]:

df = spark.read.option("header", True).csv("/FileStore/tables/ipl_2021_matches.csv")
df.show(5)



+----+-----------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type|         series_name|match_no|match_type|          match_name|          match_href|match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-----------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|2021|        T20|Indian Premier Le...|    null|    League|MUMBAI INDIANS vs...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|        T20|Indian Premier Le...|    null|    League|CHENNAI SUPER KIN...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|        T20|Indian Premier Le...|    null|    League|SUNRISERS HYDERAB...|https://www.cricb...|       null|       null|     

In [0]:

ipl_df_string = spark.read.option("header", True).csv("/FileStore/tables/ipl_2021_matches.csv")

ipl_df_string.printSchema()
ipl_df_string.show(5, truncate=False)


root
 |-- year: string (nullable = true)
 |-- series_type: string (nullable = true)
 |-- series_name: string (nullable = true)
 |-- match_no: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- match_name: string (nullable = true)
 |-- match_href: string (nullable = true)
 |-- match_team1: string (nullable = true)
 |-- match_team2: string (nullable = true)
 |-- match_datetime_start: string (nullable = true)
 |-- match_date_end: string (nullable = true)
 |-- match_venue: string (nullable = true)

+----+-----------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type|series_name               |match_no|match_type|match_name                                              |match_href                                                          

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

ipl_schema = StructType([
    StructField("year", IntegerType(), True),
    StructField("series_type", TimestampType(), True),
    StructField("series_name", StringType(), True),
    StructField("match_no", StringType(), True),
    StructField("match_type", StringType(), True),
    StructField("match_name", StringType(), True),
    StructField("match_href", StringType(), True),
    StructField("match_team1", StringType(), True),
    StructField("match_team2", StringType(), True),
    StructField("match_datetime_start", StringType(), True),
    StructField("match_date_end", StringType(), True),
    StructField("match_venue", StringType(), True)
])

ipl_df_struct = spark.read.option("header", True).schema(ipl_schema).csv("/FileStore/tables/ipl_2021_matches.csv")

ipl_df_struct.printSchema()
ipl_df_struct.show(5, truncate=False)


root
 |-- year: integer (nullable = true)
 |-- series_type: timestamp (nullable = true)
 |-- series_name: string (nullable = true)
 |-- match_no: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- match_name: string (nullable = true)
 |-- match_href: string (nullable = true)
 |-- match_team1: string (nullable = true)
 |-- match_team2: string (nullable = true)
 |-- match_datetime_start: string (nullable = true)
 |-- match_date_end: string (nullable = true)
 |-- match_venue: string (nullable = true)

+----+-------------------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type        |series_name               |match_no|match_type|match_name                                              |match_href                                      

In [0]:
ipl_cleaned_df = ipl_df_struct.dropna(subset=["match_name", "match_href", "year"])

ipl_cleaned_df.show(5, truncate=False)


+----+-------------------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type        |series_name               |match_no|match_type|match_name                                              |match_href                                                                                   |match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------+-----------+--------------------+--------------+-----------+
|2021|2025-04-09 20:00:00|Indian Premier League 2021|null    |League    |MUMBAI INDIANS vs ROYAL CHALLENGERS 

In [0]:
from pyspark.sql.functions import split, trim

ipl_cleaned_df = ipl_cleaned_df.withColumn("match_team1", trim(split("match_name", " vs ")[0]))
ipl_cleaned_df = ipl_cleaned_df.withColumn("match_team2", trim(split("match_name", " vs ")[1]))

ipl_cleaned_df.select("match_team1", "match_team2", "match_name").show(5, truncate=False)


+---------------------+--------------------------------------+--------------------------------------------------------+
|match_team1          |match_team2                           |match_name                                              |
+---------------------+--------------------------------------+--------------------------------------------------------+
|MUMBAI INDIANS       |ROYAL CHALLENGERS BENGALURU, 1st Match|MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match|
|CHENNAI SUPER KINGS  |DELHI CAPITALS, 2nd Match             |CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match        |
|SUNRISERS HYDERABAD  |KOLKATA KNIGHT RIDERS, 3rd Match      |SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, 3rd Match |
|RAJASTHAN ROYALS     |PUNJAB KINGS, 4th Match               |RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match             |
|KOLKATA KNIGHT RIDERS|MUMBAI INDIANS, 5th Match             |KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th Match      |
+---------------------+-----------------

In [0]:
ipl_cleaned_df = ipl_cleaned_df.dropna(subset=["match_team1", "match_team2"])
ipl_cleaned_df.show(5, truncate=False)


+----+-------------------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+---------------------+--------------------------------------+--------------------+--------------+-----------+
|year|series_type        |series_name               |match_no|match_type|match_name                                              |match_href                                                                                   |match_team1          |match_team2                           |match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+---------------------+--------------------------------------+--------------------+--------------+-----------

In [0]:
ipl_cleaned_df = ipl_cleaned_df.dropna(subset=["match_team1", "match_team2"])
ipl_cleaned_df.show(5, truncate=False)


+----+-------------------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+---------------------+--------------------------------------+--------------------+--------------+-----------+
|year|series_type        |series_name               |match_no|match_type|match_name                                              |match_href                                                                                   |match_team1          |match_team2                           |match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+---------------------+--------------------------------------+--------------------+--------------+-----------

In [0]:
ipl_cleaned_df.explain("simple")
ipl_cleaned_df.explain("formatted")


== Physical Plan ==
*(1) Project [year#3004, series_type#3005, series_name#3006, match_no#3007, match_type#3008, match_name#3009, match_href#3010, trim(split(match_name#3009,  vs , 2)[0], None) AS match_team1#3163, trim(split(match_name#3009,  vs , 3)[1], None) AS match_team2#3176, match_datetime_start#3013, match_date_end#3014, match_venue#3015]
+- *(1) Filter (atleastnnonnulls(3, match_name#3009, match_href#3010, year#3004) AND atleastnnonnulls(2, trim(split(match_name#3009,  vs , 2)[0], None), trim(split(match_name#3009,  vs , 3)[1], None)))
   +- FileScan csv [year#3004,series_type#3005,series_name#3006,match_no#3007,match_type#3008,match_name#3009,match_href#3010,match_datetime_start#3013,match_date_end#3014,match_venue#3015] Batched: false, DataFilters: [atleastnnonnulls(3, match_name#3009, match_href#3010, year#3004), atleastnnonnulls(2, trim(split..., Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/FileStore/tables/ipl_2021_matches.csv], PartitionFilters: [], PushedFilt

In [0]:
ipl_cleaned_df.createOrReplaceTempView("ipl_matches")

spark.sql("""
    SELECT match_team1, COUNT(*) as match_count 
    FROM ipl_matches 
    GROUP BY match_team1 
    ORDER BY match_count DESC
""").show()


+--------------------+-----------+
|         match_team1|match_count|
+--------------------+-----------+
|      DELHI CAPITALS|          9|
| CHENNAI SUPER KINGS|          8|
|ROYAL CHALLENGERS...|          8|
| SUNRISERS HYDERABAD|          8|
|KOLKATA KNIGHT RI...|          8|
|    RAJASTHAN ROYALS|          7|
|      MUMBAI INDIANS|          7|
|        PUNJAB KINGS|          7|
+--------------------+-----------+



In [0]:
ipl_cleaned_df.show(5)


+----+-------------------+--------------------+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+-----------+
|year|        series_type|         series_name|match_no|match_type|          match_name|          match_href|         match_team1|         match_team2|match_datetime_start|match_date_end|match_venue|
+----+-------------------+--------------------+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+-----------+
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|MUMBAI INDIANS vs...|https://www.cricb...|      MUMBAI INDIANS|ROYAL CHALLENGERS...|                null|          null|       null|
|2021|2025-04-09 20:00:00|Indian Premier Le...|    null|    League|CHENNAI SUPER KIN...|https://www.cricb...| CHENNAI SUPER KINGS|DELHI CAPITALS, 2...|                null|          null|       null|


In [0]:

ipl_cleaned_df.explain()

ipl_cleaned_df.explain(mode="extended")

ipl_cleaned_df.explain(mode="codegen")

ipl_cleaned_df.explain(mode="cost")

ipl_cleaned_df.explain(mode="formatted")


== Physical Plan ==
*(1) Project [year#3004, series_type#3005, series_name#3006, match_no#3007, match_type#3008, match_name#3009, match_href#3010, trim(split(match_name#3009,  vs , 2)[0], None) AS match_team1#3163, trim(split(match_name#3009,  vs , 3)[1], None) AS match_team2#3176, match_datetime_start#3013, match_date_end#3014, match_venue#3015]
+- *(1) Filter (atleastnnonnulls(3, match_name#3009, match_href#3010, year#3004) AND atleastnnonnulls(2, trim(split(match_name#3009,  vs , 2)[0], None), trim(split(match_name#3009,  vs , 3)[1], None)))
   +- FileScan csv [year#3004,series_type#3005,series_name#3006,match_no#3007,match_type#3008,match_name#3009,match_href#3010,match_datetime_start#3013,match_date_end#3014,match_venue#3015] Batched: false, DataFilters: [atleastnnonnulls(3, match_name#3009, match_href#3010, year#3004), atleastnnonnulls(2, trim(split..., Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/FileStore/tables/ipl_2021_matches.csv], PartitionFilters: [], PushedFilt

In [0]:
pandas_df = ipl_cleaned_df.limit(5).toPandas()
print(pandas_df)


   year         series_type                 series_name match_no match_type  \
0  2021 2025-04-09 20:00:00  Indian Premier League 2021     None     League   
1  2021 2025-04-09 20:00:00  Indian Premier League 2021     None     League   
2  2021 2025-04-09 20:00:00  Indian Premier League 2021     None     League   
3  2021 2025-04-09 20:00:00  Indian Premier League 2021     None     League   
4  2021 2025-04-09 20:00:00  Indian Premier League 2021     None     League   

                                          match_name  \
0  MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU,...   
1   CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match   
2  SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, ...   
3        RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match   
4  KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th M...   

                                          match_href            match_team1  \
0  https://www.cricbuzz.com/cricket-scores/35612/...         MUMBAI INDIANS   
1  https://www.cricbuzz.com/cr

In [0]:
ipl_cleaned_df.explain("formatted")


== Physical Plan ==
* Project (3)
+- * Filter (2)
   +- Scan csv  (1)


(1) Scan csv 
Output [10]: [year#3004, series_type#3005, series_name#3006, match_no#3007, match_type#3008, match_name#3009, match_href#3010, match_datetime_start#3013, match_date_end#3014, match_venue#3015]
Batched: false
Location: InMemoryFileIndex [dbfs:/FileStore/tables/ipl_2021_matches.csv]
ReadSchema: struct<year:int,series_type:timestamp,series_name:string,match_no:string,match_type:string,match_name:string,match_href:string,match_datetime_start:string,match_date_end:string,match_venue:string>

(2) Filter [codegen id : 1]
Input [10]: [year#3004, series_type#3005, series_name#3006, match_no#3007, match_type#3008, match_name#3009, match_href#3010, match_datetime_start#3013, match_date_end#3014, match_venue#3015]
Condition : (atleastnnonnulls(3, match_name#3009, match_href#3010, year#3004) AND atleastnnonnulls(2, trim(split(match_name#3009,  vs , 2)[0], None), trim(split(match_name#3009,  vs , 3)[1], None)))

(3