In [None]:
# Reference: https://www.youtube.com/watch?v=qyAgWL066Vo&list=PLBTZqjSKn0IeKBQDjLmzisazhqQy4iGkb

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Reuse the active SparkSession when one already exists (managed notebook
# runtimes usually pre-define `spark`), otherwise start one. Without this the
# cell fails with a NameError on a fresh kernel that does not inject `spark`.
spark = SparkSession.builder.getOrCreate()

# Schema for the match results: the two teams and the winner, all nullable strings.
icc_schema = StructType([
    StructField("team1", StringType(), True),
    StructField("team2", StringType(), True),
    StructField("winner", StringType(), True)
])

# One row per match, in (team1, team2, winner) order.
icc_data = [
    ("India", "SL", "India"),
    ("SL", "Aus", "Aus"),
    ("SA", "Eng", "Eng"),
    ("Eng", "NZ", "NZ"),
    ("Aus", "India", "India")
]

icc_df = spark.createDataFrame(icc_data, schema=icc_schema)

# Register the DataFrame as the temporary view "data" so the SQL cells can
# query it with `from data`.
# NOTE: only this session-scoped view is created here; no persistent (Delta)
# table is written by this cell.
icc_df.createOrReplaceTempView("data")

print("icc_world_cup table and view created successfully.")


icc_world_cup table and view created successfully.


In [10]:
# For every match, report team1 together with a 1/0 flag telling whether
# team1 was the winner.
team1_flags_query = """
    select team1 as team_name,
    case when team1=winner then 1 else 0 end as win_flag
    from data"""

# Keep the DataFrame as the cell's last expression so its schema is displayed.
spark.sql(team1_flags_query)

DataFrame[team_name: string, win_flag: int]

In [17]:
# Stack the team1 and team2 sides of each match into one (team_name, win_flag)
# row set, then aggregate matches played, wins and losses per team.
points_table_query = """
    with cte as (select team1 as team_name,
    case when team1=winner then 1 else 0 end as win_flag
    from data
    union all
    select team2 as team_name,
    case when team2=winner then 1 else 0  end as win_flag
    from data)
    
    select team_name, 
           count(*) as no_of_matches_played,
           sum(win_flag) as no_of_wins,
           count(*) - sum(win_flag) as no_of_losses
    from cte group by team_name
"""

points_table_df = spark.sql(points_table_query)
points_table_df.show()


+---------+--------------------+----------+------------+
|team_name|no_of_matches_played|no_of_wins|no_of_losses|
+---------+--------------------+----------+------------+
|    India|                   2|         2|           0|
|       SL|                   2|         0|           2|
|       SA|                   1|         0|           1|
|      Eng|                   2|         1|           1|
|      Aus|                   2|         1|           1|
|       NZ|                   1|         1|           0|
+---------+--------------------+----------+------------+



In [21]:
from pyspark.sql.functions import *

# Build the DataFrame equivalent of the SQL CTE: one row per (match, side),
# carrying the team name and a 1/0 flag for whether that side won.
team1_flags = icc_df.select(
    col("team1").alias("team_name"),
    when(col("team1") == col("winner"), 1).otherwise(0).alias("win_flag"),
)
team2_flags = icc_df.select(
    col("team2").alias("team_name"),
    when(col("team2") == col("winner"), 1).otherwise(0).alias("win_flag"),
)

# `union` keeps duplicate rows (SQL UNION ALL semantics) and matches columns
# by position, so both selects must list team_name then win_flag.
cte_df = team1_flags.union(team2_flags)


In [24]:
# `count` and `sum` here are the Spark SQL functions brought in by the
# wildcard import from pyspark.sql.functions, not the Python builtins.
matches_played = count("*")
wins = sum("win_flag")

# Per team: matches played, wins, and losses (matches played minus wins).
team_summary = cte_df.groupBy("team_name").agg(
    matches_played,
    wins,
    matches_played - wins,
)
team_summary.show()

+---------+--------+-------------+--------------------------+
|team_name|count(1)|sum(win_flag)|(count(1) - sum(win_flag))|
+---------+--------+-------------+--------------------------+
|    India|       2|            2|                         0|
|       SL|       2|            0|                         2|
|       SA|       1|            0|                         1|
|      Eng|       2|            1|                         1|
|      Aus|       2|            1|                         1|
|       NZ|       1|            1|                         0|
+---------+--------+-------------+--------------------------+

