In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start a Spark session for this notebook, or attach to one that already exists
spark = (
    SparkSession.builder
    .appName("EmployeeTable")
    .getOrCreate()
)

# Sample medal results, one tuple per row: (ID, event, YEAR, GOLD, SILVER, BRONZE)
data = [
    (1, '100m', 2016, 'Amthhew Mcgarray', 'donald', 'barbara'),
    (2, '200m', 2016, 'Nichole', 'Alvaro Eaton', 'janet Smith'),
    (3, '500m', 2016, 'Charles', 'Nichole', 'Susana'),
    (4, '100m', 2016, 'Ronald', 'maria', 'paula'),
    (5, '200m', 2016, 'Alfred', 'carol', 'Steven'),
    (6, '500m', 2016, 'Nichole', 'Alfred', 'Brandon'),
    (7, '100m', 2016, 'Charles', 'Dennis', 'Susana'),
    (8, '200m', 2016, 'Thomas', 'Dawn', 'catherine'),
    (9, '500m', 2016, 'Thomas', 'Dennis', 'paula'),
    (10, '100m', 2016, 'Charles', 'Dennis', 'Susana'),
    (11, '200m', 2016, 'jessica', 'Donald', 'Stefeney'),
    (12, '500m', 2016, 'Thomas', 'Steven', 'Catherine'),
]

# Column names, listed in the same order as the tuple fields above
columns = ["ID", "event", "YEAR", "GOLD", "SILVER", "BRONZE"]

# Turn the tuples into a DataFrame with those column names
events_df = spark.createDataFrame(data, columns)

# Register the DataFrame as the temp view "events" so Spark SQL can query it
events_df.createOrReplaceTempView("events")

# Sanity check: display every row through the SQL interface
spark.sql("SELECT * FROM events").show()


+---+-----+----+----------------+------------+-----------+
| ID|event|YEAR|            GOLD|      SILVER|     BRONZE|
+---+-----+----+----------------+------------+-----------+
|  1| 100m|2016|Amthhew Mcgarray|      donald|    barbara|
|  2| 200m|2016|         Nichole|Alvaro Eaton|janet Smith|
|  3| 500m|2016|         Charles|     Nichole|     Susana|
|  4| 100m|2016|          Ronald|       maria|      paula|
|  5| 200m|2016|          Alfred|       carol|     Steven|
|  6| 500m|2016|         Nichole|      Alfred|    Brandon|
|  7| 100m|2016|         Charles|      Dennis|     Susana|
|  8| 200m|2016|          Thomas|        Dawn|  catherine|
|  9| 500m|2016|          Thomas|      Dennis|      paula|
| 10| 100m|2016|         Charles|      Dennis|     Susana|
| 11| 200m|2016|         jessica|      Donald|   Stefeney|
| 12| 500m|2016|          Thomas|      Steven|  Catherine|
+---+-----+----+----------------+------------+-----------+



In [2]:
# Write a query to find the number of gold medals per swimmer, for swimmers who won only gold medals.

In [10]:
# Gold-medal count per swimmer, for swimmers who never took silver or bronze.
# The subquery drops NULLs on purpose: `x NOT IN (...)` evaluates to NULL
# (never TRUE) as soon as the list contains a NULL, so a single missing
# silver/bronze entry would otherwise make the query return no rows at all.
spark.sql("""
    select gold as player_name, count(*) as no_of_medals from events
    where gold not in (
        select silver from events where silver is not null
        union all
        select bronze from events where bronze is not null
    )
    group by gold
""").show()

+----------------+------------+
|     player_name|no_of_medals|
+----------------+------------+
|Amthhew Mcgarray|           1|
|         Charles|           3|
|          Ronald|           1|
|          Thomas|           3|
|         jessica|           1|
+----------------+------------+



In [25]:
# PySpark DataFrame API: the same exclusion filter, built from a collected list

# Stack the silver and bronze names into a single "silver_bronze" column
silver_bronze = events_df.select(col("silver").alias("silver_bronze")).union(
    events_df.select(col("bronze").alias("silver_bronze"))
)

# Collect those names to the driver, then keep only the rows whose gold
# winner is not among them
events_df.filter(
    ~col("gold").isin([row["silver_bronze"] for row in silver_bronze.collect()])
).show()

+---+-----+----+----------------+-------+---------+
| ID|event|YEAR|            GOLD| SILVER|   BRONZE|
+---+-----+----+----------------+-------+---------+
|  1| 100m|2016|Amthhew Mcgarray| donald|  barbara|
|  3| 500m|2016|         Charles|Nichole|   Susana|
|  4| 100m|2016|          Ronald|  maria|    paula|
|  7| 100m|2016|         Charles| Dennis|   Susana|
|  8| 200m|2016|          Thomas|   Dawn|catherine|
|  9| 500m|2016|          Thomas| Dennis|    paula|
| 10| 100m|2016|         Charles| Dennis|   Susana|
| 11| 200m|2016|         jessica| Donald| Stefeney|
| 12| 500m|2016|          Thomas| Steven|Catherine|
+---+-----+----+----------------+-------+---------+



In [27]:
# Same filter without collecting to the driver: an anti join keeps only the
# events rows whose gold winner has no match in silver_bronze
events_df.alias("e").join(
    silver_bronze.alias("sb"),
    on=col("e.gold") == col("sb.silver_bronze"),
    how="anti",
).show()

+---+-----+----+----------------+-------+---------+
| ID|event|YEAR|            GOLD| SILVER|   BRONZE|
+---+-----+----+----------------+-------+---------+
|  1| 100m|2016|Amthhew Mcgarray| donald|  barbara|
|  3| 500m|2016|         Charles|Nichole|   Susana|
|  4| 100m|2016|          Ronald|  maria|    paula|
|  7| 100m|2016|         Charles| Dennis|   Susana|
|  8| 200m|2016|          Thomas|   Dawn|catherine|
|  9| 500m|2016|          Thomas| Dennis|    paula|
| 10| 100m|2016|         Charles| Dennis|   Susana|
| 11| 200m|2016|         jessica| Donald| Stefeney|
| 12| 500m|2016|          Thomas| Steven|Catherine|
+---+-----+----+----------------+-------+---------+



In [7]:
# Alternative: unpivot the three medal columns into (player_name, medal_type)
# rows, then keep the players whose only medal type is gold.
# String literals use single quotes (standard SQL); double-quoted text can be
# parsed as an identifier when ANSI double-quoted identifiers are enabled,
# and the HAVING clause below already uses single quotes.
spark.sql("""
    with cte as (
    select gold as player_name, 'gold' as medal_type from events union all
    select silver as player_name, 'silver' as medal_type from events union all
    select bronze as player_name, 'bronze' as medal_type from events)

    select player_name, count(*) as no_of_gold from cte
    group by player_name
    having count(distinct medal_type) = 1 and max(medal_type) = 'gold'
""").show()

+----------------+----------+
|     player_name|no_of_gold|
+----------------+----------+
|Amthhew Mcgarray|         1|
|         Charles|         3|
|          Ronald|         1|
|          Thomas|         3|
|         jessica|         1|
+----------------+----------+

