In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Single Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("PySparkTables")
    .getOrCreate()
)

# Raw rows as (player_id, device_id, event_date as 'YYYY-MM-DD' text, games_played).
activity_data = [
    (1, 2, '2016-03-01', 5),
    (1, 2, '2016-03-02', 6),
    (2, 3, '2017-06-25', 1),
    (3, 1, '2016-03-02', 0),
    (3, 4, '2018-07-03', 5),
]

# Same rows, with the text date parsed into a datetime.date so it fits the
# DateType column declared in the schema.
activity_data_typed = [
    (row[0], row[1], datetime.strptime(row[2], "%Y-%m-%d").date(), row[3])
    for row in activity_data
]

# Four non-nullable columns; only event_date is a date, the rest are integers.
activity_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("player_id", IntegerType()),
        ("device_id", IntegerType()),
        ("event_date", DateType()),
        ("games_played", IntegerType()),
    )
])

# Build the DataFrame and expose it to Spark SQL under the name "Activity".
activity_df = spark.createDataFrame(activity_data_typed, activity_schema)
activity_df.createOrReplaceTempView("Activity")


In [8]:
# Window-function approach: annotate every Activity row with its player's
# earliest event_date and highest games_played, then keep only the rows that
# match both values at once.
#
# The windows deliberately have no ORDER BY. Adding one switches the default
# frame to "unbounded preceding .. current row" (a running aggregate). For
# min(event_date) ordered by event_date that running value happens to equal
# the partition minimum, so the rows returned are the same; leaving it out
# states the intent directly and avoids an unnecessary sort key.
spark.sql("""
    with cte as (
    select *,
    min(event_date) over(partition by player_id) as min_date,
    max(games_played) over(partition by player_id) as max_games_played
    from Activity)

    select * from cte where event_date = min_date and games_played = max_games_played
""").show()

+---------+---------+----------+------------+----------+----------------+
|player_id|device_id|event_date|games_played|  min_date|max_games_played|
+---------+---------+----------+------------+----------+----------------+
|        2|        3|2017-06-25|           1|2017-06-25|               1|
+---------+---------+----------+------------+----------+----------------+



In [12]:
# Subquery approach: keep the Activity rows whose (player_id, event_date) pair
# is that player's earliest event AND whose (player_id, games_played) pair is
# that player's maximum games_played.
#
# The SQL text intentionally carries no trailing ';'. spark.sql() takes a
# single statement, and Spark releases before 3.0 raise a ParseException when
# the statement ends with a semicolon.
spark.sql(
    """
        select * from Activity
        where (player_id, event_date) in (
            select player_id, min(event_date)
            from Activity
            group by player_id
        )
        and (player_id, games_played) in (
            select player_id, max(games_played)
            from Activity
            group by player_id
        )
    """
).show()

+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
|        2|        3|2017-06-25|           1|
+---------+---------+----------+------------+

