In [1]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Obtain the Spark session for this notebook (created on first use).
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# Sample login records: (player_id, device_id, event_date as 'YYYY-MM-DD', games_played).
activity_data = [
    (1, 2, '2016-03-01', 5),
    (1, 2, '2016-03-02', 6),
    (2, 3, '2017-06-25', 1),
    (3, 1, '2016-03-02', 0),
    (3, 4, '2018-07-03', 5),
]


def _with_parsed_date(record):
    """Return the record with its date string replaced by a datetime.date."""
    pid, dev, day_text, played = record
    return (pid, dev, datetime.strptime(day_text, "%Y-%m-%d").date(), played)


# DateType columns need real date objects, so parse the strings up front.
activity_data_typed = list(map(_with_parsed_date, activity_data))

# Every column is declared non-nullable; only event_date is not an integer.
activity_schema = StructType([
    StructField(column_name, column_type, nullable=False)
    for column_name, column_type in [
        ("player_id", IntegerType()),
        ("device_id", IntegerType()),
        ("event_date", DateType()),
        ("games_played", IntegerType()),
    ]
])

# Build the DataFrame and expose it to Spark SQL under the name "Activity".
activity_df = spark.createDataFrame(data=activity_data_typed, schema=activity_schema)
activity_df.createOrReplaceTempView("Activity")


In [2]:
spark.sql("""
    with cte as (
    select *,
    row_number() over(partition by player_id order by event_date) as rn,
    lead(event_date) over(partition by player_id order by event_date) as next_date
    from Activity)
    
    select
    SUM(CASE WHEN DATEDIFF(next_date, event_date) = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(DISTINCT player_id)
    from cte
    where rn = 1
    
    
""").show()

+------------------------------------------------------------------------------------------------------------+
|((sum(CASE WHEN (datediff(next_date, event_date) = 1) THEN 1 ELSE 0 END) * 1.0) / count(DISTINCT player_id))|
+------------------------------------------------------------------------------------------------------------+
|                                                                                          0.3333333333333333|
+------------------------------------------------------------------------------------------------------------+



In [3]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, lead, datediff, sum as spark_sum, countDistinct, round

# One window per player, rows ordered from the earliest to the latest login date.
window_spec = Window.partitionBy(col('player_id')).orderBy(col('event_date').asc())

In [4]:
# Number each player's logins chronologically and attach the date of the following login.
activity_with_flags = (
    activity_df
    .withColumn("rn", row_number().over(window_spec))
    .withColumn("next_date", lead("event_date").over(window_spec))
)

# rn == 1 is each player's earliest login; keep only those rows.
first_login = activity_with_flags.where(col("rn") == 1)

In [5]:
# 1 when the player's next login is exactly one day after the first login, else 0.
# eqNullSafe treats a NULL datediff (no later login at all) as "not equal", so the
# flag is 0 instead of NULL. With a plain `== 1` the flag would be NULL, and a data
# set in which every player logged in only once would sum to NULL and report a NULL
# fraction rather than 0.0 (the SQL version's `ELSE 0` already yields 0 there).
returned_next_day = datediff("next_date", "event_date").eqNullSafe(1).cast("int")

# `round` is pyspark.sql.functions.round (imported in an earlier cell), not the builtin.
# first_login holds one row per player, so the distinct count is the number of players.
result = first_login.select(
    round(
        spark_sum(returned_next_day) * 1.0 / countDistinct("player_id"),
        2
    ).alias("fraction")
)

In [6]:
# Print the single-row result; for the sample data the fraction is 0.33.
result.show()

+--------+
|fraction|
+--------+
|    0.33|
+--------+

