In [1]:
from pyspark.sql.types import StructType, StructField, StringType, DateType
from datetime import datetime
from pyspark.sql import SparkSession
# One session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("PySparkTables")
    .getOrCreate()
)

# Activity log, one tuple per record: (ISO date string, account id, user id).
records_data = [
    ("2021-01-01", "A1", "U1"),
    ("2021-01-01", "A1", "U2"),
    ("2021-01-06", "A1", "U3"),
    ("2021-01-02", "A1", "U1"),
    ("2020-12-24", "A1", "U2"),
    ("2020-12-08", "A1", "U1"),
    ("2020-12-09", "A1", "U1"),
    ("2021-01-10", "A2", "U4"),
    ("2021-01-11", "A2", "U4"),
    ("2021-01-12", "A2", "U4"),
    ("2021-01-15", "A2", "U5"),
    ("2020-12-17", "A2", "U4"),
    ("2020-12-25", "A3", "U6"),
    ("2020-12-25", "A3", "U6"),
    ("2020-12-25", "A3", "U6"),
    ("2020-12-06", "A3", "U7"),
    ("2020-12-06", "A3", "U6"),
    ("2021-01-14", "A3", "U6"),
    ("2021-02-07", "A1", "U1"),
    ("2021-02-10", "A1", "U2"),
    ("2021-02-01", "A2", "U4"),
    ("2021-02-01", "A2", "U5"),
    ("2020-12-05", "A1", "U8"),
]

# Swap each leading date string for a datetime.date so it fits the DateType
# column declared below; the two id fields are carried over untouched.
records_data_typed = [
    (datetime.strptime(row[0], "%Y-%m-%d").date(), *row[1:])
    for row in records_data
]

# Three non-nullable columns: a date followed by two string identifiers.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("record_date", DateType()),
        ("account_id", StringType()),
        ("user_id", StringType()),
    )
])

# Build the DataFrame and expose it to spark.sql() under the name "account_records".
records_df = spark.createDataFrame(records_data_typed, schema)
records_df.createOrReplaceTempView("account_records")


In [9]:
# Users with a run of at least three consecutive calendar days of records.
# Technique (gaps-and-islands): within one user, record_date minus its
# 0-based row number is constant across a run of consecutive days, so
# grouping on that difference isolates each run.
#
# The `daily` CTE collapses the log to one row per user per day first:
# repeated records on the same day would otherwise consume extra row numbers
# and split a genuine run into smaller groups.
# The filter is `>= 3` so runs longer than three days also qualify.
spark.sql("""
    with daily as (
    select distinct user_id, record_date
    from account_records),

    ranked as (
    select *,
    row_number() over(partition by user_id order by record_date) rn
    from daily),

    grp as (select user_id, record_date,
    DATE_SUB(record_date, rn-1) as grp_date
    from ranked)

    select user_id, grp_date, count(*) from grp group by user_id, grp_date having count(*) >= 3
""").show()

+-------+----------+--------+
|user_id|  grp_date|count(1)|
+-------+----------+--------+
|     U4|2021-01-09|       3|
+-------+----------+--------+



In [18]:
# Users with records on three consecutive calendar days, found by checking
# that a day's previous and next active days are each exactly one day away.
#
# The `daily` CTE keeps one row per user per day: with repeated records on
# the same day, LAG/LEAD would return that same date (DATEDIFF = 0) and the
# middle day of a real three-day run would never satisfy both conditions.
spark.sql("""
WITH daily AS
  (SELECT DISTINCT user_id,
                   record_date
   FROM account_records),
     consecutive_days AS
  (SELECT user_id,
          record_date,
          LAG(record_date, 1) OVER (PARTITION BY user_id
                                    ORDER BY record_date) AS prev_day,
          LEAD(record_date, 1) OVER (PARTITION BY user_id
                                     ORDER BY record_date) AS next_day
   FROM daily)
SELECT DISTINCT user_id
FROM consecutive_days
WHERE DATEDIFF(record_date, prev_day) = 1
  AND DATEDIFF(next_day, record_date) = 1
""").show()

+-------+
|user_id|
+-------+
|     U4|
+-------+



In [20]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Shortest run of consecutive calendar days that counts as a streak.
MIN_STREAK_DAYS = 3

# DataFrame-API version of the gaps-and-islands query: within one user,
# record_date minus its 0-based rank is constant across consecutive days.
window_spec = Window.partitionBy("user_id").orderBy("record_date")

# dense_rank (not row_number) so several records on the same day share one
# rank; row_number would hand them different numbers and split a real run.
ranked_df = records_df.withColumn("rn", F.dense_rank().over(window_spec))
grp_df = ranked_df.withColumn(
    "grp_date",
    F.date_sub(F.col("record_date"), F.col("rn") - F.lit(1))
)

# Count distinct days per run so same-day duplicates do not inflate the
# streak length, and keep every run of at least MIN_STREAK_DAYS days.
result_df = (
    grp_df.groupBy("user_id", "grp_date")
    .agg(F.countDistinct("record_date").alias("cnt"))
    .filter(F.col("cnt") >= MIN_STREAK_DAYS)
)

result_df.show()


+-------+----------+---+
|user_id|  grp_date|cnt|
+-------+----------+---+
|     U4|2021-01-09|  3|
+-------+----------+---+

