In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number, count
from pyspark.sql.window import Window

# Step 1: Obtain a Spark session (getOrCreate reuses an existing session if there is one)
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [0]:
# Step 2: Build the sample DataFrame.
# Each record is (id, visit_date, people); visit_date is kept as a
# 'YYYY-MM-DD' string rather than a date type.
columns = ["id", "visit_date", "people"]

data = [
    (1, "2017-01-01", 10),
    (2, "2017-01-02", 109),
    (3, "2017-01-03", 150),
    (4, "2017-01-04", 99),
    (5, "2017-01-05", 145),
    (6, "2017-01-06", 1455),
    (7, "2017-01-07", 199),
    (8, "2017-01-09", 188),
]

df = spark.createDataFrame(data, schema=columns)

display(df)

In [0]:
# Step 3: Keep only the rows where people is at least 100
filtered_df = df.where(col("people") >= 100)
display(filtered_df)

In [0]:
# Step 4: Number the surviving rows in id order and derive a grouping key.
# rn grows by exactly 1 per row, so id - rn stays constant while ids are
# consecutive and changes whenever an id is skipped; rows that share a
# group_id therefore form one unbroken run of ids.
# NOTE: the window has no partitionBy, so the ordering is global across all rows.
ranked_df = (
    filtered_df
    .withColumn("rn", row_number().over(Window.orderBy("id")))
    .withColumn("group_id", col("id") - col("rn"))
)
display(ranked_df)

In [0]:
# Step 5: Count the rows in each group_id and keep the groups with >= 3 rows.
# Rows sharing a group_id are a run of consecutive ids, so cnt is the run length.
group_counts = ranked_df.groupBy("group_id").agg(count("*").alias("cnt"))
valid_groups = group_counts.filter(col("cnt") >= 3).select("group_id")

# Step 6: Keep only the records whose group_id appears in valid_groups.
# A left-semi join is used because this step is purely an existence filter:
# it returns columns from ranked_df only and can never duplicate a row,
# even if the right-hand side were to contain repeated keys.
result_df = (
    ranked_df
    .join(valid_groups, on="group_id", how="left_semi")
    .select("id", "visit_date", "people")
    .orderBy(col("visit_date"))
)

display(result_df)