In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("PurchasesTable").getOrCreate()

# Purchase schema: three nullable integer columns followed by a nullable
# timestamp recording when the purchase happened.
schema = StructType(
    [
        StructField(column, IntegerType(), True)
        for column in ("user_id", "product_id", "quantity")
    ]
    + [StructField("purchase_date", TimestampType(), True)]
)

# Sample rows as (user_id, product_id, quantity, purchase_date).
# User 536 buys product 3223 on two different days; the other
# (user, product) pairs occur once each.
data = [
    (536, 3223, 6, datetime.fromisoformat("2022-01-11 12:33:44")),
    (827, 3585, 35, datetime.fromisoformat("2022-02-20 14:05:26")),
    (536, 3223, 5, datetime.fromisoformat("2022-03-02 09:33:28")),
    (536, 1435, 10, datetime.fromisoformat("2022-03-02 08:40:00")),
    (827, 2452, 45, datetime.fromisoformat("2022-04-09 00:00:00")),
]

# Materialise the rows as a DataFrame and print every column untruncated.
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)

# Expose the DataFrame to Spark SQL under the view name "products".
df.createOrReplaceTempView("products")



+-------+----------+--------+-------------------+
|user_id|product_id|quantity|purchase_date      |
+-------+----------+--------+-------------------+
|536    |3223      |6       |2022-01-11 12:33:44|
|827    |3585      |35      |2022-02-20 14:05:26|
|536    |3223      |5       |2022-03-02 09:33:28|
|536    |1435      |10      |2022-03-02 08:40:00|
|827    |2452      |45      |2022-04-09 00:00:00|
+-------+----------+--------+-------------------+



In [8]:
# Repeat purchases: (user_id, product_id) pairs bought on at least two
# distinct calendar days. date(purchase_date) reduces each timestamp to its
# date, so several purchases made on the same day are counted once.
repeat_purchases_sql = """
    select user_id, product_id, count(distinct date(purchase_date)) from products
    group by user_id, product_id
    having count(distinct date(purchase_date)) >= 2
"""
spark.sql(repeat_purchases_sql).show()

+-------+----------+-----------------------------+
|user_id|product_id|count(DISTINCT purchase_date)|
+-------+----------+-----------------------------+
|    536|      3223|                            2|
+-------+----------+-----------------------------+

