In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pathlib import Path

# Find, per user, transactions that happened more than 30 days after that
# user's previous transaction, write them to Parquet, and preview a few rows.

# Build (or reuse) a Spark session running locally.
spark = SparkSession.builder.appName("ExampleApp").master("local[*]").getOrCreate()

# Both inputs have a header row; column types are inferred by Spark.
user_df = spark.read.csv("users.csv", header=True, inferSchema=True)
transaction_df = spark.read.csv("transactions.csv", header=True, inferSchema=True)

# Reduce the transaction timestamp to a calendar date so gaps are whole days.
transaction_df = transaction_df.withColumn("timestamp", F.to_date("timestamp", "yyyy-MM-dd"))

# Inner join: only transactions whose user_id exists in users.csv are kept.
joined_df = user_df.join(transaction_df, on="user_id", how="inner")

# Per-user window in chronological order, used to look at the previous row.
windowDriver = Window.partitionBy("user_id").orderBy(F.asc("timestamp"))

# prev_date is the date of the user's preceding transaction; it is null for
# each user's first transaction because there is no earlier row to lag to.
joined_df = joined_df.withColumn("prev_date", F.lag("timestamp", 1).over(windowDriver))

# Days elapsed since the previous transaction. The PySpark function is
# `datediff(end, start)`; `F.dateDiff` does not exist and raised
# AttributeError before any data was processed.
joined_df = joined_df.withColumn("date_diff", F.datediff(F.col("timestamp"), F.col("prev_date")))

# Rows with a null date_diff (first transaction per user) do not satisfy the
# predicate and are dropped along with gaps of 30 days or less.
result_df = joined_df.filter("date_diff > 30")

# coalesce(1) collapses the result to a single partition before writing;
# any existing output at this path is replaced.
result_df.coalesce(1).write.parquet("output/date_diff.parquet", mode="overwrite")
result_df.show(5, truncate=False)