In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Define schema

spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

customer_schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("visited_on", StringType(), True),  # Using StringType first, will cast to DateType
    StructField("amount", IntegerType(), True)
])

# Create DataFrame
customer_data = [
    (1, "Jhon", "2019-01-01", 100),
    (2, "Daniel", "2019-01-02", 110),
    (3, "Jade", "2019-01-03", 120),
    (4, "Khaled", "2019-01-04", 130),
    (5, "Winston", "2019-01-05", 110),
    (6, "Elvis", "2019-01-06", 140),
    (7, "Anna", "2019-01-07", 150),
    (8, "Maria", "2019-01-08", 80),
    (9, "Jaze", "2019-01-09", 110),
    (1, "Jhon", "2019-01-10", 130),
    (3, "Jade", "2019-01-10", 150)
]

customer_df = spark.createDataFrame(customer_data, schema=customer_schema)

# Convert 'visited_on' column to DateType
# customer_df = customer_df.withColumn("visited_on", customer_df.visited_on.cast(DateType()))

# Create a temporary view for SQL queries
customer_df.createOrReplaceTempView("Customer")

# Create a persistent table (Delta format)
print("Customer table and view created successfully.")


Customer table and view created successfully.


In [30]:
spark.sql("""
    WITH temp AS (
        SELECT visited_on, SUM(amount) AS amount
        FROM Customer
        GROUP BY visited_on
    )
    select visited_on, 
        SUM(amount) OVER (ORDER BY visited_on ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS amount, 
        ROUND(AVG(amount) OVER (ORDER BY visited_on ROWS BETWEEN 6 PRECEDING AND CURRENT ROW), 2) as average_amount
    from temp
    """).show()

+----------+------+--------------+
|visited_on|amount|average_amount|
+----------+------+--------------+
|2019-01-01|   100|         100.0|
|2019-01-02|   210|         105.0|
|2019-01-03|   330|         110.0|
|2019-01-04|   460|         115.0|
|2019-01-05|   570|         114.0|
|2019-01-06|   710|        118.33|
|2019-01-07|   860|        122.86|
|2019-01-08|   840|         120.0|
|2019-01-09|   840|         120.0|
|2019-01-10|  1000|        142.86|
+----------+------+--------------+



In [42]:
from pyspark.sql.functions import *
from pyspark.sql import Window
dedup_cust  = customer_df.groupBy(col("visited_on")).agg(
    sum(col("amount")).alias("amount")
)

In [49]:
windowSpec = Window.orderBy(col("visited_on")).rowsBetween(-6,Window.currentRow)
dedup_cust.select(
      sum(col("amount")).over(windowSpec).alias("rolling_avg")
).show()

+-----------+
|rolling_avg|
+-----------+
|        100|
|        210|
|        330|
|        460|
|        570|
|        710|
|        860|
|        840|
|        840|
|       1000|
+-----------+

