In [None]:
https://www.youtube.com/watch?v=MpAMjtvarrc&list=PLBTZqjSKn0IeKBQDjLmzisazhqQy4iGkb&index=2

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType

spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# Define schema for customer_orders table
customer_orders_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("order_date", StringType(), True),
    StructField("order_amount", IntegerType(), True)
])

# Create DataFrame with initial data
customer_orders_data = [
    (1, 100, "2022-01-01", 2000),
    (2, 200, "2022-01-01", 2500),
    (3, 300, "2022-01-01", 2100),
    (4, 100, "2022-01-02", 2000),
    (5, 400, "2022-01-02", 2200),
    (6, 500, "2022-01-02", 2700),
    (7, 100, "2022-01-03", 3000),
    (8, 400, "2022-01-03", 1000),
    (9, 600, "2022-01-03", 3000)
]

# Create DataFrame
customer_orders_df = spark.createDataFrame(customer_orders_data, schema=customer_orders_schema)

# Convert order_date column to DateType
customer_orders_df = customer_orders_df.withColumn("order_date", customer_orders_df.order_date.cast(DateType()))

# Create a temporary view for SQL queries
customer_orders_df.createOrReplaceTempView("customer_orders")

print("customer_orders table and view created successfully.")


customer_orders table and view created successfully.


In [26]:
spark.sql("""
    with first_visit as (select customer_id, order_date,
    min(order_date) over(partition by customer_id) as visit_date
    from customer_orders),
    
    flag as (select 
        order_date, customer_id, 
        CASE WHEN order_date = visit_date THEN 1 ELSE 0 END as first_visit_flag,
        CASE WHEN order_date != visit_date THEN 1 ELSE 0 END as repeat_visit_flag
    from first_visit)
    
    select 
    order_date,
    SUM(first_visit_flag) as first_visit, 
    SUM(repeat_visit_flag) as repeat_visit
    from flag
    group by order_date
""").show()

+----------+-----------+------------+
|order_date|first_visit|repeat_visit|
+----------+-----------+------------+
|2022-01-03|          1|           2|
|2022-01-01|          3|           0|
|2022-01-02|          2|           1|
+----------+-----------+------------+



In [33]:
from pyspark.sql.functions import *
from pyspark.sql import Window

window_spec = Window.partitionBy(col("customer_id")).orderBy(col("order_date"))
first_visit = customer_orders_df.select(
                    "customer_id", "order_date",
                     min(col("order_date")).over(window_spec).alias("first_visit_date")
                )

flag_df = first_visit.select(
        "order_date", "customer_id",
         when(col("order_date") == col("first_visit_date"), 1).otherwise(0).alias("first_visit_flag"),
         when(col("order_date") != col("first_visit_date"), 1).otherwise(0).alias("repeat_visit_flag")

)

In [36]:
flag_df.groupBy(col("order_date")).agg(
        sum(col("first_visit_flag")),
        sum(col("repeat_visit_flag"))
).show()

+----------+---------------------+----------------------+
|order_date|sum(first_visit_flag)|sum(repeat_visit_flag)|
+----------+---------------------+----------------------+
|2022-01-03|                    1|                     2|
|2022-01-01|                    3|                     0|
|2022-01-02|                    2|                     1|
+----------+---------------------+----------------------+

