### Task 2 — Aggregation + Top N Per Group (Window Functions)
Scenario

**Transactions dataset:**
```
transaction_id
user_id
transaction_amount
transaction_date
country
```

**Requirements**

- Calculate total daily revenue per country.

- For each country and day, return the top 3 users by revenue.

**Return:**

```
country

transaction_date

user_id

total_user_revenue

rank
```

In [0]:
import builtins
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from datetime import datetime, timedelta
import random

# Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("TopUsersPerDay").getOrCreate()

# Dimensions of the synthetic dataset: every user transacts in every country
# on every generated day.
countries = ["US", "UK", "CA"]
num_users = 10
num_days = 5

start_date = datetime(2026, 2, 10)
data = []

for day in range(num_days):
    transaction_date = (start_date + timedelta(days=day)).date()
    for country in countries:
        for user_id in range(1, num_users + 1):
            # Between 1 and 5 transactions per (day, country, user).
            num_transactions = random.randint(1, 5)
            data.extend(
                (
                    f"t{day}_{country}_{user_id}_{tx_index}",
                    user_id,
                    # Python's own round(), qualified via builtins; presumably
                    # to avoid a clash with a Spark `round` imported elsewhere
                    # in the notebook -- confirm.
                    builtins.round(random.uniform(10, 500), 2),
                    transaction_date,
                    country,
                )
                for tx_index in range(num_transactions)
            )

# Column name / Spark type pairs, in the same order as the tuples in `data`.
# Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("transaction_id", StringType()),
        ("user_id", IntegerType()),
        ("transaction_amount", DoubleType()),
        ("transaction_date", DateType()),
        ("country", StringType()),
    )
])

df = spark.createDataFrame(data, schema)

# Quick sanity check of the generated rows and the resulting schema.
df.show(10, truncate=False)
df.printSchema()


# Solution

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [0]:
# 1. Calculate total daily revenue per country.

# One output row per (country, transaction_date) holding the summed amount,
# displayed in ascending date order.
total_revenue = F.sum("transaction_amount").alias("Total_Revenue")

(
    df.groupBy("country", "transaction_date")
    .agg(total_revenue)
    .orderBy("transaction_date")
    .display()
)

In [0]:
# 2. For each country and day, return the top 3 users by revenue.

# Remove rows that are identical in every column before summing.
df = df.dropDuplicates()

# Revenue per user within each (country, day).
df_agg = df.groupBy("country", "user_id", "transaction_date").agg(
    F.sum("transaction_amount").alias("sum")
)

# Rank users inside each (country, day) group, highest revenue first.
# The window must be partitioned by BOTH country and date: partitioning by
# country alone ranks across all days and does not answer the question.
# user_id is a secondary sort key so row_number() is deterministic on ties.
window_spec = Window.partitionBy("country", "transaction_date").orderBy(
    F.col("sum").desc(),
    F.col("user_id").asc(),
)

# Keep the three best-ranked users per group and expose exactly the columns
# the task asks for: country, transaction_date, user_id,
# total_user_revenue, rank.
(
    df_agg.withColumn("rank", F.row_number().over(window_spec))
    .filter(F.col("rank") <= 3)
    .select(
        "country",
        "transaction_date",
        "user_id",
        F.col("sum").alias("total_user_revenue"),
        "rank",
    )
    .orderBy("country", "transaction_date", "rank")
    .display()
)