In [2]:
from pyspark.sql import SparkSession, functions as F, Window

In [3]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("DailyCodingProblem")
    .getOrCreate()
)

# 📝 Problem 1: PySpark – Find Top N Products per Category

### **Problem Statement**

You have a PySpark DataFrame containing product sales. Each product belongs to a category, and you need to find the **top 2 products by sales amount within each category**.

### **Sample Input** (`products`)

| category    | product | sales |
| ----------- | ------- | ----- |
| Electronics | Laptop  | 1200  |
| Electronics | Phone   | 900   |
| Electronics | Tablet  | 700   |
| Clothing    | Shirt   | 400   |
| Clothing    | Jeans   | 600   |
| Clothing    | Jacket  | 800   |

### **Expected Output**

| category    | product | sales |
| ----------- | ------- | ----- |
| Electronics | Laptop  | 1200  |
| Electronics | Phone   | 900   |
| Clothing    | Jacket  | 800   |
| Clothing    | Jeans   | 600   |

---

In [4]:
# Sample sales records: one dict per product, tagged with its category.
data = [
    dict(category="Electronics", product="Laptop", sales=1200),
    dict(category="Electronics", product="Phone", sales=900),
    dict(category="Electronics", product="Tablet", sales=700),
    dict(category="Clothing", product="Shirt", sales=400),
    dict(category="Clothing", product="Jeans", sales=600),
    dict(category="Clothing", product="Jacket", sales=800),
]

# No explicit schema is passed, so Spark infers the columns from the records.
df = spark.createDataFrame(data)
df.show()

+-----------+-------+-----+
|   category|product|sales|
+-----------+-------+-----+
|Electronics| Laptop| 1200|
|Electronics|  Phone|  900|
|Electronics| Tablet|  700|
|   Clothing|  Shirt|  400|
|   Clothing|  Jeans|  600|
|   Clothing| Jacket|  800|
+-----------+-------+-----+



In [8]:
# Window used to rank products inside each category from highest to lowest sales.
# `product` acts as a secondary sort key so that rows with equal sales are ordered
# deterministically; without it, row_number() may break ties differently per run.
w = Window.partitionBy("category").orderBy(
    F.col("sales").desc(),
    F.col("product").asc(),
)

In [9]:
# Sequential position of each row within its window partition (1 = first row
# according to the ordering defined on `w`).
rank_in_category = F.row_number().over(w)

df = df.withColumn("id", rank_in_category)


df.show()

+-----------+-------+-----+---+
|   category|product|sales| id|
+-----------+-------+-----+---+
|   Clothing| Jacket|  800|  1|
|   Clothing|  Jeans|  600|  2|
|   Clothing|  Shirt|  400|  3|
|Electronics| Laptop| 1200|  1|
|Electronics|  Phone|  900|  2|
|Electronics| Tablet|  700|  3|
+-----------+-------+-----+---+



In [11]:
# Number of best-selling products to keep per category.
TOP_N = 2

# Store the result under its own name instead of overwriting `df`: the original
# reassignment dropped the "id" column from `df`, so running this cell a second
# time failed because the filter could no longer find "id".
top_products = (
    df.filter(F.col("id") <= TOP_N)   # keep only the first TOP_N ranked rows per category
    .drop("id")                       # the rank column is a helper, not part of the result
    .orderBy(F.col("sales").desc())   # present the result from highest to lowest sales
)

top_products.show()

+-----------+-------+-----+
|   category|product|sales|
+-----------+-------+-----+
|Electronics| Laptop| 1200|
|Electronics|  Phone|  900|
|   Clothing| Jacket|  800|
|   Clothing|  Jeans|  600|
+-----------+-------+-----+



# 📝 Problem 2: SQL – Calculate Running Balance

### **Problem Statement**

You are given a SQL table `transactions(user_id, txn_date, amount)` where **amount** can be positive (credit) or negative (debit). Write a SQL query to calculate the **running balance** for each user ordered by `txn_date`.

### **Sample Input** (`transactions`)

| user\_id | txn\_date  | amount |
| -------- | ---------- | ------ |
| 1        | 2025-01-01 | 500    |
| 1        | 2025-01-03 | -200   |
| 1        | 2025-01-05 | 300    |
| 2        | 2025-01-02 | 1000   |
| 2        | 2025-01-04 | -400   |

### **Expected Output**

| user\_id | txn\_date  | amount | running\_balance |
| -------- | ---------- | ------ | ---------------- |
| 1        | 2025-01-01 | 500    | 500              |
| 1        | 2025-01-03 | -200   | 300              |
| 1        | 2025-01-05 | 300    | 600              |
| 2        | 2025-01-02 | 1000   | 1000             |
| 2        | 2025-01-04 | -400   | 600              |

---

In [12]:
# Sample transactions: positive amounts are credits, negative amounts are debits.
data = [
    dict(user_id=1, txn_date="2025-01-01", amount=500),
    dict(user_id=1, txn_date="2025-01-03", amount=-200),
    dict(user_id=1, txn_date="2025-01-05", amount=300),
    dict(user_id=2, txn_date="2025-01-02", amount=1000),
    dict(user_id=2, txn_date="2025-01-04", amount=-400),
]

# Build the DataFrame and convert the string `txn_date` column to a date column
# in one chained expression.
df = spark.createDataFrame(data).withColumn("txn_date", F.to_date("txn_date"))


df.show()

+------+----------+-------+
|amount|  txn_date|user_id|
+------+----------+-------+
|   500|2025-01-01|      1|
|  -200|2025-01-03|      1|
|   300|2025-01-05|      1|
|  1000|2025-01-02|      2|
|  -400|2025-01-04|      2|
+------+----------+-------+



In [13]:
# Print the DataFrame's schema to check the column types after the `to_date` conversion.
df.printSchema()

root
 |-- amount: long (nullable = true)
 |-- txn_date: date (nullable = true)
 |-- user_id: long (nullable = true)



In [14]:
# One partition per user, with that user's transactions in ascending date order.
w = Window.partitionBy(F.col("user_id")).orderBy(F.col("txn_date").asc())

In [16]:
# Frame covering every row from the start of the partition up to and including
# the current row, so the sum accumulates one transaction at a time.
cumulative_frame = w.rowsBetween(Window.unboundedPreceding, Window.currentRow)

df = df.withColumn("running_balance", F.sum("amount").over(cumulative_frame))

df.show()

+------+----------+-------+---------------+
|amount|  txn_date|user_id|running_balance|
+------+----------+-------+---------------+
|   500|2025-01-01|      1|            500|
|  -200|2025-01-03|      1|            300|
|   300|2025-01-05|      1|            600|
|  1000|2025-01-02|      2|           1000|
|  -400|2025-01-04|      2|            600|
+------+----------+-------+---------------+

