In [50]:
from pyspark.sql import SparkSession, functions as F, Window as W
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType

In [51]:
# Reuse the active Spark session if one exists; otherwise start a new one
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("DailyCodingProblem-22-08-2025")
    .getOrCreate()
)

# 📝 Problem 1: PySpark – Detect Outlier Transactions per Day

### **Problem Statement**

You have a PySpark DataFrame of transactions, each with a date and an amount. For each day, identify the **transactions whose amount is strictly greater than the average amount for that day** (outliers).

### **Sample Input** (`transactions`)

| txn\_date  | txn\_id | amount |
| ---------- | ------- | ------ |
| 2025-01-01 | T1      | 100    |
| 2025-01-01 | T2      | 200    |
| 2025-01-01 | T3      | 500    |
| 2025-01-02 | T4      | 300    |
| 2025-01-02 | T5      | 400    |
| 2025-01-02 | T6      | 600    |

### **Expected Output**

| txn\_date  | txn\_id | amount |
| ---------- | ------- | ------ |
| 2025-01-01 | T3      | 500    |
| 2025-01-02 | T6      | 600    |

---

In [52]:
# Input schema: the date arrives as text and is cast to a real date below.
schema = (
    StructType()
    .add("txn_date", StringType(), True)
    .add("txn_id", StringType(), True)
    .add("amount", IntegerType(), True)
)

# Sample rows from the problem statement: (txn_date, txn_id, amount).
data = [
    ("2025-01-01", "T1", 100),
    ("2025-01-01", "T2", 200),
    ("2025-01-01", "T3", 500),
    ("2025-01-02", "T4", 300),
    ("2025-01-02", "T5", 400),
    ("2025-01-02", "T6", 600),
]

# Build the frame and convert the textual txn_date column into a date column
# in a single chained expression.
df = (
    spark.createDataFrame(data, schema=schema)
    .withColumn("txn_date", F.to_date("txn_date"))
)
df.show()

+----------+------+------+
|  txn_date|txn_id|amount|
+----------+------+------+
|2025-01-01|    T1|   100|
|2025-01-01|    T2|   200|
|2025-01-01|    T3|   500|
|2025-01-02|    T4|   300|
|2025-01-02|    T5|   400|
|2025-01-02|    T6|   600|
+----------+------+------+



In [53]:
# Per-day window used to compute each day's average transaction amount.
# No orderBy is needed: avg() does not depend on row order, and a window
# without ordering already uses the whole partition (unbounded preceding to
# unbounded following) as its frame. Omitting the ordering avoids sorting
# every partition for no benefit.
w = W.partitionBy("txn_date")

In [54]:
# Average amount of the day each row belongs to, computed over the window `w`.
daily_avg = F.avg("amount").over(w)

# Keep only transactions strictly above their day's average. The average is
# materialised as a helper column first (window expressions cannot be used
# directly inside a filter) and dropped again afterwards.
# NOTE: this rebinds `df` to the filtered result, so re-running this cell
# without re-running the cell that builds `df` filters the already-filtered
# rows again.
df = (
    df.withColumn("avg_amount", daily_avg)
    .where(F.col("amount") > F.col("avg_amount"))
    .drop("avg_amount")
)

df.show()

+----------+------+------+
|  txn_date|txn_id|amount|
+----------+------+------+
|2025-01-01|    T3|   500|
|2025-01-02|    T6|   600|
+----------+------+------+

