In [47]:
from pyspark.sql import SparkSession, functions as F, Window as W
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType

In [48]:
spark = SparkSession.builder.appName("DailyCodingProblem-21-08-2025").getOrCreate()

# 📝 Problem 1: PySpark – Identify First Purchase per Customer

### **Problem Statement**

You have a PySpark DataFrame containing customer purchase data. Each row represents a purchase. Write a PySpark program to **find the first purchase date for each customer** and the amount spent on that date.

### **Sample Input** (`purchases`)

| customer\_id | purchase\_date | amount |
| ------------ | -------------- | ------ |
| 101          | 2025-01-03     | 250    |
| 101          | 2025-01-05     | 300    |
| 102          | 2025-01-01     | 150    |
| 102          | 2025-01-02     | 200    |
| 103          | 2025-01-04     | 500    |

### **Expected Output**

| customer\_id | first\_purchase\_date | amount |
| ------------ | --------------------- | ------ |
| 101          | 2025-01-03            | 250    |
| 102          | 2025-01-01            | 150    |
| 103          | 2025-01-04            | 500    |

---

In [49]:
purchases = [
    {"customer_id": 101, "purchase_date": "2025-01-03", "amount": 250},
    {"customer_id": 101, "purchase_date": "2025-01-05", "amount": 300},
    {"customer_id": 102, "purchase_date": "2025-01-01", "amount": 150},
    {"customer_id": 102, "purchase_date": "2025-01-02", "amount": 200},
    {"customer_id": 103, "purchase_date": "2025-01-04", "amount": 500},
]


spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(purchases)
df = df.withColumn(
  'purchase_date',
  F.to_date(F.col('purchase_date'))
)


window = W.partitionBy('customer_id').orderBy('purchase_date')

df = df.withColumn(
  'rn',
  F.row_number().over(window)
)
df = df.filter(
  F.col('rn') == 1
).orderBy(F.col('customer_id')).drop('rn')

df.show()

+------+-----------+-------------+
|amount|customer_id|purchase_date|
+------+-----------+-------------+
|   250|        101|   2025-01-03|
|   150|        102|   2025-01-01|
|   500|        103|   2025-01-04|
+------+-----------+-------------+

