In [11]:
from pyspark.sql import SparkSession, functions as F, Window

In [4]:
# Obtain the notebook's Spark session: getOrCreate() returns the already-running
# session if there is one, otherwise it starts a new one with this app name.
spark = (
    SparkSession.builder
    .appName("DailyCodingProblem")
    .getOrCreate()
)

# 📝 Problem 1: PySpark – Detect Missing Dates in a Time Series

### **Problem Statement**

You are given a PySpark DataFrame containing daily sales records, but some dates are missing. Write a PySpark program to **identify all the missing dates** between the earliest and the latest date present in the data (inclusive).

### **Sample Input** (DataFrame `sales`)

| date       | sales |
| ---------- | ----- |
| 2025-01-01 | 100   |
| 2025-01-02 | 150   |
| 2025-01-04 | 120   |
| 2025-01-06 | 200   |

### **Expected Output**

| missing\_date |
| ------------- |
| 2025-01-03    |
| 2025-01-05    |

---


In [28]:
# Sample daily sales records; 2025-01-03 and 2025-01-05 are deliberately absent.
data = [
    {"date": "2025-01-01", "sales": 100},
    {"date": "2025-01-02", "sales": 150},
    {"date": "2025-01-04", "sales": 120},
    {"date": "2025-01-06", "sales": 200},
]

df = spark.createDataFrame(data)

# Parse the string column into a real date so that min/max and the join below
# compare calendar dates rather than text.
df = df.withColumn("actual_date", F.to_date(F.col("date")))

date_bounds = df.select(
    F.min("actual_date").alias("min_date"),
    F.max("actual_date").alias("max_date"),
)

# Fetch the single bounds row once: each .first() call is a Spark action that
# re-runs the aggregation, so reading both bounds from one row avoids a
# redundant job.
bounds_row = date_bounds.first()
first_date = bounds_row["min_date"]
last_date = bounds_row["max_date"]

# Build every calendar day from first_date to last_date (inclusive) as an array
# on a one-row frame, then explode it into one row per day.
date_sequence_df = spark.range(1).select(
    F.explode(
        F.sequence(F.lit(first_date), F.lit(last_date), F.expr("INTERVAL 1 DAY"))
    ).alias("full_range")
)

# A left_anti join keeps only the calendar days that have no matching sales row,
# i.e. exactly the missing dates.
missing_dates_df = (
    date_sequence_df
    .join(df, date_sequence_df.full_range == df.actual_date, "left_anti")
    .withColumnRenamed("full_range", "missing_dates")
    .orderBy(F.asc(F.col("missing_dates")))
)

missing_dates_df.show()

+-------------+
|missing_dates|
+-------------+
|   2025-01-03|
|   2025-01-05|
+-------------+



# 📝 Problem 2: SQL – Find the Longest Streak of Active Days per User

### **Problem Statement**

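You are given a SQL table `user_logins(user_id, login_date)`. Write a SQL query to find the **longest streak of consecutive login days** for each user. (The solution below expresses the same logic with the PySpark DataFrame API instead of a SQL string.)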

### **Sample Input** (`user_logins`)

| user\_id | login\_date |
| -------- | ----------- |
| 1        | 2025-01-01  |
| 1        | 2025-01-02  |
| 1        | 2025-01-04  |
| 2        | 2025-01-01  |
| 2        | 2025-01-02  |
| 2        | 2025-01-03  |
| 2        | 2025-01-05  |

### **Expected Output**

| user\_id | longest\_streak |
| -------- | --------------- |
| 1        | 2               |
| 2        | 3               |

---

In [23]:
# Sample logins as (user_id, login_date) pairs; expanded into dict rows below so
# that Spark infers the schema from the same keys as before.
login_pairs = [
    (1, "2025-01-01"),
    (1, "2025-01-02"),
    (1, "2025-01-04"),
    (2, "2025-01-01"),
    (2, "2025-01-02"),
    (2, "2025-01-03"),
    (2, "2025-01-05"),
]
data = [{"user_id": uid, "login_date": day} for uid, day in login_pairs]

# Create the frame and convert login_date from string to date in one chain.
df = spark.createDataFrame(data).withColumn("login_date", F.to_date("login_date"))



In [24]:
# Sanity check: login_date should now be a date column (it was created from
# strings and converted with to_date), since the date arithmetic below needs it.
df.printSchema()

root
 |-- login_date: date (nullable = true)
 |-- user_id: long (nullable = true)



In [25]:
# Per-user window in chronological order, so lag() over it yields each user's
# previous login date.
w = Window.partitionBy(F.col("user_id")).orderBy(F.col("login_date").asc())

In [26]:
# Longest run of consecutive login days per user ("gaps and islands").
#
# Simply counting the rows whose gap flag is 0 does not measure a streak: it
# counts every login that did not directly follow a break, across ALL of a
# user's streaks. Logins on days 1, 2, 4, 5 would yield 3 although the longest
# streak is 2. Instead, each streak gets its own id, streak sizes are counted,
# and the maximum per user is taken.
#
# The result is stored under a new name (df is left untouched) so this cell can
# be re-run without first re-running the cell that builds df.

# Keep one row per user per day: a repeated login on the same day has a
# datediff of 0 and would otherwise be counted as an extra streak day.
daily_logins = df.select("user_id", "login_date").dropDuplicates()

streaks = (
    daily_logins
    .withColumn("prev_date", F.lag("login_date").over(w))
    # gap = 1 marks the first login after a break of more than one day. A user's
    # first login has a null prev_date, so the comparison is null and it falls
    # through to otherwise(0).
    .withColumn("gap", F.when(F.datediff("login_date", F.col("prev_date")) > 1, 1).otherwise(0))
    # Running total of breaks seen so far: constant within a streak and
    # incremented at each break, so it serves as a per-user streak id.
    .withColumn("streak_id", F.sum("gap").over(w))
)

# Number of login days in each individual streak.
streak_lengths = streaks.groupBy("user_id", "streak_id").agg(
    F.count("*").alias("streak_length")
)

# Best streak per user; ordered so the displayed rows are deterministic.
longest_streak_df = (
    streak_lengths
    .groupBy("user_id")
    .agg(F.max("streak_length").alias("longest_streak"))
    .orderBy("user_id")
)

longest_streak_df.show()

+-------+--------------+
|user_id|longest_streak|
+-------+--------------+
|      1|             2|
|      2|             3|
+-------+--------------+

