In [1]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.window import Window

# Start (or reuse) the Spark session used throughout this notebook
spark = SparkSession.builder.appName("CovidData").getOrCreate()

# Column layout of the table; every column is declared nullable
schema = StructType(
    [
        StructField("city", StringType(), True),
        StructField("days", DateType(), True),
        StructField("cases", IntegerType(), True),
    ]
)

# Raw observations as (city, ISO date string, case count), one row per line
data = [
    ("DELHI", "2022-01-01", 100),
    ("DELHI", "2022-01-02", 200),
    ("DELHI", "2022-01-03", 300),
    ("MUMBAI", "2022-01-01", 100),
    ("MUMBAI", "2022-01-02", 100),
    ("MUMBAI", "2022-01-03", 300),
    ("CHENNAI", "2022-01-01", 100),
    ("CHENNAI", "2022-01-02", 200),
    ("CHENNAI", "2022-01-03", 150),
    ("BANGALORE", "2022-01-01", 100),
    ("BANGALORE", "2022-01-02", 300),
    ("BANGALORE", "2022-01-03", 200),
    ("BANGALORE", "2022-01-04", 400),
]

# Parse each date string into a datetime.date so it matches the DateType column
formatted_data = [
    (city_name, datetime.strptime(day_text, "%Y-%m-%d").date(), case_count)
    for city_name, day_text, case_count in data
]

# Build the DataFrame against the explicit schema
covid_df = spark.createDataFrame(formatted_data, schema=schema)

# Display the rows
covid_df.show()

# Register the DataFrame as the temp view "covid" so it can be queried with spark.sql
covid_df.createOrReplaceTempView("covid")



+---------+----------+-----+
|     city|      days|cases|
+---------+----------+-----+
|    DELHI|2022-01-01|  100|
|    DELHI|2022-01-02|  200|
|    DELHI|2022-01-03|  300|
|   MUMBAI|2022-01-01|  100|
|   MUMBAI|2022-01-02|  100|
|   MUMBAI|2022-01-03|  300|
|  CHENNAI|2022-01-01|  100|
|  CHENNAI|2022-01-02|  200|
|  CHENNAI|2022-01-03|  150|
|BANGALORE|2022-01-01|  100|
|BANGALORE|2022-01-02|  300|
|BANGALORE|2022-01-03|  200|
|BANGALORE|2022-01-04|  400|
+---------+----------+-----+



In [12]:
# Cities whose case count rose strictly on every successive day.
#
# For each city, rows are ranked once by date and once by case count, and the
# two ranks are subtracted. Assuming one row per city per day, the difference
# is the same on every row only when both orderings agree, i.e. when cases are
# strictly increasing over time. A tie or a drop in cases yields more than one
# distinct difference, and the HAVING clause filters that city out.
#
# The CTE projects only the columns the outer query reads and carries no
# ORDER BY: row order inside the CTE cannot affect the grouped result.
# Identifiers are written in lowercase to match the view's schema.
spark.sql(
    """
        with ranked as (
            select
                city,
                rank() over (partition by city order by days)
                    - rank() over (partition by city order by cases) as diff
            from covid
        )

        select city
        from ranked
        group by city
        having count(distinct diff) = 1
    """
).show()

+-----+
| city|
+-----+
|DELHI|
+-----+



In [15]:
from pyspark.sql.functions import *

In [24]:
# DataFrame-API version of the "strictly increasing cases" check.
#
# Each city's rows are ranked by date and by case count; a city is kept when
# the difference between the two ranks takes exactly one distinct value.
# Spark functions are referenced through the `F` namespace so the cell does not
# depend on names injected by a wildcard import.

# Per-city orderings: chronological, and by case count
window_days = Window.partitionBy("City").orderBy("days")
window_cases = Window.partitionBy("City").orderBy("cases")

(
    covid_df
    # Rank difference between the date ordering and the case-count ordering
    .withColumn("diff", F.rank().over(window_days) - F.rank().over(window_cases))
    .groupBy(F.col("City"))
    .agg(F.count_distinct(F.col("diff")).alias("cnt"))
    # Keep cities where every row has the same rank difference
    .filter(F.col("cnt") == 1)
    .show()
)



+-----+---+
| City|cnt|
+-----+---+
|DELHI|  1|
+-----+---+

