In [0]:
%sql

-- Preview every row and column of the silver-layer primary cancer condition table.
SELECT *
FROM rwd.silver.primary_cancer_condition;

Patient_ID,Primary_Cancer_Condition,Diagnosis_Date,Hospital
P005,Hepatocellular Carcinoma,2024-07-14,Narayana Health
P008,Pancreatic Cancer,2024-03-18,Manipal Hospitals
P001,Breast Carcinoma,2024-08-12,Apollo Hospitals
P009,Gastric Cancer,2024-04-10,Sankara Nethralaya
P003,Colorectal Cancer,2024-05-19,Fortis Chennai
P002,Lung Adenocarcinoma,2023-11-03,AIIMS Delhi
P006,Ovarian Cancer,2023-12-01,Max Healthcare
P010,Bladder Cancer,2023-10-05,Tata Memorial
P004,Prostate Cancer,2023-09-25,CMC Vellore
P007,Melanoma,2024-06-22,KIMS Hyderabad


In [0]:

from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:

# Load the silver-layer primary cancer condition table as a Spark DataFrame.
df = spark.table("rwd.silver.primary_cancer_condition")

In [0]:
# Maximum gap, in days, between two consecutive diagnoses of the same patient
# for the later one to be flagged as a readmission.
READMISSION_WINDOW_DAYS = 30

# Per-patient window ordered chronologically, so lag() returns the patient's
# immediately preceding diagnosis date. The partition key uses the table's own
# column casing ("Patient_ID") so it still resolves when
# spark.sql.caseSensitive is enabled.
w = (
    Window
        .partitionBy("Patient_ID")
        .orderBy("Diagnosis_Date")
)

df_with_prev = (
    # previous_diagnosis is NULL on each patient's earliest row.
    df.withColumn("previous_diagnosis", lag("Diagnosis_Date").over(w))
      # Day gap to the previous diagnosis; NULL when there is no previous row.
      .withColumn(
          "days_since_last_visit",
          datediff("Diagnosis_Date", "previous_diagnosis")
      )
      # A NULL gap makes the comparison NULL, which falls through to
      # otherwise(0), so a patient's first diagnosis is never flagged.
      # NOTE(review): two rows on the same date for one patient give a 0-day
      # gap and are flagged as a readmission - confirm this is intended.
      .withColumn(
          "is_readmission_30d",
          when(
              col("days_since_last_visit") <= READMISSION_WINDOW_DAYS, 1
          ).otherwise(0)
      )
)


In [0]:
# Render the result: the source columns plus previous_diagnosis,
# days_since_last_visit and is_readmission_30d.
display(df_with_prev)

Patient_ID,Primary_Cancer_Condition,Diagnosis_Date,Hospital,previous_diagnosis,days_since_last_visit,is_readmission_30d
P001,Breast Carcinoma,2024-08-12,Apollo Hospitals,,,0
P002,Lung Adenocarcinoma,2023-11-03,AIIMS Delhi,,,0
P003,Colorectal Cancer,2024-05-19,Fortis Chennai,,,0
P004,Prostate Cancer,2023-09-25,CMC Vellore,,,0
P005,Hepatocellular Carcinoma,2024-07-14,Narayana Health,,,0
P006,Ovarian Cancer,2023-12-01,Max Healthcare,,,0
P007,Melanoma,2024-06-22,KIMS Hyderabad,,,0
P008,Pancreatic Cancer,2024-03-18,Manipal Hospitals,,,0
P009,Gastric Cancer,2024-04-10,Sankara Nethralaya,,,0
P010,Bladder Cancer,2023-10-05,Tata Memorial,,,0
