In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, lag, when, row_number
from pyspark.sql.window import Window

In [2]:
# Initialize Spark session
spark=SparkSession.builder.appName("ReadmissionRiskPrediction").getOrCreate()

In [3]:
# Load the ADMISSIONS.csv file into a DataFrame
df=spark.read.format("csv") \
    .option("header",True) \
    .option("inferSchema",True) \
    .load("hdfs://namenode:9000/data/ADMISSIONS.csv")

### Readmission Risk Prediction

#### Prepare the Data

In [4]:
# Convert ADMITTIME and DISCHTIME to timestamps
df=df.withColumn("ADMITTIME",col("ADMITTIME").cast("timestamp")) \
     .withColumn("DISCHTIME", col("DISCHTIME").cast("timestamp"))

# Calculate length of stay in days
df=df.withColumn("LENGTH_OF_STAY",datediff(col("DISCHTIME"),col("ADMITTIME")))

# Calculate number of previous admissions
window_spec=Window.partitionBy("SUBJECT_ID").orderBy("ADMITTIME")
df=df.withColumn("NUM_PREV_ADMISSIONS",row_number().over(window_spec)-1)

# Create the READMISSION label (1 if readmitted within 30 days, 0 otherwise)
df=df.withColumn("READMISSION",when(datediff(lag("ADMITTIME").over(window_spec),col("DISCHTIME"))<=30,1).otherwise(0))

# Drop rows with null values (e.g., first admission for each patient)
df = df.na.drop()
# Show the prepared data
df.select("SUBJECT_ID", "HADM_ID", "LENGTH_OF_STAY", "NUM_PREV_ADMISSIONS", "READMISSION").show()

+----------+-------+--------------+-------------------+-----------+
|SUBJECT_ID|HADM_ID|LENGTH_OF_STAY|NUM_PREV_ADMISSIONS|READMISSION|
+----------+-------+--------------+-------------------+-----------+
|     10002|  20002|             5|                  0|          0|
+----------+-------+--------------+-------------------+-----------+



In [7]:
from pyspark.sql.functions import col
# Count the number of readmissions and non-readmissions
readmission_counts=df.groupBy("READMISSION").count()
readmission_counts.show()

+-----------+-----+
|READMISSION|count|
+-----------+-----+
|          0|    1|
+-----------+-----+



### Feature Engineering

In [8]:
from pyspark.ml.feature import VectorAssembler
# Define the feature columns
feature_columns=["LENGTH_OF_STAY", "NUM_PREV_ADMISSIONS"]
# Assemble features into a vector
assembler=VectorAssembler(inputCols=feature_columns,outputCol="features")
df=assembler.transform(df)

# Show the DataFrame with features
df.select("features", "READMISSION").show(truncate=False)

+---------+-----------+
|features |READMISSION|
+---------+-----------+
|[5.0,0.0]|0          |
+---------+-----------+

