In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, lag, when, row_number
from pyspark.sql.window import Window

In [2]:
# Initialize Spark session
spark=SparkSession.builder.appName("ReadmissionRiskPrediction").getOrCreate()

In [3]:
# Load the ADMISSIONS.csv file into a DataFrame
df=spark.read.format("csv") \
    .option("header",True) \
    .option("inferSchema",True) \
    .load("hdfs://namenode:9000/data/ADMISSIONS.csv")

### Readmission Risk Prediction

#### Prepare the Data

In [4]:
# Convert ADMITTIME and DISCHTIME to timestamps
df=df.withColumn("ADMITTIME",col("ADMITTIME").cast("timestamp")) \
     .withColumn("DISCHTIME", col("DISCHTIME").cast("timestamp"))

# Calculate length of stay in days
df=df.withColumn("LENGTH_OF_STAY",datediff(col("DISCHTIME"),col("ADMITTIME")))

# Calculate number of previous admissions
window_spec=Window.partitionBy("SUBJECT_ID").orderBy("ADMITTIME")
df=df.withColumn("NUM_PREV_ADMISSIONS",row_number().over(window_spec)-1)

# Create the READMISSION label (1 if readmitted within 30 days, 0 otherwise)
df=df.withColumn("READMISSION",when(datediff(lag("ADMITTIME").over(window_spec),col("DISCHTIME"))<=30,1).otherwise(0))

# Drop rows with null values (e.g., first admission for each patient)
df = df.na.drop()
# Show the prepared data
df.select("SUBJECT_ID", "HADM_ID", "LENGTH_OF_STAY", "NUM_PREV_ADMISSIONS", "READMISSION").show()

+----------+-------+--------------+-------------------+-----------+
|SUBJECT_ID|HADM_ID|LENGTH_OF_STAY|NUM_PREV_ADMISSIONS|READMISSION|
+----------+-------+--------------+-------------------+-----------+
|     10002|  20002|             5|                  0|          0|
+----------+-------+--------------+-------------------+-----------+



In [5]:
from pyspark.sql.functions import col
# Count the number of readmissions and non-readmissions
readmission_counts=df.groupBy("READMISSION").count()
readmission_counts.show()

+-----------+-----+
|READMISSION|count|
+-----------+-----+
|          0|    1|
+-----------+-----+



### Feature Engineering

In [6]:
from pyspark.ml.feature import VectorAssembler
# Define the feature columns
feature_columns=["LENGTH_OF_STAY", "NUM_PREV_ADMISSIONS"]
# Assemble features into a vector
assembler=VectorAssembler(inputCols=feature_columns,outputCol="features")
df=assembler.transform(df)

# Show the DataFrame with features
df.select("features", "READMISSION").show(truncate=False)

+---------+-----------+
|features |READMISSION|
+---------+-----------+
|[5.0,0.0]|0          |
+---------+-----------+



###  Train and Evaluate the Model

In [8]:
# Split the data into training and test sets (70% training, 30% test)
train_data,test_data=df.randomSplit([0.7,0.3],seed=42)

In [9]:
from pyspark.ml.classification import LogisticRegression
# Initialize the Logistic Regression model
lr=LogisticRegression(featuresCol="features",labelCol="READMISSION")

# Train the model
lr_model=lr.fit(train_data)


In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# make predictions on the test set
predictions=lr_model.transform(test_data)

# Initialize the evaluator
evaluator=MulticlassClassificationEvaluator(labelCol="READMISSION", predictionCol="prediction", metricName="accuracy")


# Calculate accuracy
accuracy = evaluator.evaluate(predictions)

# Print the accuracy
print("Accuracy:", accuracy)

Accuracy: nan


In [12]:
#  Calculate precision
precision=evaluator.setMetricName("weightedPrecision").evaluate(predictions)
print("Precision:", precision)

# Calculate recall
recall=evaluator.setMetricName("weightedRecall").evaluate(predictions)
print("Recall:", recall)

# Calculate F1-score
f1_score=evaluator.setMetricName("f1").evaluate(predictions)
print("F1-Score:", f1_score)

Precision: 0.0
Recall: 0.0
F1-Score: 0.0


In [13]:
pred_stats=predictions.groupBy("READMISSION").count()
pred_stats.show()

+-----------+-----+
|READMISSION|count|
+-----------+-----+
+-----------+-----+



In [14]:
pred_stats2 = predictions.groupBy("prediction").count()
pred_stats2.show()

+----------+-----+
|prediction|count|
+----------+-----+
+----------+-----+



In [19]:
# Save the trained Linear Regression model to a file
model_save_path = "hdfs://namenode:9000/models/readmission_risk_prediction_lr_model"
lr_model.save(model_save_path)