In [3]:
!pip install pyspark

Collecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m996.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0mm
[?25hInstalling collected packages: py4j
Successfully installed py4j-0.10.9.7


In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("Spark_Batch_Processing")
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook displays the session object created above.
spark

### Load data into an RDD and filter

In [3]:
# Read ADMISSIONS.csv from HDFS as an RDD of raw text lines.
admissions_path = "hdfs://namenode:9000/data/ADMISSIONS.csv"
rdd = spark.sparkContext.textFile(admissions_path)

In [10]:
# The first line of the file is the header row; keep it so it can be
# excluded from the data rows later.
header = rdd.first()

In [17]:
import csv  # stdlib; imported here so this cell is self-contained


def parse_csv_line(line):
    """Split one line of CSV text into a list of column strings.

    Parsing goes through the csv module so that a quoted field containing a
    comma stays in a single column and its surrounding quotes are removed.
    A plain ``line.split(",")`` would break such a field apart and shift every
    later column index.

    Args:
        line: One text line of the CSV file, without the trailing newline.

    Returns:
        The list of column values for that line, all as strings.
    """
    return next(csv.reader([line]))


# Filter out the header and split rows into columns
rows_rdd = rdd.filter(lambda line: line != header).map(parse_csv_line)

# Verify the structure of the RDD
print("First row of the RDD (as a list of columns):")
first_row = rows_rdd.first()
print(first_row)

First row of the RDD (as a list of columns):
['1', '10001', '20001', '2021-01-01 08:00:00', '2021-01-10 12:00:00', '', 'EMERGENCY', 'EMERGENCY ROOM', 'HOME', 'Medicare', 'ENGL', 'CATHOLIC', 'MARRIED', 'WHITE', '2021-01-01 07:30:00', '2021-01-01 08:30:00', 'PNEUMONIA', '0', '1']


In [18]:
# List every column position next to its value in the first data row,
# so the index of each field can be read off directly.
print("Column indices and values:")
for position in range(len(first_row)):
    print(f"Column {position}: {first_row[position]}")

Column indices and values:
Column 0: 1
Column 1: 10001
Column 2: 20001
Column 3: 2021-01-01 08:00:00
Column 4: 2021-01-10 12:00:00
Column 5: 
Column 6: EMERGENCY
Column 7: EMERGENCY ROOM
Column 8: HOME
Column 9: Medicare
Column 10: ENGL
Column 11: CATHOLIC
Column 12: MARRIED
Column 13: WHITE
Column 14: 2021-01-01 07:30:00
Column 15: 2021-01-01 08:30:00
Column 16: PNEUMONIA
Column 17: 0
Column 18: 1


In [21]:
def is_emergency(columns):
    """Return True when the row's admission type (column index 6) is "EMERGENCY"."""
    return columns[6] == "EMERGENCY"


# Keep only the emergency admissions.
filtered_rdd = rows_rdd.filter(is_emergency)

In [22]:
# Bring at most five filtered rows back to the driver and print them.
print("Filtered rows:")
sample_rows = filtered_rdd.take(5)
for emergency_row in sample_rows:
    print(emergency_row)

Filtered rows:
['1', '10001', '20001', '2021-01-01 08:00:00', '2021-01-10 12:00:00', '', 'EMERGENCY', 'EMERGENCY ROOM', 'HOME', 'Medicare', 'ENGL', 'CATHOLIC', 'MARRIED', 'WHITE', '2021-01-01 07:30:00', '2021-01-01 08:30:00', 'PNEUMONIA', '0', '1']


### Load data and filter using Spark DataFrames

In [23]:
# Read the same CSV as a DataFrame: the first line supplies the column names
# and Spark infers each column's type from the data.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("hdfs://namenode:9000/data/ADMISSIONS.csv")
)

In [24]:
# Print the DataFrame's rows as a text table.
df.show()

+------+----------+-------+-------------------+-------------------+-------------------+--------------+--------------------+--------------------+---------+--------+-----------------+--------------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+
|ROW_ID|SUBJECT_ID|HADM_ID|          ADMITTIME|          DISCHTIME|          DEATHTIME|ADMISSION_TYPE|  ADMISSION_LOCATION|  DISCHARGE_LOCATION|INSURANCE|LANGUAGE|         RELIGION|MARITAL_STATUS|           ETHNICITY|          EDREGTIME|          EDOUTTIME|   DIAGNOSIS|HOSPITAL_EXPIRE_FLAG|HAS_CHARTEVENTS_DATA|
+------+----------+-------+-------------------+-------------------+-------------------+--------------+--------------------+--------------------+---------+--------+-----------------+--------------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+
|     1|     10001|  20001|2021-01-01 08:00:00|2021-01-10 12:

In [25]:
# Bare expression: displays the DataFrame's repr, i.e. each column name with its inferred type.
df

DataFrame[ROW_ID: int, SUBJECT_ID: int, HADM_ID: int, ADMITTIME: timestamp, DISCHTIME: timestamp, DEATHTIME: timestamp, ADMISSION_TYPE: string, ADMISSION_LOCATION: string, DISCHARGE_LOCATION: string, INSURANCE: string, LANGUAGE: string, RELIGION: string, MARITAL_STATUS: string, ETHNICITY: string, EDREGTIME: timestamp, EDOUTTIME: timestamp, DIAGNOSIS: string, HOSPITAL_EXPIRE_FLAG: int, HAS_CHARTEVENTS_DATA: int]

In [26]:
# List the distinct admission types present in the data.
admission_types = df.select("admission_type").distinct()
admission_types.show()

+--------------+
|admission_type|
+--------------+
|      ELECTIVE|
|     EMERGENCY|
|        URGENT|
+--------------+



In [27]:
# Keep only the elective admissions and preview them.
elective_patients = df.where(df["admission_type"] == "ELECTIVE")
elective_patients.show()

+------+----------+-------+-------------------+-------------------+---------+--------------+--------------------+--------------------+---------+--------+------------+--------------+---------------+-------------------+-------------------+------------+--------------------+--------------------+
|ROW_ID|SUBJECT_ID|HADM_ID|          ADMITTIME|          DISCHTIME|DEATHTIME|ADMISSION_TYPE|  ADMISSION_LOCATION|  DISCHARGE_LOCATION|INSURANCE|LANGUAGE|    RELIGION|MARITAL_STATUS|      ETHNICITY|          EDREGTIME|          EDOUTTIME|   DIAGNOSIS|HOSPITAL_EXPIRE_FLAG|HAS_CHARTEVENTS_DATA|
+------+----------+-------+-------------------+-------------------+---------+--------------+--------------------+--------------------+---------+--------+------------+--------------+---------------+-------------------+-------------------+------------+--------------------+--------------------+
|     3|     10003|  20003|2022-06-15 10:00:00|2022-06-20 09:00:00|     NULL|      ELECTIVE|TRANSFER FROM HOS...|REHAB/DI

In [28]:
# Keep only the emergency admissions and preview up to ten of them.
emergency_patients = df.where(df["admission_type"] == "EMERGENCY")
emergency_patients.show(10)

+------+----------+-------+-------------------+-------------------+---------+--------------+------------------+------------------+---------+--------+--------+--------------+---------+-------------------+-------------------+---------+--------------------+--------------------+
|ROW_ID|SUBJECT_ID|HADM_ID|          ADMITTIME|          DISCHTIME|DEATHTIME|ADMISSION_TYPE|ADMISSION_LOCATION|DISCHARGE_LOCATION|INSURANCE|LANGUAGE|RELIGION|MARITAL_STATUS|ETHNICITY|          EDREGTIME|          EDOUTTIME|DIAGNOSIS|HOSPITAL_EXPIRE_FLAG|HAS_CHARTEVENTS_DATA|
+------+----------+-------+-------------------+-------------------+---------+--------------+------------------+------------------+---------+--------+--------+--------------+---------+-------------------+-------------------+---------+--------------------+--------------------+
|     1|     10001|  20001|2021-01-01 08:00:00|2021-01-10 12:00:00|     NULL|     EMERGENCY|    EMERGENCY ROOM|              HOME| Medicare|    ENGL|CATHOLIC|       MARRIED