## Spark Session

In [75]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
spark=(SparkSession
        .builder
        .appName("SparkApp")
        .config("spark.jars", "/home/daman/Downloads/postgresql-42.7.5.jar")
        .getOrCreate())


In [76]:
spark.read.csv("/home/daman/Downloads/healthcaredata/healthcare_dataset.csv",header=True).show()

+-------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|               Name|Age|Gender|Blood Type|Medical Condition|Date of Admission|          Doctor|            Hospital|Insurance Provider|    Billing Amount|Room Number|Admission Type|Discharge Date| Medication|Test Results|
+-------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|      Bobby JacksOn| 30|  Male|        B-|           Cancer|       2024-01-31|   Matthew Smith|     Sons and Miller|        Blue Cross|18856.281305978155|        328|        Urgent|    2024-02-02|Paracetamol|      Normal|
|       LesLie TErRy| 62|  Male|        A+|          Obesity|       2019-08-20| Samantha Davies|            

In [77]:
import pandas as pd
df=pd.read_csv("/home/daman/Downloads/healthcaredata/healthcare_dataset.csv")
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce')

df_2019 = df[df['Date of Admission'].dt.year == 2019]
print(df_2019.head())

               Name  Age  Gender Blood Type Medical Condition  \
1      LesLie TErRy   62    Male         A+           Obesity   
12    connOR HANsEn   75  Female         A+          Diabetes   
30  ThOMAS MartInEZ   34    Male         B-            Asthma   
31  JAmES pattERson   23  Female         A+         Arthritis   
36    DEnIse ToRRES   33    Male        AB+          Diabetes   

   Date of Admission            Doctor                   Hospital  \
1         2019-08-20   Samantha Davies                    Kim Inc   
12        2019-12-12  Kenneth Fletcher  Powers Miller, and Flores   
30        2019-08-18       Jacob Huynh                   Hart Ltd   
31        2019-11-03  Kristina Frazier              Cruz-Santiago   
36        2019-10-14       Laura Myers                 LLC Martin   

   Insurance Provider  Billing Amount  Room Number Admission Type  \
1            Medicare    33643.327287          265      Emergency   
12              Cigna    43282.283358          134      

## Breakdown by year

In [78]:

for year in df['Date of Admission'].dt.year.unique():
    df[df['Date of Admission'].dt.year==year].to_csv(f"healthcare_{year}.csv",index=False)

In [79]:
df_2019 = spark.read.csv("healthcare_2019.csv", header=True, inferSchema=True)
df_2019.show()

+--------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|                Name|Age|Gender|Blood Type|Medical Condition|Date of Admission|          Doctor|            Hospital|Insurance Provider|    Billing Amount|Room Number|Admission Type|Discharge Date| Medication|Test Results|
+--------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|        LesLie TErRy| 62|  Male|        A+|          Obesity|       2019-08-20| Samantha Davies|             Kim Inc|          Medicare|33643.327286577885|        265|     Emergency|    2019-08-26|  Ibuprofen|Inconclusive|
|       connOR HANsEn| 75|Female|        A+|         Diabetes|       2019-12-12|Kenneth Fletcher|Powers 

## Reading Incrementally by Year using spark

In [80]:
df=spark.read.csv("/home/daman/Downloads/healthcaredata/healthcare_dataset.csv",header=True)
df.show()

+-------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|               Name|Age|Gender|Blood Type|Medical Condition|Date of Admission|          Doctor|            Hospital|Insurance Provider|    Billing Amount|Room Number|Admission Type|Discharge Date| Medication|Test Results|
+-------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|      Bobby JacksOn| 30|  Male|        B-|           Cancer|       2024-01-31|   Matthew Smith|     Sons and Miller|        Blue Cross|18856.281305978155|        328|        Urgent|    2024-02-02|Paracetamol|      Normal|
|       LesLie TErRy| 62|  Male|        A+|          Obesity|       2019-08-20| Samantha Davies|            

In [104]:
years = [2019,2020,2021,2022,2023,2024]
df_all= None
for year in years:
    df_year=spark.read.csv(f"healthcare_{year}.csv",header=True,inferSchema=True)
    if df_all is None:
        df_all=df_year
    else:
        df_all=df_all.union(df_year)    
df_all = df_all.dropDuplicates()
df_all.count()

                                                                                

54966

## Star Schema

In [82]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [106]:
from pyspark.sql.types import DateType
df_all=df_all.withColumn("Discharge Date",F.col("Discharge Date").cast(DateType()))
df_all = df_all.withColumn("Date of Admission", F.col("Date of Admission").cast(DateType()))

### Dimension Table


In [84]:
df_all.toPandas()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
1,connOR HANsEn,75,Female,A+,Diabetes,2019-12-12,Kenneth Fletcher,"Powers Miller, and Flores",Cigna,43282.283358,134,Emergency,2019-12-28,Penicillin,Abnormal
2,ThOMAS MartInEZ,34,Male,B-,Asthma,2019-08-18,Jacob Huynh,Hart Ltd,Cigna,47909.128810,371,Urgent,2019-09-01,Ibuprofen,Inconclusive
3,JAmES pattERson,23,Female,A+,Arthritis,2019-11-03,Kristina Frazier,Cruz-Santiago,UnitedHealthcare,25835.323595,108,Urgent,2019-11-29,Penicillin,Abnormal
4,DEnIse ToRRES,33,Male,AB+,Diabetes,2019-10-14,Laura Myers,LLC Martin,Aetna,4397.776995,481,Urgent,2019-10-19,Paracetamol,Inconclusive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55495,JeNNIFeR WAng,25,Male,A+,Hypertension,2024-02-18,Bradley Bailey,"Shelton Scott, Castaneda and",Aetna,34377.506884,290,Urgent,2024-03-11,Paracetamol,Inconclusive
55496,ChRIs huGHeS,35,Female,AB-,Obesity,2024-02-28,Katelyn Perry,Lyons-Hansen,Blue Cross,11889.154513,128,Emergency,2024-03-14,Paracetamol,Abnormal
55497,mIsTy RICharDs,78,Female,O-,Hypertension,2024-04-10,Sarah Dyer,Garcia-Wells,Cigna,34150.133741,409,Emergency,2024-04-23,Ibuprofen,Normal
55498,briTtNeY York,43,Female,B-,Obesity,2024-02-02,Richard Matthews,Malone Ltd,Blue Cross,8128.932064,469,Emergency,2024-02-04,Aspirin,Abnormal


In [107]:

patient = df_all.select('Name', 'Age', 'Gender', 'Blood Type').dropDuplicates()
# patient.count()
patient = patient.withColumn("PatientID", F.row_number().over(Window.orderBy(F.lit(1))))

# patient.toPandas().head(50)
# patient.distinct().count()


In [108]:
doctor=df_all.select("Doctor").dropDuplicates()
# doctor.distinct().count()
doctor=doctor.withColumn("DoctorID",F.row_number().over(Window.orderBy(F.lit(1))))

# doctor.show()

In [109]:
hospital=df_all.select('Hospital').dropDuplicates()
hospital=hospital.withColumn('HospitalID',F.row_number().over(Window.orderBy(F.lit(1))))
hospital.show()

25/05/15 09:51:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 0

+--------------------+----------+
|            Hospital|HospitalID|
+--------------------+----------+
|           Smith PLC|         1|
|Holmes Reed and J...|         2|
|Castro and Smith,...|         3|
|and Bailey Ramos,...|         4|
|      Holmes-Griffin|         5|
|Turner, and Wrigh...|         6|
|          Group Park|         7|
|        Calderon LLC|         8|
|and Watts Terry, ...|         9|
|Pineda Werner, Mc...|        10|
|          Fox-Garcia|        11|
|       Dixon-Jenkins|        12|
|     Sons and Cooper|        13|
|Brown, and Weaver...|        14|
|          Obrien PLC|        15|
|     Hernandez-Smith|        16|
|        Aguilar-Bass|        17|
|          Holmes LLC|        18|
|       Franco-Sawyer|        19|
|Stewart, and Will...|        20|
+--------------------+----------+
only showing top 20 rows



In [110]:
insurance=df_all.select('Insurance Provider').dropDuplicates()
insurance=insurance.withColumn('InsuranceID',F.row_number().over(Window.orderBy(F.lit(1))))
insurance.show()

25/05/15 09:51:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------------------+-----------+
|Insurance Provider|InsuranceID|
+------------------+-----------+
|             Aetna|          1|
|        Blue Cross|          2|
|          Medicare|          3|
|             Cigna|          4|
|  UnitedHealthcare|          5|
+------------------+-----------+



25/05/15 09:51:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [111]:
medication=df_all.select('Medication').dropDuplicates()
medication=medication.withColumn('MedicationID',F.row_number().over(Window.orderBy(F.lit(1))))
medication.show()

25/05/15 09:51:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----------+------------+
| Medication|MedicationID|
+-----------+------------+
|  Ibuprofen|           1|
| Penicillin|           2|
|Paracetamol|           3|
|    Aspirin|           4|
|    Lipitor|           5|
+-----------+------------+



25/05/15 09:51:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [112]:
from pyspark.sql.functions import year, month,dayofmonth
date=df_all.select('Date of Admission').dropDuplicates()
date=date.withColumn('DateID',F.row_number().over(Window.orderBy(F.lit(1))))
date=date.withColumn('Year',year(F.col('Date of Admission')))\
         .withColumn('Month',month(F.col('Date of Admission')))\
         .withColumn('Day',dayofmonth(F.col('Date of Admission')))           
date.show()

25/05/15 09:51:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:51:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 0

+-----------------+------+----+-----+---+
|Date of Admission|DateID|Year|Month|Day|
+-----------------+------+----+-----+---+
|       2019-06-04|     1|2019|    6|  4|
|       2019-05-08|     2|2019|    5|  8|
|       2019-11-18|     3|2019|   11| 18|
|       2019-09-22|     4|2019|    9| 22|
|       2019-11-01|     5|2019|   11|  1|
|       2019-11-21|     6|2019|   11| 21|
|       2019-05-27|     7|2019|    5| 27|
|       2019-10-05|     8|2019|   10|  5|
|       2019-07-30|     9|2019|    7| 30|
|       2019-07-28|    10|2019|    7| 28|
|       2019-05-14|    11|2019|    5| 14|
|       2019-07-08|    12|2019|    7|  8|
|       2019-10-24|    13|2019|   10| 24|
|       2019-12-28|    14|2019|   12| 28|
|       2019-06-24|    15|2019|    6| 24|
|       2019-12-10|    16|2019|   12| 10|
|       2019-08-05|    17|2019|    8|  5|
|       2019-12-18|    18|2019|   12| 18|
|       2019-07-19|    19|2019|    7| 19|
|       2019-05-10|    20|2019|    5| 10|
+-----------------+------+----+---

### Fact_Table

In [114]:
fact_table = df_all \
    .join(patient, on=['Name', 'Age', 'Gender', 'Blood Type'], how='right') \
    .join(doctor, on='Doctor', how='right') \
    .join(hospital, on='Hospital', how='right') \
    .join(insurance, on='Insurance Provider', how='right') \
    .join(medication, on='Medication', how='right') \
    .join(date, on='Date of Admission', how='right')
# fact_table = fact_table.dropDuplicates()

# fact_table.count()
fact_admission = fact_table.select(
    "PatientID", "DoctorID", "HospitalID", "InsuranceID", "DateID", "MedicationID",
    F.col("Billing Amount").alias("BillingAmount"),
    F.col("Room Number").alias("RoomNumber"),
    F.col("Admission Type").alias("AdmissionType"),
    F.col("Test Results").alias("TestResult")
)

fact_admission = fact_admission.withColumn("AdmissionID", F.row_number().over( Window.orderBy("PatientID")))
fact_admission.show()

25/05/15 09:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 0

+---------+--------+----------+-----------+------+------------+------------------+----------+-------------+------------+-----------+
|PatientID|DoctorID|HospitalID|InsuranceID|DateID|MedicationID|     BillingAmount|RoomNumber|AdmissionType|  TestResult|AdmissionID|
+---------+--------+----------+-----------+------+------------+------------------+----------+-------------+------------+-----------+
|        1|    3106|      2907|          4|    36|           4|3879.0846018133934|       290|     Elective|Inconclusive|          1|
|        2|     282|      3854|          1|   216|           4|15689.567860525334|       490|       Urgent|    Abnormal|          2|
|        3|    2344|      3182|          3|   207|           3|18832.768495910397|       458|     Elective|Inconclusive|          3|
|        4|    2895|      1416|          1|   127|           3| 24605.20585914686|       242|       Urgent|Inconclusive|          4|
|        5|    2004|      3704|          4|   151|           3| 44630

## Loading in Database


In [115]:
pg_url = "jdbc:postgresql://localhost:5432/healthcare_db"
pg_properties = {
    "user": "admin",
    "password": "admin",
    "driver": "org.postgresql.Driver"
}


In [116]:
# Dimension Tables
patient.write.jdbc(url=pg_url, table="patient", mode="overwrite", properties=pg_properties)
doctor.write.jdbc(url=pg_url, table="doctor", mode="overwrite", properties=pg_properties)
hospital.write.jdbc(url=pg_url, table="hospital", mode="overwrite", properties=pg_properties)
insurance.write.jdbc(url=pg_url, table="insurance", mode="overwrite", properties=pg_properties)
medication.write.jdbc(url=pg_url, table="medication", mode="overwrite", properties=pg_properties)
date.write.jdbc(url=pg_url, table="date", mode="overwrite", properties=pg_properties)

# Fact Table
fact_admission.write.jdbc(url=pg_url, table="fact_admission", mode="overwrite", properties=pg_properties)


25/05/15 09:52:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:52:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 0

In [117]:
fact_admission.groupBy("PatientID").count().orderBy("count", ascending=False).show()


25/05/15 09:53:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:53:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:53:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:53:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:53:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 09:53:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/15 0

+---------+-----+
|PatientID|count|
+---------+-----+
|     1959|    1|
|     7253|    1|
|     5300|    1|
|     2866|    1|
|     5803|    1|
|     2659|    1|
|     1829|    1|
|     4935|    1|
|     5156|    1|
|     3997|    1|
|     2366|    1|
|     3749|    1|
|     4519|    1|
|     1645|    1|
|      833|    1|
|     5518|    1|
|     2142|    1|
|     6658|    1|
|      496|    1|
|     3918|    1|
+---------+-----+
only showing top 20 rows



## Log Script