## Spark Session

In [2]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
# Create the notebook's Spark session under the app name "SparkApp",
# or reuse a session that is already active in this process.
spark = SparkSession.builder.appName("SparkApp").getOrCreate()


25/05/07 07:19:01 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/05/07 07:19:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/07 07:19:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Preview the raw dataset through Spark, treating the first row as column names.
(
    spark.read
    .option("header", True)
    .csv("/home/daman/Downloads/healthcaredata/healthcare_dataset.csv")
    .show()
)

+-------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|               Name|Age|Gender|Blood Type|Medical Condition|Date of Admission|          Doctor|            Hospital|Insurance Provider|    Billing Amount|Room Number|Admission Type|Discharge Date| Medication|Test Results|
+-------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|      Bobby JacksOn| 30|  Male|        B-|           Cancer|       2024-01-31|   Matthew Smith|     Sons and Miller|        Blue Cross|18856.281305978155|        328|        Urgent|    2024-02-02|Paracetamol|      Normal|
|       LesLie TErRy| 62|  Male|        A+|          Obesity|       2019-08-20| Samantha Davies|            

In [4]:
import pandas as pd
# Load the full dataset into pandas and parse the admission dates in one chain;
# values that cannot be parsed as dates become NaT (errors="coerce").
df = pd.read_csv(
    "/home/daman/Downloads/healthcaredata/healthcare_dataset.csv"
).assign(
    **{
        "Date of Admission": lambda raw: pd.to_datetime(
            raw["Date of Admission"], errors="coerce"
        )
    }
)

# Keep only the admissions dated in 2019 and print the first rows as a preview.
df_2019 = df.loc[df["Date of Admission"].dt.year.eq(2019)]
print(df_2019.head())

               Name  Age  Gender Blood Type Medical Condition  \
1      LesLie TErRy   62    Male         A+           Obesity   
12    connOR HANsEn   75  Female         A+          Diabetes   
30  ThOMAS MartInEZ   34    Male         B-            Asthma   
31  JAmES pattERson   23  Female         A+         Arthritis   
36    DEnIse ToRRES   33    Male        AB+          Diabetes   

   Date of Admission            Doctor                   Hospital  \
1         2019-08-20   Samantha Davies                    Kim Inc   
12        2019-12-12  Kenneth Fletcher  Powers Miller, and Flores   
30        2019-08-18       Jacob Huynh                   Hart Ltd   
31        2019-11-03  Kristina Frazier              Cruz-Santiago   
36        2019-10-14       Laura Myers                 LLC Martin   

   Insurance Provider  Billing Amount  Room Number Admission Type  \
1            Medicare    33643.327287          265      Emergency   
12              Cigna    43282.283358          134      

## Breakdown by year

In [5]:

# Split the pandas frame into one CSV per admission year: healthcare_<year>.csv.
#
# The frame is grouped once instead of being re-filtered for every year.
# Rows whose admission date is NaT (possible because the dates were parsed with
# errors='coerce') are dropped by groupby rather than producing an empty
# "healthcare_nan.csv". The group key is cast to int because .dt.year becomes
# float as soon as a single NaT is present, which would otherwise yield file
# names such as "healthcare_2019.0.csv" that the later Spark reads cannot find.
for year, admissions in df.groupby(df['Date of Admission'].dt.year):
    admissions.to_csv(f"healthcare_{int(year)}.csv", index=False)

In [6]:
# Load the 2019 extract with Spark (header row as column names, column types
# inferred from the data) and show a preview.
# NOTE: this rebinds df_2019, which earlier held the pandas slice for 2019.
df_2019 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("healthcare_2019.csv")
)
df_2019.show()

+--------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|                Name|Age|Gender|Blood Type|Medical Condition|Date of Admission|          Doctor|            Hospital|Insurance Provider|    Billing Amount|Room Number|Admission Type|Discharge Date| Medication|Test Results|
+--------------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|        LesLie TErRy| 62|  Male|        A+|          Obesity|       2019-08-20| Samantha Davies|             Kim Inc|          Medicare|33643.327286577885|        265|     Emergency|    2019-08-26|  Ibuprofen|Inconclusive|
|       connOR HANsEn| 75|Female|        A+|         Diabetes|       2019-12-12|Kenneth Fletcher|Powers 

## Reading Incrementally by Year Using Spark

In [7]:
# Read every yearly extract back with Spark and show its first row as a quick check.
years = list(range(2019, 2025))  # 2019 through 2024 inclusive
for year in years:
    df_year = (
        spark.read
        .options(header=True, inferSchema=True)
        .csv(f"healthcare_{year}.csv")
    )
    df_year.show(n=1)

+------------+---+------+----------+-----------------+-----------------+---------------+--------+------------------+------------------+-----------+--------------+--------------+----------+------------+
|        Name|Age|Gender|Blood Type|Medical Condition|Date of Admission|         Doctor|Hospital|Insurance Provider|    Billing Amount|Room Number|Admission Type|Discharge Date|Medication|Test Results|
+------------+---+------+----------+-----------------+-----------------+---------------+--------+------------------+------------------+-----------+--------------+--------------+----------+------------+
|LesLie TErRy| 62|  Male|        A+|          Obesity|       2019-08-20|Samantha Davies| Kim Inc|          Medicare|33643.327286577885|        265|     Emergency|    2019-08-26| Ibuprofen|Inconclusive|
+------------+---+------+----------+-----------------+-----------------+---------------+--------+------------------+------------------+-----------+--------------+--------------+----------+----

## Star Schema

In [13]:
# Parse the discharge dates of the pandas frame into datetimes, in place;
# values that cannot be parsed become NaT because of errors='coerce'.
df['Discharge Date']=pd.to_datetime(df['Discharge Date'],errors='coerce')

### Dimension Tables


In [None]:
# Patient dimension: one row per distinct (Name, Age, Gender, Blood Type)
# combination, with a 1-based surrogate key derived from the row position.
patient = (
    df[['Name', 'Age', 'Gender', 'Blood Type']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(PatientID=lambda dim: dim.index + 1)
)
display(patient)


Unnamed: 0,Name,Age,Gender,Blood Type,PatientID
0,Bobby JacksOn,30,Male,B-,1
1,LesLie TErRy,62,Male,A+,2
2,DaNnY sMitH,76,Female,A-,3
3,andrEw waTtS,28,Female,O+,4
4,adrIENNE bEll,43,Female,AB+,5
...,...,...,...,...,...
54961,eLIZABeTH jaCkSOn,42,Female,O+,54962
54962,KYle pEREz,61,Female,AB-,54963
54963,HEATher WaNG,38,Female,B+,54964
54964,JENniFER JOneS,43,Male,O-,54965


In [42]:
# Doctor dimension: one row per distinct doctor name, with a 1-based
# surrogate key derived from the row position.
doctor = (
    df[['Doctor']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(DoctorID=lambda dim: dim.index + 1)
)
print(doctor.head())

             Doctor  DoctorID
0     Matthew Smith         1
1   Samantha Davies         2
2  Tiffany Mitchell         3
3       Kevin Wells         4
4    Kathleen Hanna         5


In [43]:
# Hospital dimension: one row per distinct hospital name, with a 1-based
# surrogate key derived from the row position.
hospital = (
    df[['Hospital']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(HospitalID=lambda dim: dim.index + 1)
)
print(hospital.head())

                     Hospital  HospitalID
0             Sons and Miller           1
1                     Kim Inc           2
2                    Cook PLC           3
3  Hernandez Rogers and Vang,           4
4                 White-White           5


In [44]:
# Insurance dimension: one row per distinct insurance provider, with a
# 1-based surrogate key derived from the row position.
insurance = (
    df[['Insurance Provider']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(InsuranceID=lambda dim: dim.index + 1)
)
print(insurance.head())

  Insurance Provider  InsuranceID
0         Blue Cross            1
1           Medicare            2
2              Aetna            3
3   UnitedHealthcare            4
4              Cigna            5


In [45]:
# Medication dimension: one row per distinct medication, with a 1-based
# surrogate key derived from the row position.
medication = (
    df[['Medication']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(MedicationID=lambda dim: dim.index + 1)
)
print(medication.head())

    Medication  MedicationID
0  Paracetamol             1
1    Ibuprofen             2
2      Aspirin             3
3   Penicillin             4
4      Lipitor             5


In [None]:
# Date dimension: one row per distinct admission date, with a 1-based
# surrogate key derived from the row position.
date = (
    df[['Date of Admission']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(DateID=lambda dim: dim.index + 1)
)
print(date.head())