In [30]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
# Read the healthcare CSV (path is relative to the working directory) into a DataFrame
healthcare_dataframe = pd.read_csv(filepath_or_buffer="healthcare_dataset.csv")

In [32]:
# Preview the first ten records to sanity-check column names and value formats
healthcare_dataframe.head(n=10)

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal
5,EMILY JOHNSOn,36,Male,A+,Asthma,2023-12-20,Taylor Newton,Nunez-Humphrey,UnitedHealthcare,48145.110951,389,Urgent,2023-12-24,Ibuprofen,Normal
6,edwArD EDWaRDs,21,Female,AB-,Diabetes,2020-11-03,Kelly Olson,Group Middleton,Medicare,19580.872345,389,Emergency,2020-11-15,Paracetamol,Inconclusive
7,CHrisTInA MARtinez,20,Female,A+,Cancer,2021-12-28,Suzanne Thomas,"Powell Robinson and Valdez,",Cigna,45820.462722,277,Emergency,2022-01-07,Paracetamol,Inconclusive
8,JASmINe aGuIlaR,82,Male,AB+,Asthma,2020-07-01,Daniel Ferguson,Sons Rich and,Cigna,50119.222792,316,Elective,2020-07-14,Aspirin,Abnormal
9,ChRISTopher BerG,58,Female,AB-,Cancer,2021-05-23,Heather Day,Padilla-Walker,UnitedHealthcare,19784.631062,249,Elective,2021-06-22,Paracetamol,Inconclusive


In [33]:
# Print the frame's schema summary: row count, per-column non-null counts, dtypes and memory usage.
# info() writes its report to stdout and returns None, so the cell shows printed text, not a table.
healthcare_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [34]:
# Parse the admission date strings once; the parsed Series is reused for both columns below
admission_dates = pd.to_datetime(healthcare_dataframe['Date of Admission'])

# Replace the raw text column with its datetime version
healthcare_dataframe['Date of Admission'] = admission_dates

# Derive the calendar year of admission (used later to group records per year)
healthcare_dataframe['Admission Year'] = admission_dates.dt.year

In [35]:
# Summary statistics for the numeric columns; the datetime 'Date of Admission' column is
# summarised as well (count/mean/min/quantiles/max, no std).
# NOTE(review): the reported minimum 'Billing Amount' is negative - worth validating upstream.
numeric_summary = healthcare_dataframe.describe()
numeric_summary

Unnamed: 0,Age,Date of Admission,Billing Amount,Room Number,Admission Year
count,55500.0,55500,55500.0,55500.0,55500.0
mean,51.539459,2021-11-01 01:02:22.443243008,25539.316097,301.134829,2021.334631
min,13.0,2019-05-08 00:00:00,-2008.49214,101.0,2019.0
25%,35.0,2020-07-28 00:00:00,13241.224652,202.0,2020.0
50%,52.0,2021-11-01 00:00:00,25538.069376,302.0,2021.0
75%,68.0,2023-02-03 00:00:00,37820.508436,401.0,2023.0
max,89.0,2024-05-07 00:00:00,52764.276736,500.0,2024.0
std,19.602454,,14211.454431,115.243069,1.49731


In [36]:
# Count the records per medical condition, most frequent first
condition_counts = healthcare_dataframe['Medical Condition'].value_counts()
condition_counts

Medical Condition
Arthritis       9308
Diabetes        9304
Hypertension    9245
Obesity         9231
Cancer          9227
Asthma          9185
Name: count, dtype: int64

## Density Plot (KDE)
##### Age distribution of patients

In [37]:
# Kernel density estimate of patient ages.
# An explicit figure/axes pair is used so the chart does not depend on pyplot's
# implicit "current figure" left over from other cells.
fig, ax = plt.subplots()
sns.kdeplot(healthcare_dataframe['Age'], fill=True, ax=ax)
ax.set_title("Age Distribution (KDE)")
ax.set_xlabel("Age")
ax.set_ylabel("Density")

# Save chart (create the output folder first so a fresh checkout does not fail in savefig)
os.makedirs("charts", exist_ok=True)
fig.savefig("charts/age_distribution.png", dpi=300)

# Show chart: the figure has to still be open at this point, otherwise nothing is
# rendered, so it is closed only after it has been displayed.
plt.show()
plt.close(fig)

## Clustered bar plot
##### Medical condition records per year

In [38]:
# Count records per (admission year, medical condition) and pivot the conditions into
# columns: one row per year, one column per condition. fill_value=0 makes a
# year/condition combination with no records an explicit zero instead of NaN.
grouped = (
    healthcare_dataframe
    .groupby(['Admission Year', 'Medical Condition'])
    .size()
    .unstack(fill_value=0)
)

# Plot a clustered bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
grouped.plot(kind='bar', ax=ax, edgecolor='black')

ax.set_title("Medical Condition Records by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Records")
# Anchor the legend outside the axes (to the right) so it does not cover the bars
ax.legend(title="Condition", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()

# Save chart (create the output folder first so a fresh checkout does not fail in savefig)
os.makedirs("charts", exist_ok=True)
fig.savefig("charts/medical_condition_per_year_plot.png", dpi=300)

# Show chart: display first, close afterwards - closing before plt.show() leaves
# nothing to render.
plt.show()
plt.close(fig)

## Box Plot
##### Age by medical condition

In [39]:
# Box plot of patient age for each medical condition, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    x="Medical Condition",
    y="Age",
    data=healthcare_dataframe,
    palette="Set3",
    # hue repeats the x variable only so each box gets its own palette colour;
    # the legend would duplicate the x tick labels, hence legend=False.
    hue="Medical Condition",
    legend=False,
    ax=ax,
)

# Add labels and title
ax.set_title("Box Plot of Ages of Patients per Medical Condition")
ax.set_xlabel("Medical Condition")
ax.set_ylabel("Age (in years)")

# Save chart (create the output folder first so a fresh checkout does not fail in savefig)
os.makedirs("charts", exist_ok=True)
fig.savefig("charts/age_per_medical_condition_box_plot.png", dpi=300)

# Show chart: display first, close afterwards - closing before plt.show() leaves
# nothing to render.
plt.show()
plt.close(fig)



In [40]:
# Correlation heatmap of the numeric columns

# numeric_only=True restricts the correlation matrix to numeric columns; text and
# datetime columns are left out.
correlation = healthcare_dataframe.corr(numeric_only=True)

# Plot the heatmap on an explicit figure/axes pair.
# vmin/vmax pin the diverging colormap to the full correlation range so that 0 sits at
# its neutral midpoint; otherwise near-zero values take the "cold" end of the scale
# and read as negative correlation.
fig, ax = plt.subplots()
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap')

# Save chart (create the output folder first so a fresh checkout does not fail in savefig)
os.makedirs("charts", exist_ok=True)
fig.savefig("charts/correlation_heatmap.png", dpi=300)

# Show chart: display first, close afterwards - closing before plt.show() leaves
# nothing to render.
plt.show()
plt.close(fig)
