# Healthcare Data Analyst Project — EDA
Exploratory analysis of a synthetic dataset covering KPIs, readmissions, length of stay (LOS), and quality proxies.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the five source tables from CSV files addressed by relative path.
# The timestamp columns are listed up front and handed to pandas via
# parse_dates so they are parsed while the file is read.
encounter_date_cols = ['admit_datetime', 'discharge_datetime']
lab_date_cols = ['collected_datetime']

patients = pd.read_csv('patients.csv')
encounters = pd.read_csv('encounters.csv', parse_dates=encounter_date_cols)
diagnoses = pd.read_csv('diagnoses.csv')
labs = pd.read_csv('labs.csv', parse_dates=lab_date_cols)
meds = pd.read_csv('medications.csv')

# Preview the first rows of the two core tables; as the last expression of
# the cell, this tuple is what the notebook displays.
(patients.head(), encounters.head())


In [None]:
# Headline KPIs: overall readmission rate and mean inpatient length of stay.

# Mean of the readmitted_within_30d column over every encounter, scaled to a
# percentage (presumably a 0/1 or boolean flag -- confirm against the data).
readmission_rate = 100 * encounters['readmitted_within_30d'].mean()

# Length of stay is averaged only over rows whose encounter_type is 'Inpatient'.
is_inpatient = encounters['encounter_type'] == 'Inpatient'
inpatient_los = encounters.loc[is_inpatient, 'length_of_stay_days']
avg_los = inpatient_los.mean()

# Report both figures rounded to two decimals, one per line.
print(
    f'Readmission Rate: {readmission_rate:.2f}%',
    f'Average LOS (days): {avg_los:.2f}',
    sep='\n',
)


In [None]:
# Total charges per encounter type, largest first, drawn as a vertical bar chart.
charges_by_type = (
    encounters
    .groupby('encounter_type')['total_charges']
    .sum()
    .sort_values(ascending=False)
)

charges_by_type.plot(kind='bar', title='Total Charges by Encounter Type')
plt.ylabel('Total Charges ($)')
plt.tight_layout()  # keep tick labels and title inside the figure area
plt.show()


In [None]:
# Readmission rate by primary diagnosis (top 10)

# Keep only the diagnosis rows whose priority is 'P' (treated here as the
# primary diagnosis of an encounter).
pri = diagnoses[diagnoses['priority'] == 'P'].copy()

# Attach the readmission column of the matching encounter. A left join keeps
# every primary-diagnosis row; rows without a matching encounter get NaN,
# which the group mean below ignores.
merged = pri.merge(
    encounters[['encounter_id', 'readmitted_within_30d']],
    on='encounter_id',
    how='left',
)

# Mean of the readmission column per diagnosis description, ten highest first.
readmit_by_dx = (
    merged
    .groupby('description')['readmitted_within_30d']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

# A horizontal bar chart draws the first row at the bottom of the axis, so the
# series is plotted in reversed order to put the highest rate at the top.
# readmit_by_dx itself stays sorted from highest to lowest.
readmit_by_dx.iloc[::-1].plot(kind='barh', title='Readmission Rate by Primary Diagnosis (Top 10)')
# Values are the raw group means (not multiplied by 100), so label them as a fraction.
plt.xlabel('30-Day Readmission Rate (fraction)')
plt.tight_layout()
plt.show()
