In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Single place to repoint the notebook at another copy of the raw data;
# every input file below is resolved relative to this directory.
RAW_DATA_DIR = Path("C:/Users/Abhi/Desktop/Power_BI_Data_Analysis_Project/Healthcare-Patient-Flow-Analytics/data/raw")

# Each raw table is read with its "date" column parsed to datetimes.
patient_visits = pd.read_csv(RAW_DATA_DIR / "patient_visits.csv", parse_dates=["date"])
admissions = pd.read_csv(RAW_DATA_DIR / "admissions.csv", parse_dates=["date"])
beds = pd.read_csv(RAW_DATA_DIR / "beds.csv", parse_dates=["date"])
staffing = pd.read_csv(RAW_DATA_DIR / "staffing.csv", parse_dates=["date"])
wait_times = pd.read_csv(RAW_DATA_DIR / "wait_times.csv", parse_dates=["date"])
weather = pd.read_csv(RAW_DATA_DIR / "weather.csv", parse_dates=["date"])


In [5]:
def clean_columns(df):
    """Normalize the column labels of ``df`` in place and return ``df``.

    Labels are lower-cased, stripped of leading/trailing whitespace, and
    then every remaining space is replaced by an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are rewritten; the same object is returned.

    Returns
    -------
    pandas.DataFrame
        The input frame (not a copy) with the normalized labels.
    """
    lowered = df.columns.str.lower()
    # Strip before replacing so padding spaces do not turn into underscores.
    trimmed = lowered.str.strip()
    df.columns = trimmed.str.replace(" ", "_")
    return df

# Normalize the column labels of every loaded table, in load order.
patient_visits, admissions, beds, staffing, wait_times, weather = [
    clean_columns(table)
    for table in (patient_visits, admissions, beds, staffing, wait_times, weather)
]


In [23]:
# Validate primary keys (no duplicates)
def check_duplicates(df, keys, name):
    """Print how many rows of ``df`` repeat the ``keys`` values of an earlier row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    keys : list of str
        Column labels passed as ``subset`` to ``DataFrame.duplicated``.
    name : str
        Label printed in front of the count.

    Returns
    -------
    None
        The count is only reported on stdout; nothing is raised when it is
        non-zero.
    """
    # duplicated() flags every occurrence after the first, so the sum is the
    # number of surplus rows rather than the number of distinct repeated keys.
    repeated_rows = df.duplicated(subset=keys)
    dup = repeated_rows.sum()
    print(f"{name}: duplicate rows → {dup}")

# (label, table, key columns) for each table whose key should be unique.
# The department-level tables are keyed by date + department; weather is
# keyed by date alone.
key_checks = [
    ("patient_visits", patient_visits, ["date", "department"]),
    ("admissions", admissions, ["date", "department"]),
    ("beds", beds, ["date", "department"]),
    ("staffing", staffing, ["date", "department"]),
    ("wait_times", wait_times, ["date", "department"]),
    ("weather", weather, ["date"]),
]

for table_name, table, key_cols in key_checks:
    check_duplicates(table, key_cols, table_name)


patient_visits: duplicate rows → 0
admissions: duplicate rows → 0
beds: duplicate rows → 0
staffing: duplicate rows → 0
wait_times: duplicate rows → 0
weather: duplicate rows → 0


In [25]:
# Patient & flow metrics
# Negative patient counts are raised to 0.
patient_visits["patient_count"] = patient_visits["patient_count"].clip(lower=0)

# Missing admissions/discharges are treated as 0, then both columns are cast
# to int.
flow_cols = ["admissions", "discharges"]
admissions[flow_cols] = admissions[flow_cols].fillna(0).astype(int)


In [11]:
# Bed occupancy rules: occupied beds are capped at total_beds and floored at 0.
#
# The cap uses Series.clip rather than a row-wise min(axis=1): min() skips
# NaN by default, so a row with a missing beds_occupied would come out equal
# to total_beds (i.e. reported as fully occupied). clip() keeps a missing
# count missing, and a missing total_beds simply leaves the value uncapped.
beds["beds_occupied"] = (
    beds["beds_occupied"]
    .clip(upper=beds["total_beds"])
    .clip(lower=0)
)


In [27]:
# Staffing safety: staff_on_duty values below the floor are raised to it.
MIN_STAFF_ON_DUTY = 1
staffing["staff_on_duty"] = staffing["staff_on_duty"].clip(lower=MIN_STAFF_ON_DUTY)


In [29]:
# Wait time logic: avg_wait_time values below the floor are raised to it.
# NOTE(review): the unit of avg_wait_time is not visible here - confirm.
MIN_AVG_WAIT_TIME = 5
wait_times["avg_wait_time"] = wait_times["avg_wait_time"].clip(lower=MIN_AVG_WAIT_TIME)


In [17]:
# Weather cleanup: a missing rain_flag is treated as 0, then the column is
# cast to int.
rain_flag_filled = weather["rain_flag"].fillna(0)
weather["rain_flag"] = rain_flag_filled.astype(int)


In [31]:
# Departments alignment: the five department-level tables must all contain
# exactly the same set of department values (weather is not part of this
# check).
dept_tables = {
    "patient_visits": patient_visits,
    "admissions": admissions,
    "beds": beds,
    "staffing": staffing,
    "wait_times": wait_times,
}
dept_sets = [set(table["department"]) for table in dept_tables.values()]

# Compare each table with the first one (patient_visits) and keep the
# departments present on only one side, so a failure names the offending
# table(s) and values instead of just saying that something differs.
reference_depts = dept_sets[0]
dept_diffs = {
    table_name: sorted(reference_depts ^ depts, key=str)
    for table_name, depts in zip(dept_tables, dept_sets)
    if depts != reference_depts
}

assert not dept_diffs, (
    f"Department mismatch! Differences vs patient_visits: {dept_diffs}"
)


In [33]:
# Report the earliest and latest date present in patient_visits.
visit_dates = patient_visits["date"]
first_date = visit_dates.min()
last_date = visit_dates.max()

print("Date range:")
print(first_date, "→", last_date)


Date range:
2023-01-01 00:00:00 → 2024-12-31 00:00:00


In [35]:
# Single place to repoint the notebook's output; every cleaned table is
# written to this directory as "<table>_clean.csv" without the index.
PROCESSED_DATA_DIR = Path("C:/Users/Abhi/Desktop/Power_BI_Data_Analysis_Project/Healthcare-Patient-Flow-Analytics/data/processed")

# to_csv does not create missing directories, so make sure the target exists
# before the first write instead of failing on it.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

cleaned_tables = {
    "patient_visits": patient_visits,
    "admissions": admissions,
    "beds": beds,
    "staffing": staffing,
    "wait_times": wait_times,
    "weather": weather,
}

for table_name, table in cleaned_tables.items():
    table.to_csv(PROCESSED_DATA_DIR / f"{table_name}_clean.csv", index=False)