### Patients
patient_id, name, age, arrival_date, departure_date, service, satisfaction
### Services Weekly
week, month, service, available_beds, patients_request, patients_admitted,
patients_refused, patient_satisfaction, staff_morale, event
### Staff Schedule
week, staff_id, staff_name, role, service, present
### Staff
staff_id, staff_name, role, service

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

# Select the Plotly renderer used for every figure shown in this notebook.
# NOTE(review): per the Plotly renderer docs, "notebook_connected" loads
# plotly.js from a CDN, so figures need internet access — confirm this is
# acceptable for the environment the notebook is run in.
pio.renderers.default = "notebook_connected"
# Alternative renderer (disabled):
# pio.renderers.default = "iframe"

# Show every column when a DataFrame is displayed instead of eliding the middle ones
pd.set_option('display.max_columns', None)

### 0. Data Loading

In [18]:
# Locations of the four source tables, relative to the notebook
PATIENTS_CSV = '../data/patients.csv'
SERVICES_CSV = '../data/services_weekly.csv'
STAFF_CSV = '../data/staff.csv'
SCHEDULE_CSV = '../data/staff_schedule.csv'

# All files are comma-separated and read with identical options, so load them
# in one pass; the unpacking order matches the order of the paths below.
df_patients, df_services_weekly, df_staff, df_staff_schedule = (
    pd.read_csv(csv_path, delimiter=',', low_memory=False)
    for csv_path in (PATIENTS_CSV, SERVICES_CSV, STAFF_CSV, SCHEDULE_CSV)
)

# Shared list used by the dataset-wide checks further down
dfs = [df_patients, df_services_weekly, df_staff, df_staff_schedule]

### 1. Data Types

In [19]:
# Stay boundaries become datetimes; anything unparseable turns into NaT
for date_col in ('arrival_date', 'departure_date'):
    df_patients[date_col] = pd.to_datetime(df_patients[date_col], errors='coerce')

# Integer-like columns become numeric; anything unparseable turns into NaN
for frame, int_cols in (
    (df_patients, ('age', 'satisfaction')),
    (df_staff_schedule, ('week', 'present')),
):
    for int_col in int_cols:
        frame[int_col] = pd.to_numeric(frame[int_col], errors='coerce')

# Every weekly service metric except the free-text columns is numeric
numeric_cols = [
    'week',
    'month',
    'available_beds',
    'patients_request',
    'patients_admitted',
    'patients_refused',
    'patient_satisfaction',
    'staff_morale',
]
for col in numeric_cols:
    df_services_weekly[col] = pd.to_numeric(df_services_weekly[col], errors='coerce')

### 2. Check for empty values

In [20]:
def find_empty_values(df: pd.DataFrame) -> dict[str, int]:
    """Count the missing (NaN/NaT/None) entries in every column of ``df``.

    Args:
        df: Table to inspect.

    Returns:
        Mapping of column name to its number of missing values. Counts are
        plain Python ``int`` objects, as the return annotation promises.
    """
    # ``sum()`` yields numpy integers; convert so the result really is
    # ``dict[str, int]`` (e.g. safe to serialise or compare by type).
    return {column: int(missing) for column, missing in df.isna().sum().items()}

def print_empty_values(empty_values_by_column: dict[str, int]) -> None:
    """Print a header followed by one ``column: count`` line per column.

    Args:
        empty_values_by_column: Missing-value count for each column name.
    """
    report_lines = ["Empty values by column:"]
    report_lines.extend(
        f"{column}: {empty_count}"
        for column, empty_count in empty_values_by_column.items()
    )
    print("\n".join(report_lines))

In [21]:
# Report the per-column counts of every dataset that has at least one gap,
# and remember whether any dataset was reported at all.
exist_empty = False
for df in dfs:
    empty_values = find_empty_values(df)
    if not any(count > 0 for count in empty_values.values()):
        # Nothing missing in this dataset: stay silent
        continue
    exist_empty = True
    print_empty_values(empty_values)

if not exist_empty:
    print("No empty values found in any dataset.")

No empty values found in any dataset.


### 3. Check invalid data

In [22]:
def check_invalid_dates(df: pd.DataFrame, date_columns: list[str]) -> None:
    """Print the rows whose value in any of ``date_columns`` is not a date.

    A value counts as invalid when ``pd.to_datetime`` cannot parse it, which
    includes values that are already missing.

    Args:
        df: Table to inspect.
        date_columns: Names of the columns expected to hold dates.
    """
    for column in date_columns:
        unparseable = pd.to_datetime(df[column], errors='coerce').isna()
        if not unparseable.any():
            continue
        invalid_dates = df.loc[unparseable]
        print(f"Invalid dates found in column {column}:\n{invalid_dates}")

def check_negative(df: pd.DataFrame, column: str) -> None:
    """Print the rows of ``df`` whose ``column`` value is below zero.

    Args:
        df: Table to inspect.
        column: Name of the numeric column to test.
    """
    below_zero = df[column].lt(0)
    if not below_zero.any():
        return
    negative_values = df.loc[below_zero]
    print(f"Negative values found in column {column}:\n{negative_values}")

def check_out_of_range(df: pd.DataFrame, column: str, min_value: float, max_value: float) -> None:
    """Print the rows whose ``column`` lies outside ``[min_value, max_value]``.

    Both bounds are inclusive. Missing values compare false against either
    bound and are therefore not reported.

    Args:
        df: Table to inspect.
        column: Name of the numeric column to test.
        min_value: Smallest accepted value.
        max_value: Largest accepted value.
    """
    values = df[column]
    too_low = values.lt(min_value)
    too_high = values.gt(max_value)
    out_of_range_values = df.loc[too_low | too_high]
    if out_of_range_values.empty:
        return
    print(f"Out of range values found in column {column}:\n{out_of_range_values}")

def check_id(df: pd.DataFrame, id_column: str, person_type: str) -> None:
    """Print the rows whose identifier does not follow the expected format.

    A valid identifier is ``{code}-{8 letters/digits}``, where ``code`` is
    ``PAT`` when ``person_type`` is ``'patient'`` and ``STF`` for any other
    ``person_type``. Missing identifiers are reported as invalid.

    Args:
        df: Table to inspect.
        id_column: Name of the column holding the identifiers.
        person_type: ``'patient'`` for patient IDs; anything else is treated
            as staff.
    """
    # Patient IDs start with PAT, Staff IDs start with STF
    code = 'PAT' if person_type == 'patient' else 'STF'

    # Validate the WHOLE column against the full pattern. Checking the format
    # only on rows that already failed the prefix test (as was done before)
    # lets IDs with the right prefix but a malformed suffix slip through.
    # ``fullmatch`` also rejects trailing characters such as a newline.
    has_valid_format = df[id_column].str.fullmatch(rf'{code}-[A-Za-z0-9]{{8}}', na=False)
    invalid_format_ids = df[~has_valid_format]

    if not invalid_format_ids.empty:
        print(f"Invalid {person_type} IDs found in column {id_column}:\n{invalid_format_ids}")

In [23]:
# Check for negative ages in patients dataset
check_negative(df_patients, 'age')
# Check for invalid dates in patients dataset
check_invalid_dates(df_patients, ['arrival_date', 'departure_date'])
# Check weeks in range 1-52 in services_weekly dataset
check_out_of_range(df_services_weekly, 'week', 1, 52)
# Check months in range 1-12 in services_weekly dataset
check_out_of_range(df_services_weekly, 'month', 1, 12)
# Check patient IDs in patients dataset
check_id(df_patients, 'patient_id', 'patient')
# Check staff IDs in staff dataset
check_id(df_staff, 'staff_id', 'staff')

### 4. Check for duplicates

In [24]:
def check_duplicates(df: pd.DataFrame, df_name: str) -> None:
    """Print every row of ``df`` that repeats an earlier row exactly.

    Args:
        df: Table to inspect.
        df_name: Label used for the table in the printed message.
    """
    repeated = df.duplicated()
    if not repeated.any():
        return
    duplicates = df.loc[repeated]
    print(f"Duplicate rows found in {df_name}:\n{duplicates}")

# Run the duplicate check on every dataset. The number in the printed label is
# the position in `dfs`: 0 = patients, 1 = services_weekly, 2 = staff,
# 3 = staff_schedule.
for index, df in enumerate(dfs):
    check_duplicates(df, f"dataset_{index}")

### 5. Add useful fields

In [25]:
# Patient-level fields derived from the stay dates. Week is the ISO calendar
# week of arrival, month the calendar month of arrival.
arrival_dates = pd.to_datetime(df_patients['arrival_date'])
departure_dates = pd.to_datetime(df_patients['departure_date'])
df_patients['length_of_stay'] = (departure_dates - arrival_dates).dt.days
df_patients['week'] = arrival_dates.dt.isocalendar().week
df_patients['month'] = arrival_dates.dt.month

# Staff presence per service and week: overall total plus one count per role.
# Each role gets a 0/1 indicator column ("has this role AND was present"), so
# a plain group sum produces all four figures in one pass.
was_present = df_staff_schedule['present'] == 1
role_attendance = df_staff_schedule[['service', 'week', 'present']].assign(
    doctors_present=((df_staff_schedule['role'] == 'doctor') & was_present).astype('int64'),
    nurses_present=((df_staff_schedule['role'] == 'nurse') & was_present).astype('int64'),
    assistants_present=(
        (df_staff_schedule['role'] == 'nursing_assistant') & was_present
    ).astype('int64'),
)
staff_presence = (
    role_attendance.groupby(['service', 'week'])
    .sum()
    .rename(columns={'present': 'staff_present_total'})
    .reset_index()
)

In [26]:
# Preview the first five rows of the per-service, per-week staffing summary
staff_presence.head(5)

Unnamed: 0,service,week,staff_present_total,doctors_present,nurses_present,assistants_present
0,ICU,1,31,6,17,8
1,ICU,2,30,6,17,7
2,ICU,3,0,0,0,0
3,ICU,4,29,5,16,8
4,ICU,5,28,5,15,8
