OT Summary (Nasser)

In [None]:
import pandas as pd

# Workbook of OT surgical encounters; the injury columns listed below are
# summed per 14-day period.
file_path = "Update2_Injury_Categorization_OT.summary.20-Apr-2025 2.xlsx"
df = pd.read_excel(file_path, engine='openpyxl')

# Parse the encounter date. Values that cannot be parsed become NaT, and rows
# left without a date are removed.
df['Date of Surgical Encounter'] = pd.to_datetime(
    df['Date of Surgical Encounter'], errors='coerce'
)
df = df[df['Date of Surgical Encounter'].notna()].copy()

injury_columns = ['CMF', 'Burn', 'Limb', 'Soft Tissue', 'Wound Care', 'NTD']

# ---------------- 14-day buckets counted from the first encounter ----------
# Midnight of the earliest encounter date is day 0 of bucket 0.
start_date = df['Date of Surgical Encounter'].min().normalize()

# Whole days elapsed since the anchor, floor-divided by 14, is the 0-based
# bucket number of each row.
days_since_start = (df['Date of Surgical Encounter'] - start_date).dt.days
df['biweek_idx'] = (days_since_start // 14).astype(int)

# One row per bucket holding the column-wise sum of every injury column.
bucket_totals = df.groupby('biweek_idx')[injury_columns].sum()
biweekly = bucket_totals.reset_index()

# First and last calendar day of each bucket (14 days, both ends included).
bucket_offset = pd.to_timedelta(biweekly['biweek_idx'] * 14, unit='D')
biweekly['period_start'] = start_date + bucket_offset
biweekly['period_end'] = biweekly['period_start'] + pd.Timedelta(days=13)

# Label of the form "YYYY-MM-DD to YYYY-MM-DD".
date_fmt = '%Y-%m-%d'
biweekly['period'] = biweekly['period_start'].dt.strftime(date_fmt).str.cat(
    biweekly['period_end'].dt.strftime(date_fmt), sep=' to '
)

# Output table: label first, then the injury totals, ordered by label.
# ISO-formatted labels sort chronologically even as plain strings.
final_df_OT_summary = (
    biweekly.loc[:, ['period'] + injury_columns]
    .sort_values('period')
    .reset_index(drop=True)
)

print(final_df_OT_summary)

out_path = "biweekly_injury_aggregates_OT.xlsx"
final_df_OT_summary.to_excel(out_path, index=False)

print("Saved aggregated file to:", out_path)


EGH

In [None]:
import pandas as pd

# --- 1) Read the EGH workbook ---------------------------------------------
file_path = "InjuryCategorization_Plastic surgery EGH - Coded 2 manipulated (2).xlsx"
df = pd.read_excel(file_path, engine='openpyxl')

# --- 2) Parse surgery dates ------------------------------------------------
# Values that cannot be parsed become NaT; rows without a usable date are removed.
surgery_dates = pd.to_datetime(df['First Date of Surgery'], errors='coerce')
df['First Date of Surgery'] = surgery_dates
df = df.loc[surgery_dates.notna()].copy()

# --- 3) Assign every row to a 14-day bucket --------------------------------
injury_columns = ['CMF', 'Burn', 'Limb', 'Soft Tissue', 'Wound Care', 'NTD']

# Bucket 0 starts at midnight of the earliest surgery date.
start_date = df['First Date of Surgery'].min().normalize()

# 0-based bucket number: whole days since the anchor, floor-divided by 14.
elapsed_days = (df['First Date of Surgery'] - start_date).dt.days
df['biweek_idx'] = elapsed_days.floordiv(14).astype(int)

# --- 4) Sum the injury columns within each bucket --------------------------
per_bucket = df.groupby('biweek_idx')[injury_columns].sum()
biweekly_injury_counts = per_bucket.reset_index()

# Calendar span covered by each bucket (14 days, both ends included).
period_start = start_date + pd.to_timedelta(
    biweekly_injury_counts['biweek_idx'] * 14, unit='D'
)
period_end = period_start + pd.Timedelta(days=13)
biweekly_injury_counts['period_start'] = period_start
biweekly_injury_counts['period_end'] = period_end

# Label of the form "YYYY-MM-DD to YYYY-MM-DD".
biweekly_injury_counts['period'] = (
    period_start.dt.strftime('%Y-%m-%d') + ' to ' + period_end.dt.strftime('%Y-%m-%d')
)

# Keep only the label and the injury totals for the exported table.
final_df_EGH_summary = biweekly_injury_counts[['period', *injury_columns]].copy()

# --- 5) Write the result ----------------------------------------------------
out_path = "biweekly_injury_aggregates_EGH.xlsx"
final_df_EGH_summary.to_excel(out_path, index=False)

print("Saved aggregated file to:", out_path)
print(final_df_EGH_summary.head())


Shifa

In [None]:
import pandas as pd
import os
from pathlib import Path

# --- 1) Load file ---------------------------------------------------------
file_path = "plastic file Shifa 2_injurycategorization(Sheet1).csv"

# Fail early with a clear message instead of a pandas traceback.
if not Path(file_path).exists():
    raise FileNotFoundError(f"File not found: {file_path!r}")

# The Shifa data is a CSV, so it is read with read_csv (no Excel engine).
df = pd.read_csv(file_path)

# --- 2) Ensure date column is a datetime ----------------------------------
date_col = 'date of admission'   # keep exactly as in CSV header

if date_col not in df.columns:
    raise KeyError(f"Date column {date_col!r} not found in file. Columns found: {list(df.columns)}")

# Unparseable dates become NaT ...
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# ... and rows without a usable date are dropped.
df = df.dropna(subset=[date_col]).copy()

# --- 3) Biweekly bucketing (14-day periods) -------------------------------
injury_columns = ['CMF', 'Burn', 'Limb', 'Soft Tissue', 'Wound Care', 'NTD']

# Check which injury columns are present and warn about missing ones
missing = [c for c in injury_columns if c not in df.columns]
if missing:
    print(f"Warning: these injury columns are missing from the CSV and will be treated as zeros: {missing}")
    # add missing columns as zeros so aggregation still works
    for c in missing:
        df[c] = 0

# Convert injury columns to integers; blanks and non-numeric cells count as 0.
for c in injury_columns:
    df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0).astype(int)

# Bucket 0 starts at midnight of the earliest admission date.
start_date = df[date_col].min().normalize()

# Integer biweek index (0-based): whole days since the anchor, floor-divided by 14.
df['biweek_idx'] = ((df[date_col] - start_date).dt.days // 14).astype(int)

# --- 4) Aggregate injury types by biweek ----------------------------------
biweekly_injury_counts = (
    df.groupby('biweek_idx')[injury_columns]
      .sum()
      .reset_index()
)

# Create readable date ranges (14 days, both ends included)
biweekly_injury_counts['period_start'] = start_date + pd.to_timedelta(
    biweekly_injury_counts['biweek_idx'] * 14, unit='D'
)
biweekly_injury_counts['period_end'] = (
    biweekly_injury_counts['period_start'] + pd.to_timedelta(13, unit='D')
)

# Label of the form "YYYY-MM-DD to YYYY-MM-DD"
biweekly_injury_counts['period'] = (
    biweekly_injury_counts['period_start'].dt.strftime('%Y-%m-%d')
    + ' to '
    + biweekly_injury_counts['period_end'].dt.strftime('%Y-%m-%d')
)

final_df_shifa_summary = biweekly_injury_counts[['period'] + injury_columns].copy()

# --- 5) Save output --------------------------------------------------------
out_path = "biweekly_injury_aggregates_shifa.xlsx"

# Create the target directory only when the path actually has one.
# For a bare file name os.path.dirname() returns "", and os.makedirs("")
# raises FileNotFoundError, which previously aborted the cell before saving.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

final_df_shifa_summary.to_excel(out_path, index=False)
print("Saved aggregated file to:", out_path)
print(final_df_shifa_summary.head(20))


Aggregate Nasser, Shifa, EGH Injuries

In [None]:
import pandas as pd
from pathlib import Path

# ---------------- User-editable file paths --------------------------------
# Inputs: the three per-site biweekly aggregate workbooks. The names match
# the out_path values used by the OT, EGH and Shifa cells.
path1 = "biweekly_injury_aggregates_OT.xlsx"
path2 = "biweekly_injury_aggregates_EGH.xlsx"
path3 = "biweekly_injury_aggregates_shifa.xlsx"

# Output: multi-sheet workbook written at the end of this cell.
out_combined = "combined_biweekly_injury_aggregates_no_prefix.xlsx"
# --------------------------------------------------------------------------


def _parse_period_label(value):
    """Split one period label into ``(period_start, period_end)`` timestamps.

    A label of the form "YYYY-MM-DD to YYYY-MM-DD" yields both dates. A lone
    date is taken as the start of a 14-day period. A missing value, or a lone
    value that cannot be parsed as a date, yields ``(NaT, NaT)``. A malformed
    date inside an "a to b" label raises from ``pd.to_datetime``.
    """
    if pd.isna(value):
        return (pd.NaT, pd.NaT)
    text = str(value)
    if ' to ' in text:
        left, right = text.split(' to ', 1)
        return (pd.to_datetime(left), pd.to_datetime(right))
    # fallback: try single date
    single = pd.to_datetime(text, errors='coerce')
    if pd.isna(single):
        return (pd.NaT, pd.NaT)
    return (single, single + pd.Timedelta(days=13))


def load_period_table(path):
    """Read a per-period aggregate workbook and normalize its period columns.

    The period is taken from the first of these columns that is present:
    ``period_start`` (with optional ``period_end``), ``period``
    ("YYYY-MM-DD to YYYY-MM-DD"), or ``year_month`` (first of the month is
    used as the period start). When no end date is available it is set to
    start + 13 days.

    Args:
        path: Path of the Excel file to read.

    Returns:
        A ``(normalized, full)`` tuple of DataFrames. ``normalized`` holds
        ``period_start``, ``period_end``, the canonical ``period`` label and
        the detected injury columns as integers. ``full`` is the complete
        frame after normalization, i.e. the file's columns plus the
        added/overwritten period columns -- not the untouched file contents.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        KeyError: none of the recognized period columns is present.
        ValueError: no injury column could be identified.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")

    df = pd.read_excel(path).copy()

    # Normalize period_start / period_end from several possible formats
    if 'period_start' in df.columns:
        df['period_start'] = pd.to_datetime(df['period_start'])
        if 'period_end' in df.columns:
            # An existing period_end may hold text; convert it as well so the
            # .dt accessor used for the label below works.
            df['period_end'] = pd.to_datetime(df['period_end'])
        else:
            df['period_end'] = df['period_start'] + pd.Timedelta(days=13)
    elif 'period' in df.columns:
        # Parse the raw cells (no astype(str)) so missing values reach the
        # helper's NaN guard instead of arriving as the string "nan".
        parsed = df['period'].apply(_parse_period_label)
        # Explicit column names keep the shape at two columns even when the
        # table has no rows.
        bounds = pd.DataFrame(
            parsed.tolist(), index=df.index, columns=['period_start', 'period_end']
        )
        df['period_start'] = pd.to_datetime(bounds['period_start'])
        df['period_end'] = pd.to_datetime(bounds['period_end'])
    elif 'year_month' in df.columns:
        # fallback monthly -> treat first-of-month as period_start
        # NOTE(review): this labels a monthly total as a 14-day period -- confirm intended.
        df['period_start'] = pd.to_datetime(df['year_month'].astype(str) + '-01', errors='coerce')
        df['period_end'] = df['period_start'] + pd.Timedelta(days=13)
    else:
        raise KeyError(f"No recognizable period column in {path}. Found columns: {list(df.columns)}")

    # Canonical period string; rows whose dates are NaT get a missing label.
    df['period'] = df['period_start'].dt.strftime('%Y-%m-%d') + ' to ' + df['period_end'].dt.strftime('%Y-%m-%d')

    # Identify injury columns: numeric columns excluding period fields
    exclude = {'period', 'period_start', 'period_end', 'year_month', 'biweek_idx'}
    injury_cols = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]
    # If none found, try common names
    if not injury_cols:
        possible = ['Burn', 'Limb', 'Soft Tissue', 'Wound Care', 'CMF', 'NTD']
        injury_cols = [c for c in possible if c in df.columns]

    if not injury_cols:
        raise ValueError(f"No injury-like numeric columns detected in {path}. Columns: {list(df.columns)}")

    # Ensure integer counts; blanks and non-numeric cells count as 0.
    for c in injury_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0).astype(int)

    # Keep only period_start/end/period + injury cols for merging
    keep = ['period_start', 'period_end', 'period'] + injury_cols
    return df[keep], df  # normalized view + full normalized frame


# Read each site's table; every call returns the normalized frame and the
# full frame it was cut from.
norm1, orig1 = load_period_table(path1)
norm2, orig2 = load_period_table(path2)
norm3, orig3 = load_period_table(path3)

# Injury columns are everything after the three leading period columns;
# collect the distinct names across all inputs in sorted order.
all_injuries = sorted(
    {
        column
        for frame in (norm1, norm2, norm3)
        for column in frame.columns.tolist()[3:]
    }
)

# Ensure each normalized DF has all injury columns (fill missing with zeros)
def ensure_cols(df, cols):
    """Return a copy of ``df`` containing the period columns followed by ``cols``.

    Columns from ``cols`` that ``df`` lacks are added with the value 0. The
    input frame is left untouched: the caller's frame may be a column subset
    of another frame, and adding columns to it in place is both a hidden side
    effect and a source of pandas' SettingWithCopyWarning.

    Args:
        df: Frame with ``period_start``, ``period_end`` and ``period`` columns.
        cols: Injury column names, in the order they should appear.

    Returns:
        New DataFrame ordered as period_start, period_end, period, then ``cols``.

    Raises:
        KeyError: one of the three period columns is missing from ``df``.
    """
    cols = list(cols)
    out = df.copy()
    for c in cols:
        if c not in out.columns:
            out[c] = 0
    # keep ordering: period_start, period_end, period, then injuries
    return out[['period_start', 'period_end', 'period'] + cols].copy()

# Give all three tables the same column set, in the same order.
norm1, norm2, norm3 = (
    ensure_cols(frame, all_injuries) for frame in (norm1, norm2, norm3)
)

# ---- The tables share their injury column names, so a wide merge would
# collide. Stack the rows instead and add up the counts of rows that carry
# the same period.
# NOTE(review): each site's periods are anchored to that site's own earliest
# date, so rows only add up across sites when their windows coincide exactly
# -- confirm this is the intended combination.
concat = pd.concat([norm1, norm2, norm3], ignore_index=True)

# One row per distinct period, injury columns summed, in chronological order.
period_keys = ['period_start', 'period_end', 'period']
agg = (
    concat.groupby(period_keys, as_index=False)[all_injuries]
    .sum()
    .sort_values('period_start')
    .reset_index(drop=True)
)

# Write the combined table plus one sheet per input table.
sheets = [
    ('Combined', agg),
    ('Original_1', orig1),
    ('Original_2', orig2),
    ('Original_3', orig3),
]
with pd.ExcelWriter(out_combined, engine='openpyxl') as writer:
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print("Saved combined workbook to:", out_combined)
print(agg.head(20))
