In [15]:
import pandas as pd
import hashlib

# 🔹 File paths
raw_data_file = "../data/1_Raw_Encrypted_Data.xlsx"
audit_file = "../data/3_Encrypted_Audit_Sample_Cohort_149_Detailed_PCS.xlsx"
ventilator_file = "../data/Detailed_Ventilator_Data_1_30_06_25.csv"

intermediate_output = "patient_wise_combined_data_reordered.xlsx"
final_output = "patient_wise_combined_data_with_ventilator_info_filtered.xlsx"

# 🔹 Target UHID for ventilator info injection
target_uhid = "eda6e9fa-b9b5-4e8c-bbe7-69dd3777d622"

# 🔹 Define column utilities
def fix_uhid(df):
    if 'UHID' in df.columns:
        df['UHID'] = df['UHID'].astype(str).str.strip()
    return df

def clean_columns(df):
    df.columns = df.columns.str.strip().str.upper()
    return df

# 🔹 Load raw data from all sheets
raw_xls = pd.ExcelFile(raw_data_file, engine='openpyxl')
df_list = [clean_columns(fix_uhid(raw_xls.parse(sheet))) for sheet in raw_xls.sheet_names]

# 🔹 Load audit sample
audit_samples = pd.read_excel(audit_file, header=1, engine='openpyxl')
audit_samples = clean_columns(fix_uhid(audit_samples))

# 🔹 Load ventilator data
ventilator_data = pd.read_csv(ventilator_file, engine='python')
ventilator_data.columns = ventilator_data.columns.str.strip()
vent_row = ventilator_data.iloc[0].to_dict()
vent_cols = list(vent_row.keys())

# 🔹 Select patient UHIDs
unique_uhids = audit_samples['UHID'].dropna().unique()[:10]

# 🔹 Clinically preferred column order
ordered_columns = [
    'UHID', 'PATIENTNAME', 'SEX', 'GENDER', 'DATEOFBIRTH', 'AGEINYEARS',
    'DOA', 'ADMITDATE', 'ADMITTED_BED_NO', 'ADMITTED_WARD',
    'DISCHARGE_BED_NO', 'DOD', 'DISCHARGEDATE', 'DATEOFDEATH', 'TIMEOFDEATH',
    'ALOS_CALCULATED', 'PATIENT_STATUS', 'CASESPLIT', 'SPECIALITY', 'BILLDATE',
    'LABEL', 'PRINCIPALDIAGNOSIS', 'PRINCIPALDIAGNOSISPOA',
    'SECONDARYDIAGNOSIS', 'SECONDARYDIAGNOSISPOA',
    'DIAGNOSIS_DETAILS', 'DIAGNOSISNAME', 'COURSE_IN_HOSPITAL',
    'PARAMETERNAME', 'PARAMETERDETAILVALUE',
    'TOTAL_VENTILATION_HOURS', 'DAYSONMECHANICALVENTILATION', 'DMVDAYSAFTERADMISSION',
    'DISCHARGESTATUS', 'DISCHARGE_STATUS_CATEGORY_NAME', 'OPERATION_THEATRE_NOTES',
    'SURGICAL_SERVICE_NAME', 'SURGERY_NOTES_FREE_TEXT',
    'OTSTART DATETIME', 'OTEND DATETIME', 'OTDURATION',
    'PROCEDUREDATE', 'PROCEDURE (ALL)',
    'CODE', 'C1 SECTION', 'C2 BODY SYSTEM', 'C3 ROOTOP', 'C4 BODY PART', 'C5 APPROACH',
    'CODE.1', 'C1 SECTION.1', 'C2 BODY SYSTEM.1', 'C3 ROOTOP.1', 'C4 BODY PART.1', 'C5 APPROACH.1',
    'CODE.2', 'C1 SECTION.2', 'C2 BODY SYSTEM.2', 'C3 ROOTOP.2', 'C4 BODY PART.2', 'C5 APPROACH.2',
    'CODE.3', 'C1 SECTION.3', 'C2 BODY SYSTEM.3', 'C3 ROOTOP.3', 'C4 BODY PART.3', 'C5 APPROACH.3',
    'CODE.4', 'C1 SECTION.4', 'C2 BODY SYSTEM.4', 'C3 ROOTOP.4', 'C4 BODY PART.4', 'C5 APPROACH.4',
    'CODE.5', 'C1 SECTION.5', 'C2 BODY SYSTEM.5', 'C3 ROOTOP.5', 'C4 BODY PART.5', 'C5 APPROACH.5',
    'DOCTOR'
]
ordered_columns = [col.upper() for col in ordered_columns]

# 🔹 Write combined and reordered data
dataframes = {}
with pd.ExcelWriter(intermediate_output, engine='openpyxl') as writer:
    used_sheet_names = set()
    for uhid in unique_uhids:
        base = audit_samples[audit_samples['UHID'] == uhid].drop_duplicates(subset=['UHID']).copy()
        base.reset_index(drop=True, inplace=True)

        for i, df in enumerate(df_list):
            if 'UHID' in df.columns:
                sub = df[df['UHID'] == uhid].copy()
                if not sub.empty:
                    sub = sub.drop_duplicates(subset=['UHID']).reset_index(drop=True)
                    if i == 2:  # surgery sheet
                        for col in sub.columns:
                            if col != 'UHID' and col in base.columns:
                                sub.rename(columns={col: f"{col}_surgery"}, inplace=True)
                    else:
                        sub = sub.drop(columns=[col for col in sub.columns if col in base.columns and col != 'UHID'], errors='ignore')
                    base = pd.merge(base, sub, on='UHID', how='left')

        base = base.loc[:, ~base.columns.duplicated()]
        front = [col for col in ordered_columns if col in base.columns]
        rest = [col for col in base.columns if col not in front]
        base = base[front + rest]

        # Remove surgical duplicates
        base = base.drop(columns=[col for col in base.columns if col.endswith('_surgery') and col.split('_')[0] in ['DATEOFBIRTH', 'GENDER', 'DOA', 'DOD', 'DOCTOR', 'SPECIALITY', 'IPNO']], errors='ignore')

        # Save sheet name safely
        safe_uhid = str(uhid)[:10]
        hash_suffix = hashlib.md5(uhid.encode()).hexdigest()[:4]
        sheet_name = f"UHID_{safe_uhid}_{hash_suffix}"[:31]
        while sheet_name in used_sheet_names:
            hash_suffix = hashlib.md5((uhid + hash_suffix).encode()).hexdigest()[:4]
            sheet_name = f"UHID_{safe_uhid}_{hash_suffix}"[:31]
        used_sheet_names.add(sheet_name)

        # Save for later processing + write
        dataframes[sheet_name] = base
        base.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"✅ Intermediate file saved: {intermediate_output}")

# 🔹 Inject ventilator info and write final file
all_columns_set = set(vent_cols)
for df in dataframes.values():
    all_columns_set.update(df.columns)
all_columns = list(all_columns_set)

with pd.ExcelWriter(final_output, engine='openpyxl') as writer:
    for sheet, df in dataframes.items():
        if target_uhid in df['UHID'].astype(str).values:
            for col in vent_cols:
                df[col] = vent_row[col]
        else:
            for col in vent_cols:
                df[col] = pd.NA

        if 'TOTAL_VENTILATION_HOURS' in df.columns:
            reordered = []
            inserted = False
            for col in df.columns:
                if col == 'TOTAL_VENTILATION_HOURS' and not inserted:
                    reordered.extend([vc for vc in vent_cols if vc in df.columns])
                    reordered.append(col)
                    inserted = True
                elif col not in vent_cols:
                    reordered.append(col)
            df = df[reordered]

        for col in all_columns:
            if col not in df.columns:
                df[col] = pd.NA

        df = df[df.columns]
        df.to_excel(writer, sheet_name=sheet, index=False)

print(f"✅ Final Excel file saved with ventilator info: {final_output}")


✅ Intermediate file saved: patient_wise_combined_data_reordered.xlsx
✅ Final Excel file saved with ventilator info: patient_wise_combined_data_with_ventilator_info_filtered.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentat

In [17]:
import pandas as pd
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import Workbook

# 🔹 Paths
header_file = "../data/patient_data_final_structured.xlsx"  # to get the standardized 2-row headers
data_file = "patient_wise_combined_data_with_ventilator_info_filtered.xlsx"  # actual full data
output_file = "patient_data_final_structured_corrected.xlsx"

# 🔹 Load standardized header (2 rows only)
header_template = pd.read_excel(header_file, sheet_name=0, header=None, nrows=2)
basic_headers = header_template.iloc[0].fillna("").tolist()
detailed_headers = header_template.iloc[1].fillna("").tolist()

# 🔹 Read the actual data
data_xls = pd.ExcelFile(data_file, engine="openpyxl")

# 🔹 Create output writer
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    for sheet in data_xls.sheet_names:
        df = data_xls.parse(sheet)
        df.columns = df.columns.str.strip()

        # Reconstruct column MultiIndex
        unmatched_columns = []
        matched_columns = []        
        basic_row = []
        detailed_row = []

        for col in df.columns:
            if col in detailed_headers:
                idx = detailed_headers.index(col)
                basic_row.append(basic_headers[idx])
                detailed_row.append(detailed_headers[idx])
                matched_columns.append(col)
            else:
                basic_row.append("")
                detailed_row.append(col)
                unmatched_columns.append(col)

        df.columns = pd.MultiIndex.from_tuples(zip(basic_row, detailed_row))

        # Reorder to move 'IPNO' to second column if it exists
        if ('', 'IPNO') in df.columns or any(col[1] == 'IPNO' for col in df.columns):
            ipno_col = [col for col in df.columns if col[1] == 'IPNO']
            other_cols = [col for col in df.columns if col[1] != 'IPNO']
            if ipno_col:
                df = df[[other_cols[0]] + ipno_col + other_cols[1:]]


        # Write with 2-row header
        for r in dataframe_to_rows(df, index=False, header=True):
            if sheet not in writer.sheets:
                writer.book.create_sheet(sheet)
                writer.sheets[sheet] = writer.book[sheet]
            writer.sheets[sheet].append(r)

    # Remove default sheet if any
    if "Sheet" in writer.book.sheetnames:
        std = writer.book["Sheet"]
        writer.book.remove(std)

print(f"\n📝 Sheet: {sheet}")
print(f"✅ Matched Columns: {len(matched_columns)} / {len(df.columns)}")
if unmatched_columns:
    print("⚠️ Unmatched Columns:")
    for col in unmatched_columns:
        print(f"   - {col}")


FileNotFoundError: [Errno 2] No such file or directory: '../data/patient_data_final_structured.xlsx'

In [18]:
import pandas as pd
import os

# Input Excel with multiple sheets
input_file = "patient_data_final_structured_corrected.xlsx"
output_folder = "csv_output"

# Create folder to save CSVs
os.makedirs(output_folder, exist_ok=True)

# Read all sheets
xls = pd.ExcelFile(input_file, engine="openpyxl")
for sheet_name in xls.sheet_names:
    df = pd.read_excel(input_file, sheet_name=sheet_name, header=[0, 1])
    
    # Save to CSV
    output_csv = os.path.join(output_folder, f"{sheet_name}.csv")
    df.to_csv(output_csv, index=False)

print(f"✅ All sheets saved as CSV in folder: {output_folder}")


✅ All sheets saved as CSV in folder: csv_output


In [19]:
import pandas as pd



# Show the column names to check for typos
print(audit_file.columns)

# If the column is indeed named 'Total_Ventilation_Hours', filter it
# filtered_df = df[df['Total_Ventilation_Hours'].notnull()]

# # Extract the 'ipno' values
# ipnos_with_ventilation = filtered_df['IPNO'].unique().tolist()

# # Print the list of IP numbers
# print("IPNOs with ventilation data:")
# print(ipnos_with_ventilation)


AttributeError: 'str' object has no attribute 'columns'