# Data Cleaning

In [2]:
import pandas as pd

In [7]:
# load data
washington = pd.read_csv('Washington.csv')
oregon = pd.read_csv('Oregon.csv')

  oregon = pd.read_csv('Oregon.csv')


In [35]:
# convert to numeric data

# Get list of object dtype columns
object_cols = oregon.select_dtypes(include=['object']).columns.tolist()

for col in object_cols:
    # Attempt to convert column to numeric, coercing errors to NaN
    converted = pd.to_numeric(oregon[col].str.replace(',', '').str.replace('$', '').str.strip(), errors='coerce')
    
    # If at least one value successfully converted (non-NaN), replace the column
    if converted.notna().any():
        oregon[col] = converted
        print(f"Converted column '{col}' to numeric.")
    else:
        print(f"Skipped column '{col}' (no numeric values found).")


Skipped column 'Hospital Name' (no numeric values found).
Skipped column 'Hospital Short Name' (no numeric values found).
Skipped column 'Type' (no numeric values found).
Converted column 'Discharges of Acute Inpatient (Medicare)' to numeric.
Converted column 'Discharges of Acute Inpatient' to numeric.
Converted column 'Discharges of DPU (Medicare)' to numeric.
Converted column 'Discharges of DPU' to numeric.
Converted column 'Total Discharges (Medicare)' to numeric.
Converted column 'Total Discharges' to numeric.
Converted column 'Patient Days of Acute Inpatient (Medicare)' to numeric.
Converted column 'Patient Days of Acute Inpatient (Medicaid)' to numeric.
Converted column 'Patient Days of Acute Inpatient (Self Pay)' to numeric.
Converted column 'Patient Days of Acute Inpatient (Commercial)' to numeric.
Converted column 'Patient Days of Acute Inpatient (Others)' to numeric.
Converted column 'Patient Days of Acute Inpatient' to numeric.
Converted column 'Patient Days of DPU (Medicare

In [50]:
# Identify what type of aggregation makes the most sense

# view numerical columns
numeric_cols = oregon.select_dtypes(include='number').columns.tolist()
#list(numeric_cols)

mean_cols = ['Operating Margin', 'Total Margin ']

agg_dict = {col: 'sum' for col in numeric_cols if col not in ['Year', 'Quarter', 'AHA ID', 'Operating Margin', 'Total Margin ']}
agg_dict.update({col: 'mean' for col in mean_cols})

In [57]:
# Aggregate Oregon data
# Choose grouping keys: Hospital + Year + Quarter
group_keys = ['AHA ID', 'Year', 'Quarter']

# Select only numeric columns and groupby
numeric_cols = oregon.select_dtypes(include='number').columns.tolist()

# Group by hospital ID, name, year, quarter and sum numeric columns
oregon_df = oregon.groupby(group_keys).agg(agg_dict)
oregon_df = oregon_df.reset_index()
oregon_df.head()

Unnamed: 0,AHA ID,Year,Quarter,Month,Available Beds,Licensed Beds,Discharges of Acute Inpatient (Medicare),Discharges of Acute Inpatient (Medicaid),Discharges of Acute Inpatient (Self Pay),Discharges of Acute Inpatient (Commercial),...,Gross Patient Accounts Receivable (Self Pay),Gross Patient Accounts Receivable (Commercial),Gross Patient Accounts Receivable (Others),Gross Patient Accounts Receivable,Uncompensated Care,Inpatient Discharges,Total Operating Revenue,Total Revenue,Operating Margin,Total Margin
0,6920003,2007,1,6,1203,1662,771.0,1242,430,0.0,...,54907221,0.0,281437663,463051598,20221600,4663,98490357,100194315,-332586.0,235400.0
1,6920003,2007,2,15,1199,1662,751.0,1200,475,0.0,...,66583891,0.0,286899220,497094899,24867198,4767,112579479,114807909,2540018.0,3282828.0
2,6920003,2007,3,24,1197,1662,769.0,1097,516,0.0,...,72995940,0.0,312444180,525851915,27001412,4770,112507161,112327929,3199369.0,3139625.0
3,6920003,2007,4,33,1220,1662,794.0,1035,398,0.0,...,64945203,0.0,307649411,504706144,19238363,4508,107233076,107119832,1199891.0,1162143.0
4,6920003,2008,1,6,1234,1662,842.0,1284,381,0.0,...,50318750,0.0,295498881,481536188,16161666,4837,110440372,105645654,1237966.0,-360273.0


In [17]:
# dictionary to rename oregon columns to washingtion
rename_map = {
    # IDs and names
    'AHA ID': 'License_Number',
    'Hospital Name': 'Hospital_Name',
    # Location info (add 'City' if available in Oregon dataset or leave blank)
    # Date info (keep Year and Quarter)
    'Year': 'Year',
    'Quarter': 'Quarter',
    # Beds
    'Licensed Beds': 'Licensed_Beds',
    # Discharges: map Oregon discharge columns to Washington equivalents
    'Discharges of Acute Inpatient (Medicare)': 'Acute_Care_Medicare_Discharges',
    'Discharges of Acute Inpatient (Medicaid)': 'Acute_Care_Medicaid_Discharges',
    'Discharges of Acute Inpatient (Self Pay)': 'Acute_Care_Self_Pay_Discharges',
    'Discharges of Acute Inpatient (Commercial)': 'Acute_Care_Commercial_Discharges',
    'Discharges of Acute Inpatient (Others)': 'Acute_Care_Other_Discharges',
    'Discharges of Acute Inpatient': 'Acute_Care_Total_Discharges',

    'Discharges of Swing Bed (Medicare)': 'Swing_Bed_Medicare_Discharges1', 
    'Discharges of Swing Bed (Medicaid)': 'Swing_Bed_Medicaid_Discharges',
    'Discharges of Swing Bed (Self Pay)': 'Swing_Bed_Self_Pay_Discharges',
    'Discharges of Swing Bed (Commercial)': 'Swing_Bed_Commercial_Discharges',
    'Discharges of Swing Bed (Others)': 'Swing_Bed_Other_Discharges',
    'Discharges of Swing Bed': 'Swing_Bed_Total_Discharges',

    'Discharges of Subacute & LTC (Medicare)': 'SNF_Medicare_Discharges',
    'Discharges of Subacute & LTC (Medicaid)': 'SNF_Medicaid_Discharges',
    'Discharges of Subacute & LTC (Self Pay)': 'SNF_Self_Pay_Discharges',
    'Discharges of Subacute & LTC (Commercial)': 'SNF_Commercial_Discharges',
    'Discharges of Subacute & LTC (Others)': 'SNF_Other_Discharges',
    'Discharges of Subacute & LTC': 'SNF_Total_Discharges',

    'Discharges of DPU (Medicare)': 'Psych_Rehab_CDU_Medicare_Discharges',
    'Discharges of DPU (Medicaid)': 'Psych_Rehab_CDU_Medicaid_Discharges',
    'Discharges of DPU (Self Pay)': 'Psych_Rehab_CDU_Self_Pay_Discharges',
    'Discharges of DPU (Commercial)': 'Psych_Rehab_CDU_Commercial_Discharges',
    'Discharges of DPU (Others)': 'Psych_Rehab_CDU_Other_Discharges',
    'Discharges of DPU': 'Psych_Rehab_CDU_Total_Discharges',

    'Total Discharges (Medicare)': 'Total_Medicare_Discharges',
    'Total Discharges (Medicaid)': 'Total_Medicaid_Discharges',
    'Total Discharges (Self Pay)': 'Total_Self_Pay_Discharges',
    'Total Discharges (Commercial)': 'Total_Commercial_Discharges',
    'Total Discharges (Others)': 'Total_Other_Discharges',
    'Total Discharges': 'Total_Discharges',

    # Patient Days (similar mapping as above)
    'Patient Days of Acute Inpatient (Medicare)': 'Acute_Care_Medicare_Patient_Days',
    'Patient Days of Acute Inpatient (Medicaid)': 'Acute_Care_Medicaid_Patient_Days',
    'Patient Days of Acute Inpatient (Self Pay)': 'Acute_Care_Self_Pay_Patient_Days',
    'Patient Days of Acute Inpatient (Commercial)': 'Acute_Care_Commercial_Patient_Days',
    'Patient Days of Acute Inpatient (Others)': 'Acute_Care_Other_Patient_Days',
    'Patient Days of Acute Inpatient': 'Acute_Care_Total_Patient_Days',

    'Patient Days of Swing Bed (Medicare)': 'Swing_Bed_Medicare_Patient_Days',
    'Patient Days of Swing Bed (Medicaid)': 'Swing_Bed_Medicaid_Patient_Days',
    'Patient Days of Swing Bed (Self Pay)': 'Swing_Bed_Self_Pay_Patient_Days',
    'Patient Days of Swing Bed (Commercial)': 'Swing_Bed_Commercial_Patient_Days',
    'Patient Days of Swing Bed (Others)': 'Swing_Bed_Other_Patient_Days',
    'Patient Days of Swing Bed': 'Swing_Bed_Total_Patient_Days',

    'Patient Days of Subacute & LTC (Medicare)': 'SNF_Medicare_Patient_Days',
    'Patient Days of Subacute & LTC (Medicaid)': 'SNF_Medicaid_Patient_Days',
    'Patient Days of Subacute & LTC (Self Pay)': 'SNF_Self_Pay_Patient_Days',
    'Patient Days of Subacute & LTC (Commercial)': 'SNF_Commercial_Patient_Days',
    'Patient Days of Subacute & LTC (Others)': 'SNF_Other_Patient_Days',
    'Patient Days of Subacute & LTC': 'SNF_Total_Patient_Days',

    'Patient Days of DPU (Medicare)': 'Psych_Rehab_CDU_Medicare_Patient_Days',
    'Patient Days of DPU (Medicaid)': 'Psych_Rehab_CDU_Medicaid_Patient_Days',
    'Patient Days of DPU (Self Pay)': 'Psych_Rehab_CDU_Self_Pay_Patient_Days',
    'Patient Days of DPU (Commercial)': 'Psych_Rehab_CDU_Commercial_Patient_Days',
    'Patient Days of DPU (Others)': 'Psych_Rehab_CDU_Other_Patient_Days',
    'Patient Days of DPU': 'Psych_Rehab_CDU_Total_Patient_Days',

    'Total Patient Days (Medicare)': 'Total_Medicare_Patient_Days',
    'Total Patient Days (Medicaid)': 'Total_Medicaid_Patient_Days',
    'Total Patient Days (Self Pay)': 'Total_Self_Pay_Patient_Days',
    'Total Patient Days (Commercial)': 'Total_Commercial_Patient_Days',
    'Total Patient Days (Others)': 'Total_Other_Patient_Days',
    'Total Patient Days': 'Total_Patient_Days',

    # Surgeries, Births, Admissions, Visits
    'Inpatient Surgeries': 'Total_Inpatient_Surgeries',
    'Births': 'Total_Births',
    'Newborn Patient Days': 'Total_Newborn_Days',
    'Admissions from ED': 'Number_of_Admissions_from_ER',
    'Emergency Department Visits': 'Emergency_Room_Visits',
    'Ambulatory Surgery Visits': 'Outpatient_Surgery_Visits',
    'Observation Visits': 'Observation_Visits',
    'Home Health Visits': 'Home_Health_Visits',
    'Other Outpatient Visits': 'All_Other_Visits',
    'Total Outpatient Visits': 'Total_Outpatient_Visits',

    # Revenue and Charges (partial mapping)
    'Charges of Acute Inpatient (Medicare)': 'Acute_Medicare_Inpatient_Revenue',
    'Charges of Acute Inpatient (Medicaid)': 'Acute_Medicaid_Inpatient_Revenue',
    'Charges of Acute Inpatient (Self Pay)': 'Acute_Self_Pay_Inpatient_Revenue',
    'Charges of Acute Inpatient (Commercial)': 'Acute_Commercial_Inpatient_Revenue',
    'Charges of Acute Inpatient (Others)': 'Acute_Other_Inpatient_Revenue',

    'Charges of Acute Outpatient (Medicare)': 'Acute_Medicare_Outpatient_Revenue',
    'Charges of Acute Outpatient (Medicaid)': 'Acute_Medicaid_Outpatient_Revenue',
    'Charges of Acute Outpatient (Self Pay)': 'Acute_Self_Pay_Outpatient_Revenue',
    'Charges of Acute Outpatient (Commercial)': 'Acute_Commercial_Outpatient_Revenue',
    'Charges of Acute Outpatient (Others)': 'Acute_Other_Outpatient_Revenue',

    # ... add other revenue/expense/contractual columns similarly ...

    'Net Nonoperating Gains': 'Net_Non_Operating_Gains_Losses',
    'Tax Subsidies ': 'Tax_Subsidies', 
    'Total Margin ': 'Gross_Total_Margin', 

    'Gross Patient Accounts Receivable (Medicare)': 'Gross_Accounts_Receivable_Medicare',
    'Gross Patient Accounts Receivable (Medicaid)': 'Gross_Accounts_Receivable_Medicaid',
    'Gross Patient Accounts Receivable (Self Pay)': 'Gross_Accounts_Receivable_Self_Pay',
    'Gross Patient Accounts Receivable (Commercial)': 'Gross_Accounts_Receivable_Commercial',
    'Gross Patient Accounts Receivable (Others)': 'Gross_Accounts_Receivable_Other',
    'Gross Patient Accounts Receivable': 'Total_Gross_Accounts_Receivable',

    'Uncompensated Care': 'Uncompensated_Care',
    'Inpatient Discharges': 'Inpatient_Discharges',
    'Total Operating Revenue': 'Total_Operating_Revenue',
    'Total Revenue': 'Total_Revenue'
}


In [61]:
# Rename Oregon columns to match Washington
oregon_df_renamed = oregon_df.rename(columns=rename_map)

In [62]:
oregon_df_renamed.head()

Unnamed: 0,License_Number,Year,Quarter,Month,Available Beds,Licensed_Beds,Acute_Care_Medicare_Discharges,Acute_Care_Medicaid_Discharges,Acute_Care_Self_Pay_Discharges,Acute_Care_Commercial_Discharges,...,Gross_Accounts_Receivable_Self_Pay,Gross_Accounts_Receivable_Commercial,Gross_Accounts_Receivable_Other,Total_Gross_Accounts_Receivable,Uncompensated_Care,Inpatient_Discharges,Total_Operating_Revenue,Total_Revenue,Operating Margin,Gross_Total_Margin
0,6920003,2007,1,6,1203,1662,771.0,1242,430,0.0,...,54907221,0.0,281437663,463051598,20221600,4663,98490357,100194315,-332586.0,235400.0
1,6920003,2007,2,15,1199,1662,751.0,1200,475,0.0,...,66583891,0.0,286899220,497094899,24867198,4767,112579479,114807909,2540018.0,3282828.0
2,6920003,2007,3,24,1197,1662,769.0,1097,516,0.0,...,72995940,0.0,312444180,525851915,27001412,4770,112507161,112327929,3199369.0,3139625.0
3,6920003,2007,4,33,1220,1662,794.0,1035,398,0.0,...,64945203,0.0,307649411,504706144,19238363,4508,107233076,107119832,1199891.0,1162143.0
4,6920003,2008,1,6,1234,1662,842.0,1284,381,0.0,...,50318750,0.0,295498881,481536188,16161666,4837,110440372,105645654,1237966.0,-360273.0


In [63]:
washington.head()

Unnamed: 0,License_Number,Hospital_Name,City,Year,Quarter,Licensed_Beds,Acute_Care_Medicare_Discharges,Acute_Care_Medicaid_Discharges,Acute_Care_Self_Pay_Discharges,Acute_Care_Commercial_Discharges,...,Net_Non_Operating_Gains_Losses,Tax_Subsidies,Gross_Total_Margin,Gross_Accounts_Receivable_Medicare,Gross_Accounts_Receivable_Medicaid,Gross_Accounts_Receivable_Self_Pay,Gross_Accounts_Receivable_Commercial,Gross_Accounts_Receivable_Other,Total_Gross_Accounts_Receivable,Casemix Index
0,1,Swedish Health Services DBA Swedish Medical Cent,Seattle,2018,1,830,3402,1788,0,0,...,0,0,0,211354030,102569207,0,0,301795144,615718381,0.9097
1,3,Swedish Health Services DBA Swedish Medical Cente,Seattle,2018,1,385,1403,297,0,0,...,0,0,0,112111763,44548088,0,0,96354850,253014701,2.1411
2,8,Klickitat County Public Hospital District #1,Goldendale,2018,1,25,35,6,0,0,...,177302,352033,529335,1789954,1009468,0,0,4507006,7306428,0.6653
3,10,Virginia Mason Medical Center,Seattle,2018,1,371,1700,305,0,0,...,276206,0,276206,103212068,39126742,0,0,162775303,305114113,1.5373
4,14,Seattle Children's Hospital,Seattle,2018,1,402,26,1301,0,0,...,39027955,0,39027955,5979895,235055869,0,0,218956152,459991916,1.34
