In [1]:
import pandas as pd
import numpy as np

# Paths (run this notebook from /healthcare-analytics/notebooks)
CLEAN_PATH = "../data/hospital_readmissions.csv"
DIRTY_PATH = "../data/hospital_readmissions_dirty.csv"

df = pd.read_csv(CLEAN_PATH)
orig_len = len(df)
orig_cols = df.columns.tolist()

orig_len, orig_cols


(10000,
 ['Patient_ID',
  'Age',
  'Gender',
  'Primary_Language',
  'Diagnosis_Code',
  'Admission_Type',
  'Length_of_Stay',
  'Readmitted',
  'Cost_of_Stay',
  'Hospital_Region',
  'Insurance_Type',
  'Comorbidities_Count',
  'Admission_Date'])

In [2]:
rng = np.random.default_rng(7)
dirty = df.copy()

# 1. Missing values 
for col, p in {"Cost_of_Stay":0.03, "Insurance_Type":0.04,
               "Diagnosis_Code":0.02, "Primary_Language":0.015,
               "Admission_Date":0.01}.items():
    mask = rng.random(len(dirty)) < p
    dirty.loc[mask, col] = np.nan

# 2. Inconsistent casing / whitespace
def rand_idx(frac): return dirty.sample(frac=frac, random_state=rng.integers(0,1e6)).index
# Regions
dirty.loc[rand_idx(0.12), "Hospital_Region"] = dirty["Hospital_Region"].str.upper()
dirty.loc[rand_idx(0.08), "Hospital_Region"] = dirty["Hospital_Region"].str.lower()
dirty.loc[rand_idx(0.05), "Hospital_Region"] += "  "   # trailing spaces
# Insurance
dirty.loc[rand_idx(0.06), "Insurance_Type"] = dirty["Insurance_Type"].str.lower()
dirty.loc[rand_idx(0.03), "Insurance_Type"] = "  " + dirty["Insurance_Type"].astype(str)
# Language
dirty.loc[rand_idx(0.05), "Primary_Language"] = dirty["Primary_Language"].str.lower()
dirty.loc[rand_idx(0.03), "Primary_Language"] = dirty["Primary_Language"].astype(str) + " "

# 3. Minor typos 
typo_map = {"Medicare":"Meddicare", "Private":"Privatte", "South":"southh"}
idx = rand_idx(0.01)
dirty.loc[idx, "Insurance_Type"] = dirty.loc[idx, "Insurance_Type"].replace(typo_map)

# 4. Outliers + invalid numeric values
med_cost = dirty["Cost_of_Stay"].median()
dirty.loc[rand_idx(0.002), "Cost_of_Stay"] = med_cost * 25
dirty.loc[rand_idx(0.003), "Length_of_Stay"] = 0.0
dirty.loc[rand_idx(0.002), "Length_of_Stay"] = 100.0
dirty.loc[rand_idx(0.001), "Age"] = rng.choice([-5, 150], size=10)

# 5. Mixed date formats
ad = pd.to_datetime(dirty["Admission_Date"], errors="coerce")
fmt = rng.random(len(dirty))
dirty.loc[fmt < 0.35, "Admission_Date"] = ad.dt.strftime("%m/%d/%Y")
dirty.loc[(fmt>=0.35)&(fmt<0.8), "Admission_Date"] = ad.dt.strftime("%Y-%m-%d")
dirty.loc[(fmt>=0.8)&(fmt<0.98), "Admission_Date"] = ad.dt.strftime("%d/%m/%Y")
dirty.loc[fmt>=0.98, "Admission_Date"] = "2024/13/40"  # impossible date

# 6. Duplicate a small fraction of rows 
dupes = dirty.sample(frac=0.005, random_state=rng.integers(0,1e6))
dirty = pd.concat([dirty, dupes], ignore_index=True)

print("Final rows:", len(dirty))


Final rows: 10050


In [3]:
print("=== Imperfection Injection Summary ===")
print("Missing values per column:")
print(dirty.isna().sum())
print("\nDuplicate rows:", dirty.duplicated().sum())
print("\nInvalid ages:",
      ((dirty["Age"]<0) | (dirty["Age"]>120)).sum())
print("Extreme LOS:",
      ((dirty["Length_of_Stay"]<=0) | (dirty["Length_of_Stay"]>60)).sum())
print("Extreme costs:",
      (dirty["Cost_of_Stay"] > df["Cost_of_Stay"].quantile(0.99)*5).sum())


=== Imperfection Injection Summary ===
Missing values per column:
Patient_ID               0
Age                      0
Gender                   0
Primary_Language       148
Diagnosis_Code         195
Admission_Type           0
Length_of_Stay           0
Readmitted               0
Cost_of_Stay           295
Hospital_Region          0
Insurance_Type         364
Comorbidities_Count      0
Admission_Date         105
dtype: int64

Duplicate rows: 50

Invalid ages: 10
Extreme LOS: 50
Extreme costs: 0


In [4]:
dirty.to_csv(DIRTY_PATH, index=False)
print(f"Saved dirty dataset to {DIRTY_PATH}")


Saved dirty dataset to ../data/hospital_readmissions_dirty.csv
