In [2]:
# === Cleaning Script for Dataset 1 (China Heart Attack Risk) ===
import pandas as pd
import numpy as np

# --- Load raw dataset (update path) ---
df = pd.read_csv("dataset1_raw.csv")

# --- Step 1: Standardise column names ---
# Trim surrounding whitespace, turn spaces, hyphens and slashes into
# underscores, then lower-case everything so later steps can look columns up
# by their snake_case names. The separators are literal characters, so
# regex matching is switched off explicitly.
df.columns = (
    df.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.replace("/", "_", regex=False)
    .str.lower()
)

# --- Step 2: Handle missing values ---
# Numeric columns → fill with median
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns → fill with mode
cat_cols = df.select_dtypes(include=["object", "category"]).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() ignores missing values, so a column with no observed values
    # yields an empty result; there is nothing to impute with, so leave
    # the column as-is instead of failing on the lookup of the first mode.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# --- Step 3: Normalise categorical values ---
# Example: standardise Yes/No, Male/Female, etc.
def normalise_binary(x):
    """Map a yes/no-style value to 1 or 0, leaving anything else unchanged.

    The value is converted to text, stripped of surrounding whitespace and
    lower-cased before matching, so ``" Yes"``, ``"TRUE"``, ``True``, ``1``
    and ``1.0`` all become ``1`` and the corresponding negative spellings
    become ``0``.

    Args:
        x: Any scalar cell value.

    Returns:
        ``1`` for an affirmative token, ``0`` for a negative token, and
        ``x`` itself when the value is not recognised (including missing
        values), so the caller can decide how to treat it.
    """
    token = str(x).strip().lower()
    # "1.0"/"0.0" cover flags that were stored or parsed as floats.
    if token in ("yes", "y", "1", "1.0", "true"):
        return 1
    if token in ("no", "n", "0", "0.0", "false"):
        return 0
    return x

# Columns expected to hold yes/no-style flags; each one that is present is
# passed through normalise_binary and then cast to an integer dtype.
binary_vars = [
    "hypertension",
    "diabetes",
    "obesity",
    "family_history_cvd",
    "chronic_kidney_disease",
    "previous_heart_attack",
    "tcm_use",
    "heart_attack",
]
for col in binary_vars:
    if col not in df.columns:
        continue
    encoded = df[col].apply(normalise_binary)
    # The cast raises if any value was left unmapped by normalise_binary.
    df[col] = encoded.astype("int")

# Standardise gender: lower-case, then expand single-letter abbreviations.
if "gender" in df.columns:
    gender_aliases = {"m": "male", "f": "female"}
    lowered_gender = df["gender"].str.lower()
    df["gender"] = lowered_gender.replace(gender_aliases)

# --- Step 4: Ensure categorical dtype where needed ---
categorical_vars = [
    "gender",
    "smoking_status",
    "air_pollution_exposure",
    "physical_activity",
    "stress_level",
    "alcohol_consumption",
    "health_access",
    "rural_or_urban",
    "region",
    "province",
    "employment_status",
    "education_level",
    "income_level",
    "hospital_availability",
]
for col in categorical_vars:
    if col not in df.columns:
        continue
    df[col] = df[col].astype("category")

# --- Step 5: Save cleaned dataset ---
# index=False keeps the DataFrame index out of the exported file.
df.to_csv("cleaned_dataset1.csv", index=False)

print("✅ Cleaning complete. Exported to cleaned_dataset1.csv")
print(df.head())


✅ Cleaning complete. Exported to cleaned_dataset1.csv
   patient_id  age  gender smoking_status  hypertension  diabetes  obesity  \
0           1   55    male     Non-Smoker             0         0        1   
1           2   66  female         Smoker             1         0        0   
2           3   69  female         Smoker             0         0        0   
3           4   45  female         Smoker             0         1        0   
4           5   39  female         Smoker             0         0        0   

  cholesterol_level air_pollution_exposure physical_activity  ...  \
0            Normal                   High              High  ...   
1               Low                 Medium              High  ...   
2               Low                 Medium              High  ...   
3            Normal                 Medium               Low  ...   
4            Normal                 Medium            Medium  ...   

  hospital_availability tcm_use employment_status  education_l