In [None]:
import pandas as pd

In [None]:
def read_data(data_path):
    """Load the patient surgical data from an Excel file.

    Parameters
    ----------
    data_path : str or path-like
        Location of the Excel workbook; handed unchanged to
        ``pandas.read_excel``.

    Returns
    -------
    Whatever ``pandas.read_excel`` returns when called with default options
    (no sheet, dtype or header overrides are applied here).
    """
    return pd.read_excel(data_path)

def clean_data(df):
    """Clean the patient surgical data and return the cleaned frame.

    The frame is modified IN PLACE and the very same object is returned, so
    the caller's ``df`` and the return value are one DataFrame. Pass
    ``df.copy()`` if the raw frame must be kept untouched.

    Steps performed:
      * ``UIV_implant``: capitalised, "Fs"/"Ps" rewritten to "FS"/"PS"; any
        value containing "Fenestrated" becomes "FS" and any value containing
        "ether" becomes "PS".
      * ``sex``: "F"/"M" rewritten to "FEMALE"/"MALE".
      * ``num_levels``: converted to nullable ``Int64`` (unparseable values
        become ``<NA>``); ``num_levels_cat`` is "higher" for >= 10 levels and
        "lower" for < 10, and is left missing where ``num_levels`` is ``<NA>``.
      * Columns "global_tilt...18" and "SMOKING" are renamed to
        "global_tilt" and "smoking" (absent columns are ignored by rename).
      * ``smoking``: text labels encoded as 0 (never smoker) or 1 (former or
        current smoker); missing and unrecognised labels become NaN.

    The function is safe to call again on an already-cleaned frame: a
    ``smoking`` column that is already numeric is left as is instead of being
    re-mapped (re-mapping 0/1 through the label table would turn every value
    into NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``UIV_implant``, ``sex``, ``num_levels`` and either
        ``SMOKING`` or ``smoking``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, cleaned.
    """
    # Unify spelling: "fs"/"FS"/"Fs" -> "FS", "ps"/"PS"/"Ps" -> "PS".
    df["UIV_implant"] = df["UIV_implant"].str.capitalize().replace({"Fs": "FS", "Ps": "PS"})
    # Free-text variants are collapsed by substring; na=False keeps missing rows out of the mask.
    df.loc[df["UIV_implant"].str.contains("Fenestrated", na=False), "UIV_implant"] = "FS"
    df.loc[df["UIV_implant"].str.contains("ether", na=False), "UIV_implant"] = "PS"
    df["sex"] = df["sex"].replace({"F": "FEMALE", "M": "MALE"})

    # Nullable integer dtype so rows with unparseable level counts stay as <NA>.
    df["num_levels"] = pd.to_numeric(df["num_levels"], errors="coerce").astype("Int64")
    levels_cutoff = 10
    # Rows with <NA> levels match neither mask and keep a missing category.
    df.loc[df["num_levels"] >= levels_cutoff, "num_levels_cat"] = "higher"
    df.loc[df["num_levels"] < levels_cutoff, "num_levels_cat"] = "lower"

    # Normalise column names in a single pass.
    df.rename(
        columns={"global_tilt...18": "global_tilt", "SMOKING": "smoking"},
        inplace=True,
    )

    smoking_codes = {
        "Never smoker": 0,
        "Former smoker": 1,
        "Current some day smoker": 1,
        "Current every day smoker": 1,
    }
    # Encode only raw text labels. Because the frame is mutated in place, a
    # second call would otherwise push the 0/1 codes through the label table
    # and wipe the whole column to NaN.
    if not pd.api.types.is_numeric_dtype(df["smoking"]):
        df["smoking"] = df["smoking"].map(smoking_codes)

    return df

In [None]:
from pathlib import Path

# Paths are resolved relative to the working directory: the notebook is
# expected to run from notebooks/, one level below the repo root.
PROJECT_ROOT = Path.cwd().parents[0]
DATA_PATH = PROJECT_ROOT.joinpath("data", "raw", "MSDS_cleaned_with_CCI_ODI.xlsx")

# Load the raw workbook and show column names, dtypes and non-null counts.
df = read_data(DATA_PATH)

df.info()

In [None]:
# Distribution of the raw smoking labels before cleaning, with missing values counted too.
raw_smoking_counts = df["SMOKING"].value_counts(dropna=False)
print(raw_smoking_counts)


In [None]:
# Run the cleaning step, then check the derived level category against the numeric count.
clean_df = clean_data(df)
level_columns = ["num_levels", "num_levels_cat"]
print(clean_df[level_columns])

In [None]:
# Inspect the encoded smoking column on the cleaned frame (the one written to
# disk later), keeping missing values visible in the counts.
print(clean_df["smoking"].value_counts(dropna=False))


The `SMOKING` variable was originally recorded as a categorical field (Never smoker, Former smoker, Current some day smoker, Current every day smoker). For modeling purposes, it was renamed to `smoking` and converted into a binary predictor, where 0 represents never smokers and 1 represents former or current smokers. Approximately 17% of observations had missing smoking values; to preserve sample size and maintain consistency in the mechanical failure model, these missing values were imputed as never smoker (0).

In [None]:
# Impute missing smoking status as 0 (never smoker). Applied to clean_df
# explicitly, since that is the frame exported for modeling.
clean_df["smoking"] = clean_df["smoking"].fillna(0)


In [None]:
# Confirm the imputation on the frame that is exported: only 0/1 should remain.
print(clean_df["smoking"].value_counts())


In [None]:
from pathlib import Path

# Output folder is resolved from the working directory (expected: notebooks/,
# one level below the repo root) and created if it does not exist yet.
PROJECT_ROOT = Path.cwd().parents[0]
processed_dir = PROJECT_ROOT.joinpath("data", "processed")
processed_dir.mkdir(exist_ok=True, parents=True)

# Write the cleaned frame without the index column.
clean_df.to_csv(processed_dir.joinpath("cleaned_for_modeling.csv"), index=False)
