In [2]:
import pandas as pd

In [16]:
def read_data(data_path):
    """Load the patient surgical data from an Excel file.

    Parameters
    ----------
    data_path
        Location of the Excel file; forwarded unchanged to ``pd.read_excel``.

    Returns
    -------
    The object returned by ``pd.read_excel(data_path)``.
    """
    return pd.read_excel(data_path)

def clean_data(df: pd.DataFrame, levels_threshold: int = 10) -> pd.DataFrame:
    """Clean the patient surgical data and return the result as a new frame.

    The input frame is NOT modified; all changes are made on a copy, so the
    raw data stays available to the caller and re-running the cleaning cell
    always starts from the same input.

    Steps performed:
      * ``UIV_implant``: capitalize the text, map ``"Fs"``/``"Ps"`` to
        ``"FS"``/``"PS"``, then collapse any value containing
        ``"Fenestrated"`` to ``"FS"`` and any value containing ``"ether"``
        to ``"PS"``. Missing values stay missing.
      * ``sex``: map ``"F"`` -> ``"FEMALE"`` and ``"M"`` -> ``"MALE"``.
      * ``num_levels``: convert to nullable ``Int64`` (unparseable entries
        become ``<NA>``) and derive ``num_levels_cat`` as ``"higher"``
        (``>= levels_threshold``) or ``"lower"`` (``< levels_threshold``).
        Rows with a missing ``num_levels`` get no category.
      * rename the column ``"global_tilt...18"`` to ``"global_tilt"``.

    Parameters
    ----------
    df
        Frame containing at least the columns ``UIV_implant``, ``sex`` and
        ``num_levels``.
    levels_threshold
        Cut-off separating ``"lower"`` from ``"higher"`` in
        ``num_levels_cat``. Defaults to 10.

    Returns
    -------
    pd.DataFrame
        A cleaned copy of ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # unify spelling
    df["UIV_implant"] = (
        df["UIV_implant"].str.capitalize().replace({"Fs": "FS", "Ps": "PS"})
    )
    df.loc[df["UIV_implant"].str.contains("Fenestrated", na=False), "UIV_implant"] = "FS"
    df.loc[df["UIV_implant"].str.contains("ether", na=False), "UIV_implant"] = "PS"
    df["sex"] = df["sex"].replace({"F": "FEMALE", "M": "MALE"})

    # create column with updated num_levels; <NA> comparisons select no rows,
    # so rows without a valid level count keep a missing category
    df["num_levels"] = pd.to_numeric(df["num_levels"], errors="coerce").astype("Int64")
    df.loc[df["num_levels"] >= levels_threshold, "num_levels_cat"] = "higher"
    df.loc[df["num_levels"] < levels_threshold, "num_levels_cat"] = "lower"

    # rename global_tilt column (assignment instead of inplace=True)
    df = df.rename(columns={"global_tilt...18": "global_tilt"})

    return df

In [17]:
from pathlib import Path

# Repo root is the parent of the working directory (notebook runs from notebooks/).
PROJECT_ROOT = Path.cwd().parents[0]
# Excel workbook under data/raw that is loaded below.
DATA_PATH = PROJECT_ROOT.joinpath("data", "raw", "MSDS_cleaned_with_CCI_ODI.xlsx")

df = read_data(DATA_PATH)

# Print the frame summary (index range, column count, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Columns: 106 entries, id to ODI_12mo
dtypes: float64(59), int64(20), object(27)
memory usage: 227.9+ KB


In [18]:
# Run the cleaning step on the loaded frame.
clean_df = clean_data(df)

# Show the level count next to its derived category.
print(clean_df.loc[:, ["num_levels", "num_levels_cat"]])

     num_levels num_levels_cat
0             9          lower
1            15         higher
2             9          lower
3            15         higher
4             9          lower
..          ...            ...
270          15         higher
271          15         higher
272          15         higher
273           9          lower
274          15         higher

[275 rows x 2 columns]


In [None]:
# Future task: implement automatic updates for new categories/features in future datasets.

In [19]:
from pathlib import Path

# notebooks/ → repo root
PROJECT_ROOT = Path.cwd().parents[0]

# Make sure data/processed exists (creating missing parents) before writing.
processed_dir = PROJECT_ROOT.joinpath("data", "processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame without the index column.
clean_df.to_csv(processed_dir.joinpath("cleaned_for_modeling.csv"), index=False)
