In [8]:
import pandas as pd

In [9]:
def read_data(data_path):
    """Load the patient surgical data from an Excel workbook.

    Parameters
    ----------
    data_path : str or path-like
        Location of the workbook; passed to ``pandas.read_excel`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pandas.read_excel`` with its default settings.
    """
    return pd.read_excel(data_path)

def clean_data(df: pd.DataFrame, level_threshold: int = 10) -> pd.DataFrame:
    """Clean the patient surgical data and derive ``updated_num_levels``.

    The frame is modified in place and the same object is returned, so every
    other name bound to ``df`` sees the cleaned columns as well.

    Cleaning steps:

    1. Column names: every run of whitespace (line breaks included) becomes a
       single space, and leading/trailing whitespace is removed.
    2. ``gap_score_preop``, ``gap_score_postop`` and ``num_levels`` are
       converted to nullable ``Int64``; entries that cannot be parsed as
       numbers become ``<NA>``.
    3. ``UIV_implant`` (expected to hold text): surrounding whitespace is
       stripped, the value is capitalized, ``"Fs"``/``"Ps"`` become
       ``"FS"``/``"PS"``, any value containing ``"Fenestrated"`` becomes
       ``"FS"`` and any value containing ``"ether"`` becomes ``"PS"``.
    4. ``sex`` (expected to hold text): surrounding whitespace is stripped and
       ``f``/``female`` and ``m``/``male`` in any letter case become
       ``"FEMALE"`` / ``"MALE"``. Other values keep their spelling.
    5. ``updated_num_levels`` is set to ``"higher"`` where
       ``num_levels >= level_threshold`` and ``"lower"`` where it is below;
       rows with a missing ``num_levels`` receive neither label.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw patient surgical data. Mutated in place.
    level_threshold : int, default 10
        Smallest ``num_levels`` value labelled ``"higher"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, cleaned.

    Raises
    ------
    KeyError
        If a required column is absent once the column names are normalized.
    TypeError
        Raised by the ``Int64`` cast when a score or level count is numeric
        but not a whole number.
    """
    # One \s+ pass already covers "\n", so no separate newline replacement is needed.
    df.columns = df.columns.astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

    # Fail before touching any values, with a message that names what is missing.
    required = ["gap_score_preop", "gap_score_postop", "UIV_implant", "sex", "num_levels"]
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"clean_data: required column(s) not found: {missing}")

    # Nullable Int64 keeps missing scores as <NA> instead of forcing floats.
    score_cols = ["gap_score_preop", "gap_score_postop"]
    df[score_cols] = df[score_cols].apply(pd.to_numeric, errors="coerce").astype("Int64")

    # Unify implant spelling. Stripping first means values such as " fs" or "PS "
    # are recognised instead of slipping past the mapping below.
    df["UIV_implant"] = (
        df["UIV_implant"].str.strip().str.capitalize().replace({"Fs": "FS", "Ps": "PS"})
    )
    df.loc[df["UIV_implant"].str.contains("Fenestrated", na=False), "UIV_implant"] = "FS"
    df.loc[df["UIV_implant"].str.contains("ether", na=False), "UIV_implant"] = "PS"

    # Unify sex labels regardless of letter case or padding. The patterns are
    # anchored, so only exact codes/words are rewritten.
    sex = df["sex"].str.strip()
    sex = sex.str.replace(r"^(?:f|female)$", "FEMALE", case=False, regex=True)
    sex = sex.str.replace(r"^(?:m|male)$", "MALE", case=False, regex=True)
    df["sex"] = sex

    # Bucket the level count. Comparisons on Int64 yield <NA> for missing counts;
    # turning those into False leaves such rows without a label.
    df["num_levels"] = pd.to_numeric(df["num_levels"], errors="coerce").astype("Int64")
    is_higher = (df["num_levels"] >= level_threshold).fillna(False).astype(bool)
    is_lower = (df["num_levels"] < level_threshold).fillna(False).astype(bool)
    df.loc[is_higher, "updated_num_levels"] = "higher"
    df.loc[is_lower, "updated_num_levels"] = "lower"

    return df

In [10]:
from pathlib import Path

# Assumes the kernel's working directory is notebooks/, so one level up is the repo root.
PROJECT_ROOT = Path.cwd().parents[0]
DATA_PATH = PROJECT_ROOT.joinpath("data", "raw", "MSDS_cleaned_0122.xlsx")

# Load the workbook, then list each column with its non-null count and dtype.
df = read_data(DATA_PATH)
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 98 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            277 non-null    int64  
 1   age                           277 non-null    int64  
 2   sex                           277 non-null    str    
 3   bmi                           250 non-null    float64
 4   Race                          276 non-null    str    
 5   LOS_min                       276 non-null    float64
 6   PROC_TIME_MIN                 276 non-null    float64
 7   SMOKING                       228 non-null    str    
 8   C7CSVL_preop                  274 non-null    float64
 9   SVA_preop                     275 non-null    float64
 10  TK_T4_T12_preop               275 non-null    float64
 11  TK_T10_L2_preop               275 non-null    float64
 12  T4PA_preop                    275 non-null    float64
 13  L1PA_preop      

In [11]:
# clean_data edits df in place and returns that same frame, so clean_df and df
# are two names for one object.
clean_df = clean_data(df)

# Show each level count next to its derived "higher"/"lower" label.
print(clean_df.loc[:, ["num_levels", "updated_num_levels"]])

     num_levels updated_num_levels
0             9              lower
1            15             higher
2             9              lower
3            15             higher
4             9              lower
..          ...                ...
272          15             higher
273          15             higher
274           9              lower
275        <NA>                NaN
276          15             higher

[277 rows x 2 columns]


In [12]:
# TODO: implement automatic updates for new categories/features in future datasets

In [15]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]   # notebooks/ → repo root
processed_dir = PROJECT_ROOT / "data" / "processed"
# Create data/processed/ (and any missing parents); a no-op when it already exists.
processed_dir.mkdir(parents=True, exist_ok=True)

# Write the frame returned by clean_data under its own name. Exporting `df` only
# produced cleaned data because clean_data happens to modify its argument in place.
clean_df.to_csv(processed_dir / "cleaned_for_modeling.csv", index=False)
