<a href="https://colab.research.google.com/github/Edenshmuel/ICU_Nutrition_ML/blob/main/Pipeline_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This notebook defines the preprocessing pipeline for both clustering and prediction models.
It includes transformations for numerical, categorical, and skewed features**

Importing Necessary Libraries

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

Log Transform + Scaling for skewed features

In [None]:
# Right-skewed features: apply log(1 + x) first, then min-max scale the result.
# validate=True makes FunctionTransformer check/convert its input before calling np.log1p.
log_scaler_pipeline = Pipeline(
    [
        ("log_transform", FunctionTransformer(func=np.log1p, validate=True)),
        ("scaler", MinMaxScaler()),
    ]
)

Min-Max Scaling for non-skewed features

In [None]:
# Single-step pipeline: min-max scaling only (no log transform).
scaler_pipeline = Pipeline(
    [
        ("scaler", MinMaxScaler()),
    ]
)

One-Hot Encoding for categorical features

In [None]:
# One-hot encode categorical columns. handle_unknown="ignore" keeps transform
# from raising on categories that were not present during fit.
cat_transformer = Pipeline(
    [
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

This step converts the "Disease" column, which contains multiple diseases as a single string separated by ", ", into a multi-hot encoded format: one binary (0/1) column is created for each unique disease seen during fitting.

In [None]:
# NOTE: legacy, stateless version kept for reference only; nothing in this cell runs.
# It derives the set of disease columns from whatever frame it is called on, so
# two different inputs can produce different output columns. The notebook uses
# the MultiHotDiseaseEncoder class instead.
# def multi_hot_encode_disease(df):
#     df = df.copy()
#     df["Disease"] = df["Disease"].astype(str).str.split(", ")
#     all_diseases = set([d for sublist in df["Disease"] for d in sublist])

#     for disease in all_diseases:
#         df[disease] = df["Disease"].apply(lambda x: 1 if disease in x else 0)

#     df = df.drop(columns=["Disease"])
#     return df

# disease_transformer = FunctionTransformer(multi_hot_encode_disease)

In [None]:
# Instead of a FunctionTransformer for diseases, use a custom transformer that
# learns the set of diseases during fit.
class MultiHotDiseaseEncoder(BaseEstimator, TransformerMixin):
    """Multi-hot encode the ``"Disease"`` column.

    Every value of ``"Disease"`` is cast to ``str`` and split on ``", "``.
    ``fit`` collects the distinct tokens; ``transform`` replaces the
    ``"Disease"`` column with one 0/1 column per collected token.

    Attributes
    ----------
    all_diseases_ : set of str
        Tokens found in ``"Disease"`` during ``fit``.

    Notes
    -----
    NOTE(review): because of ``astype(str)``, a missing value becomes the
    literal token ``"nan"`` and gets its own column -- confirm this is wanted.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the disease vocabulary from ``X["Disease"]``.

        Parameters
        ----------
        X : DataFrame-like with a ``"Disease"`` column.
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Collect every disease token that appears in the "Disease" column.
        self.all_diseases_ = set()
        for diseases_list in X["Disease"].astype(str).str.split(", "):
            self.all_diseases_.update(diseases_list)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``"Disease"`` replaced by 0/1 columns.

        Tokens that were not seen during ``fit`` are ignored. The new columns
        are appended in sorted order, matching ``get_feature_names_out``.
        """
        X = X.copy()
        # Turn each "Disease" value into a list of disease tokens.
        X["Disease"] = X["Disease"].astype(str).str.split(", ")

        # Create one binary column per disease. Iterating the raw set would
        # give an arbitrary (and run-dependent) order that does not match the
        # sorted names reported by get_feature_names_out, so sort here as well.
        for disease in sorted(self.all_diseases_):
            X[disease] = X["Disease"].apply(lambda lst: 1 if disease in lst else 0)

        # Remove the original "Disease" column.
        X.drop(columns=["Disease"], inplace=True)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the column names produced by ``transform``.

        Parameters
        ----------
        input_features : sequence of str, optional
            The original column names (including ``"Disease"``). ``None`` is
            treated as an empty list.

        Returns
        -------
        numpy.ndarray of object
            ``input_features`` without ``"Disease"``, followed by the learned
            disease names in sorted order.
        """
        if input_features is None:
            input_features = []
        output_features = list(input_features)
        # Drop "Disease" if it is present.
        if "Disease" in output_features:
            output_features.remove("Disease")
        # Append the new column names (all learned diseases).
        output_features.extend(sorted(self.all_diseases_))
        return np.array(output_features, dtype=object)

disease_transformer = MultiHotDiseaseEncoder()

This code transforms the categorical "Activity Level" column into numerical values, making it suitable for machine learning models

In [None]:
# Ordinal code for each activity level: position in the tuple is the code,
# from "Sedentary" (0) up to "Extremely Active" (4).
activity_mapping = {
    level: rank
    for rank, level in enumerate(
        (
            "Sedentary",
            "Lightly Active",
            "Moderately Active",
            "Very Active",
            "Extremely Active",
        )
    )
}

In [None]:
def encode_activity_level(X):
    """Return a copy of ``X`` with "Activity Level" replaced by its ordinal code.

    Labels are looked up in ``activity_mapping``; the input frame is left
    untouched. Labels missing from the mapping come out as NaN, because
    ``Series.map`` does not raise on unknown keys.
    """
    encoded = X.copy()
    activity_codes = encoded["Activity Level"].map(activity_mapping)
    encoded["Activity Level"] = activity_codes
    return encoded


# Wrap the function so it can be used as a scikit-learn transformer.
activity_transformer = FunctionTransformer(func=encode_activity_level)

This class is a custom scikit-learn transformer that calculates the Body Mass Index (BMI) based on weight and height

In [None]:
class BMICalculator(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds a ``"BMI"`` column to a frame.

    BMI is computed as ``Weight / Height ** 2`` from the columns ``"Weight"``
    and ``"Height"``.
    NOTE(review): the usual BMI definition expects kilograms and metres; the
    units are not checked here -- confirm against the dataset.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` for scikit-learn compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``"BMI"`` column set."""
        with_bmi = X.copy()
        height_squared = with_bmi["Height"] ** 2
        with_bmi["BMI"] = with_bmi["Weight"] / height_squared
        return with_bmi

Final Preprocessing Pipeline

In [None]:
# Numerical branch: add BMI, then log1p + min-max scale, then min-max scale again.
# NOTE(review): because log_scaler_pipeline is a step here, every numerical
# column (not only the right-skewed ones) goes through log1p, and the trailing
# "scaler" step repeats the MinMaxScaler that log_scaler_pipeline already ends
# with -- confirm both are intended.
num_pipeline = Pipeline(
    [
        ("bmi_calculator", BMICalculator()),
        ("log_scaled", log_scaler_pipeline),
        ("scaler", scaler_pipeline),
    ]
)

In [None]:
def create_preprocessor(numerical_features, categorical_features, Multy_categorical_features,
                        right_skewed_features=None, ordinal_features=None):
    """Build the (unfitted) ColumnTransformer used for preprocessing.

    Parameters
    ----------
    numerical_features : list of str
        Columns sent through ``num_pipeline``.
    categorical_features : list of str
        Columns sent through ``cat_transformer`` (one-hot encoding).
    Multy_categorical_features : list of str
        Currently unused: the disease encoder is always applied to
        ``["Disease"]``. Kept so existing callers keep working.
    right_skewed_features : list of str, optional
        Columns sent through ``log_scaler_pipeline``. The transformer is only
        added when this is non-empty.
    ordinal_features : list of str, optional
        Columns sent through ``activity_transformer``. Defaults to
        ``["Activity Level"]``, the only column that transformer encodes.

    Returns
    -------
    ColumnTransformer
        Transformers in order: optional "log_scaled", "activity",
        "num_pipeline", "cat", "disease".
    """
    # Previously `ordinal_features` was read as an undefined global, which
    # raised NameError on every call; it is now an explicit keyword argument.
    if ordinal_features is None:
        ordinal_features = ["Activity Level"]

    transformers = []

    if right_skewed_features:
        transformers.append(("log_scaled", log_scaler_pipeline, right_skewed_features))

    transformers.append(("activity", activity_transformer, ordinal_features))

    transformers.extend([
        ("num_pipeline", num_pipeline, numerical_features),
        ("cat", cat_transformer, categorical_features),
        ("disease", disease_transformer, ["Disease"])])

    preprocessor = ColumnTransformer(transformers=transformers)

    return preprocessor

In [None]:
def _feature_names_or_columns(estimator, columns):
    """Return ``estimator.get_feature_names_out(columns)`` as a list, or ``columns`` if unavailable."""
    if hasattr(estimator, "get_feature_names_out"):
        try:
            return list(estimator.get_feature_names_out(columns))
        except Exception:
            # Best effort: e.g. the estimator saw a different number of input
            # columns than `columns` because an earlier step added some.
            pass
    return list(columns)


def get_feature_names(preprocessor, input_features):
    """Collect the output column names of a fitted ColumnTransformer.

    Parameters
    ----------
    preprocessor : fitted ColumnTransformer
        Only its ``transformers_`` attribute (``(name, transformer, columns)``
        triples) is read.
    input_features : sequence of str
        Names of the columns the preprocessor was fitted on; used to resolve
        integer column selections of "passthrough" entries.

    Returns
    -------
    list
        One name per output column, in transformer order.
    """
    feature_names = []

    for name, transformer, columns in preprocessor.transformers_:
        if transformer == "drop":
            # Dropped columns (e.g. the default remainder) produce no output.
            continue

        if transformer == "passthrough":
            # Guard against an empty selection before peeking at columns[0].
            if len(columns) > 0 and isinstance(columns[0], (int, np.integer)):
                feature_names.extend([input_features[i] for i in columns])
            else:
                feature_names.extend(columns)

        elif isinstance(transformer, OneHotEncoder):
            feature_names.extend(transformer.get_feature_names_out(columns))

        elif isinstance(transformer, Pipeline):
            # Names are taken from the pipeline's last step when it can report them.
            names = _feature_names_or_columns(transformer.steps[-1][1], columns)
            # BMICalculator appends "BMI" as the LAST column of its output, so
            # the name belongs at the end of this pipeline's columns, not
            # directly after "Weight".
            adds_bmi = any(isinstance(step, BMICalculator) for _, step in transformer.steps)
            if adds_bmi and "BMI" not in names:
                names.append("BMI")
            feature_names.extend(names)

        else:
            feature_names.extend(_feature_names_or_columns(transformer, columns))

    return feature_names
