# Data Pre-processing (pipeline)

Import libraries/packages and split the data into features and outcome

In [None]:
import sys

sys.path.append("../")
from src.data_utils import get_feature_lists
from src.config import SEED, BASE_PATH
import pandas as pd
import warnings

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
import joblib
import numpy as np
from shutil import rmtree

In [None]:
# Read the raw parquet table, then separate the "ORN" column (kept as
# outcome_df) from the remaining columns (kept as X_df).
raw_path = BASE_PATH / "data" / "raw" / "orn_clean.parquet"
import_df = pd.read_parquet(raw_path)
outcome_df = import_df["ORN"]
X_df = import_df.drop(columns=["ORN"])
feature_lists = get_feature_lists(X_df)

## Pipeline

Classify features by data type

In [None]:
# get_feature_lists is imported from src.data_utils; the four column groups
# are looked up by key on its return value and bound to their own names.
feature_lists = get_feature_lists(X_df)
binary_cols, numerical_cols, nominal_cols, ordinal_cols = (
    feature_lists[group]
    for group in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
)

In [None]:
def remove_prefix(df):
    """Return a copy of ``df`` with the leading ``<name>__`` prefix removed.

    Only the first ``<word characters>__`` segment at the start of each
    column label is stripped (e.g. ``num__age`` -> ``age``). Labels without
    such a prefix are left unchanged, and ``df`` itself is not modified.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        A copy of ``df`` with the renamed columns.
    """
    X = df.copy()
    # Non-greedy on purpose: ``\w`` also matches "_", so a greedy ``\w+__``
    # would run up to the LAST double underscore and truncate feature names
    # that contain "__" themselves (``num__dose__max`` -> ``max``).
    X.columns = X.columns.str.replace(r"^\w+?__", "", regex=True)
    return X


# Numerical pipeline: iterative imputation (median start, seeded) followed
# by min-max scaling.
_num_imputer = IterativeImputer(
    max_iter=10,
    initial_strategy="median",
    sample_posterior=False,
    random_state=SEED,
)
num_pipeline = Pipeline(
    steps=[
        ("imputer", _num_imputer),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical pipeline: one-hot encode into a dense array; unknown
# categories are ignored rather than raising.
_nominal_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
nom_pipeline = Pipeline(steps=[("encoder", _nominal_encoder)])

### ML Model Pipeline ###
# Numerical and nominal columns go through their own sub-pipelines; every
# other column is passed through untouched.
_ml_column_transformer = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_cols),
        ("nom", nom_pipeline, nominal_cols),
    ],
    remainder="passthrough",
)
ml_pipeline = Pipeline(steps=[("preprocessor", _ml_column_transformer)])
# Ask every step to emit pandas DataFrames instead of numpy arrays.
ml_pipeline.set_output(transform="pandas")

### Nomogram Pipeline ###
# Same imputer settings as the ML numerical pipeline, but with no scaling
# step after the imputation.
_nomo_imputer = IterativeImputer(
    max_iter=10,
    initial_strategy="median",
    sample_posterior=False,
    random_state=SEED,
)
nomo_num_pipeline = Pipeline(steps=[("imputer", _nomo_imputer)])

# Nominal columns reuse nom_pipeline; unlisted columns are passed through.
_nomo_column_transformer = ColumnTransformer(
    transformers=[
        ("num", nomo_num_pipeline, numerical_cols),
        ("nom", nom_pipeline, nominal_cols),
    ],
    remainder="passthrough",
)
nomo_pipeline = Pipeline(steps=[("preprocessor", _nomo_column_transformer)])
nomo_pipeline.set_output(transform="pandas")

In [None]:
def _transform_to_numeric_frame(preprocessor, X, feature_names):
    """Transform ``X`` with a fitted preprocessor and cast columns to numeric.

    The transformed output is rebuilt from a plain array, so the returned
    frame has a fresh 0..n-1 index and the labels in ``feature_names`` (with
    their leading ``<name>__`` prefix stripped by ``remove_prefix``).
    """
    values = np.array(preprocessor.transform(X))
    frame = remove_prefix(pd.DataFrame(values, columns=feature_names))
    for col in frame.columns:
        try:
            frame[col] = pd.to_numeric(frame[col])
        except (ValueError, TypeError) as e:
            # Best effort: keep the column as-is and report why it was skipped.
            print(f"Column {col} failed: {e}")
    return frame


def transform_export_data(
    X_train, y_train, preprocessor, data_path, X_val=None, y_val=None
):
    """Fit ``preprocessor`` on the training features, transform and export.

    Args:
        X_train: Training features; ``preprocessor`` is fitted on these only.
        y_train: Training outcome. It is not modified; an index-reset copy is
            returned and saved.
        preprocessor: Object exposing ``fit``, ``transform`` and
            ``get_feature_names_out``.
        data_path: ``pathlib.Path`` of the output directory, or a falsy value
            (e.g. ``None``) to skip saving. An existing directory at this
            path is deleted and recreated.
        X_val: Optional validation features, transformed with the
            preprocessor fitted on ``X_train``.
        y_val: Optional validation outcome.

    Returns:
        dict with ``"X_train"`` and ``"y_train"``; ``"X_val"`` / ``"y_val"``
        are added only when the corresponding argument was provided.
    """
    preprocessor.fit(X_train)
    feature_names = preprocessor.get_feature_names_out()

    X_train_transformed = _transform_to_numeric_frame(
        preprocessor, X_train, feature_names
    )
    # Non-inplace reset: the caller may reuse the same Series for another
    # call, so its index must not be altered here.
    y_train = y_train.reset_index(drop=True)

    result = {"X_train": X_train_transformed, "y_train": y_train}
    if X_val is not None:
        result["X_val"] = _transform_to_numeric_frame(
            preprocessor, X_val, feature_names
        )
    if y_val is not None:
        result["y_val"] = y_val.reset_index(drop=True)

    ### Save processed data ###
    if data_path:
        if data_path.exists():
            warnings.warn(f"Over-writing tabular data at path: {data_path}")
            rmtree(data_path)
        data_path.mkdir(exist_ok=False, parents=True)
        # Train
        X_train_transformed.to_parquet(data_path / "X_train.parquet")
        y_train.to_excel(data_path / "y_train.xlsx")
        # Val (only when provided)
        if "X_val" in result:
            result["X_val"].to_parquet(data_path / "X_val.parquet")
        if "y_val" in result:
            result["y_val"].to_excel(data_path / "y_val.xlsx")
        # Fitted pipeline, so the same transformation can be reloaded later
        joblib.dump(preprocessor, data_path / "pipeline.joblib", compress=3)
    return result

In [None]:
data_path = BASE_PATH / "data" / "processed"

# No train/validation split is made here: each pipeline is fitted on the
# full feature table. A stratified split (train_test_split with
# test_size=0.15, random_state=SEED, stratify=outcome_df) was used
# previously and is currently disabled.
base_data = transform_export_data(
    X_df, outcome_df, ml_pipeline, data_path / "base"
)
nomo_data = transform_export_data(
    X_df, outcome_df, nomo_pipeline, data_path / "nomo"
)