# Data Pre-processing (pipeline)

Import libraries/packages and load the pre-split train/test data

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../'))
from src.custom_transformers import remove_prefix
from src.feature_lists import get_feature_lists
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import joblib
from src.config import SEED
# Load the already-split raw data: features come from parquet, targets from
# Excel (first column used as the index).
raw_X_train = pd.read_parquet('../data/raw/split/Raw_X_train.parquet')
raw_y_train = pd.read_excel('../data/raw/split/Raw_y_train.xlsx', index_col=0)
raw_X_test = pd.read_parquet('../data/raw/split/Raw_X_test.parquet')
raw_y_test = pd.read_excel('../data/raw/split/Raw_y_test.xlsx', index_col=0)

Classify features by data type

In [None]:
# Column-name groups come from the project helper src.feature_lists;
# the result is indexed by group name below.
feature_lists = get_feature_lists()
binary_cols, numerical_cols, nominal_cols, ordinal_cols = [
    feature_lists[group]
    for group in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
]

In [None]:
# Cast the nominal columns of both splits to float.
# NOTE(review): presumably so train and test share one dtype before one-hot
# encoding -- confirm against the raw data.
for _split_frame in (raw_X_train, raw_X_test):
    _split_frame[nominal_cols] = _split_frame[nominal_cols].astype(float)

In [None]:
# Numerical branch used by the ML pipeline: impute missing values, then
# min-max scale.
num_pipeline = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=SEED, sample_posterior=True)),
    ('scaler', MinMaxScaler()),
])

# --- ML model pipeline ---
# Numerical columns are imputed and scaled, nominal columns are one-hot
# encoded, and every other column is passed through unchanged.
ml_preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', OneHotEncoder(sparse_output=False), nominal_cols),  # drop one hot?
    ],
    remainder='passthrough',
)
# NOTE(review): remove_prefix presumably strips the ColumnTransformer step
# prefixes from the output column names -- confirm in src.custom_transformers.
ml_pipeline = Pipeline(steps=[
    ('preprocessor', ml_preprocessor),
    ('prefix_remover', FunctionTransformer(remove_prefix, validate=False)),
])
ml_pipeline.set_output(transform="pandas")

# --- Nomogram pipeline ---
# Same layout as the ML pipeline, except the numerical columns are only
# imputed (no scaling step).
nomo_preprocessor = ColumnTransformer(
    transformers=[
        ('num', IterativeImputer(random_state=SEED, sample_posterior=True), numerical_cols),
        ('cat', OneHotEncoder(sparse_output=False), nominal_cols),
    ],
    remainder='passthrough',
)
nomo_pipeline = Pipeline(steps=[
    ('preprocessor', nomo_preprocessor),
    ('prefix_remover', FunctionTransformer(remove_prefix, validate=False)),
])
nomo_pipeline.set_output(transform="pandas")

In [None]:
# Fit each pipeline on the training split only, then apply the fitted
# pipeline to the test split so no test information leaks into the
# imputer/scaler/encoder.
ml_train_transformed = ml_pipeline.fit_transform(raw_X_train)
ml_test_transformed = ml_pipeline.transform(raw_X_test)

nomo_train_transformed = nomo_pipeline.fit_transform(raw_X_train)
nomo_test_transformed = nomo_pipeline.transform(raw_X_test)


# Make sure the output directory exists; to_parquet and joblib.dump do not
# create missing parent directories.
os.makedirs('../data/processed', exist_ok=True)

# Export the transformed DataFrames to parquet
ml_train_transformed.to_parquet('../data/processed/ml_train_transformed.parquet')
ml_test_transformed.to_parquet('../data/processed/ml_test_transformed.parquet')

nomo_train_transformed.to_parquet('../data/processed/nomo_train_transformed.parquet')
nomo_test_transformed.to_parquet('../data/processed/nomo_test_transformed.parquet')

# Save the fitted pipelines for future use
joblib.dump(ml_pipeline, '../data/processed/ml_preprocessing_pipeline.pkl')
joblib.dump(nomo_pipeline, '../data/processed/nomo_preprocessing_pipeline.pkl')