In [1]:
import os

import joblib
import pandas as pd
import yaml
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Location of the YAML configuration file. The default is the original
# machine-specific path; set the ALZHEIMER_CONFIG_PATH environment variable to
# run this notebook on another machine without editing the cell.
CONFIG_PATH = os.environ.get(
    "ALZHEIMER_CONFIG_PATH",
    r"C:\Users\bhuva\Desktop\Alziemer\config.yaml",
)

# Explicit UTF-8 so the file is decoded the same way on every platform
# (the Windows default codec would otherwise be locale-dependent).
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Pull every setting the later cells rely on into module-level constants.
# A missing key raises KeyError here, before any data is touched.
DATASET_PATH = config["data_path"]
TARGET_VARIABLE = config["target_variable"]
CATEGORICAL_FEATURES = config["categorical_features"]
NUMERICAL_FEATURES = config["numerical_features"]
TEST_SIZE = config["test_size"]
RANDOM_STATE = config["random_state"]
PROCESSED_DATA_DIR = config["processed_data_dir"]
MODEL_DIR = config["model_dir"]

In [3]:
def preprocess_and_split_data(data_path, target_column, categorical_cols, numerical_cols, test_size, random_state, processed_data_dir, model_dir):
    """Load a CSV, encode the target, split it three ways, and preprocess the features.

    The target column is mapped from 'Yes'/'No' to 1/0. The data is split into
    train and a held-out remainder of size ``test_size``; that remainder is
    then split 50/50 into validation and test. Both splits are stratified on
    the target. The preprocessor (StandardScaler on ``numerical_cols``,
    OneHotEncoder on ``categorical_cols``, every other column passed through
    unchanged) is fitted on the training split only and then applied to all
    three splits.

    Args:
        data_path: Path of the CSV file to read.
        target_column: Name of the label column; its values must be 'Yes' or 'No'.
        categorical_cols: Column names to one-hot encode.
        numerical_cols: Column names to standard-scale.
        test_size: Passed to the first ``train_test_split`` call; the resulting
            held-out part is halved into validation and test.
        random_state: Seed used for both splits.
        processed_data_dir: Not used by this function; kept so existing calls
            keep working.
        model_dir: Not used by this function; kept so existing calls keep working.

    Returns:
        Tuple ``(X_train_df, X_val_df, X_test_df, y_train, y_val, y_test,
        preprocessor, all_feature_names)`` where the three ``X_*`` values are
        DataFrames that keep the row index of their split and use
        ``all_feature_names`` as columns.

    Raises:
        ValueError: If the target column contains any value other than
            'Yes' or 'No' (including missing values).
    """
    df = pd.read_csv(data_path)

    # Copy into plain lists so list concatenation below works for any sequence type.
    numerical_cols = list(numerical_cols)
    categorical_cols = list(categorical_cols)

    # Target variable encoding. Anything outside the mapping becomes NaN, which
    # would otherwise leak into the stratified split, so fail loudly instead.
    y = df[target_column].map({'Yes': 1, 'No': 0})
    unmapped_mask = y.isna()
    if unmapped_mask.any():
        bad_values = df.loc[unmapped_mask, target_column].unique().tolist()
        raise ValueError(
            f"Target column {target_column!r} contains values other than 'Yes'/'No': {bad_values}"
        )

    X = df.drop(columns=[target_column])

    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),  # Scale numerical features
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # One-hot encode categorical features
        ],
        remainder='passthrough',
        # Always return a dense array: the results are wrapped in DataFrames
        # below, which cannot be built from a sparse matrix with column names.
        sparse_threshold=0,
    )

    # Split data: train vs. held-out, then held-out into validation/test halves.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=random_state, stratify=y_temp
    )

    # Fit on the training split only so no validation/test statistics leak in.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)
    X_test_processed = preprocessor.transform(X_test)

    # Feature names after one-hot encoding. With no categorical columns the
    # 'cat' encoder is never fitted, so there is nothing to ask it for.
    if categorical_cols:
        categorical_feature_names = (
            preprocessor.named_transformers_['cat']
            .get_feature_names_out(categorical_cols)
            .tolist()
        )
    else:
        categorical_feature_names = []

    # remainder='passthrough' appends every unlisted column after the
    # transformed ones, in original column order; name those columns too so
    # the name list matches the width of the transformed arrays.
    selected = set(numerical_cols) | set(categorical_cols)
    passthrough_cols = [col for col in X.columns if col not in selected]

    all_feature_names = numerical_cols + categorical_feature_names + passthrough_cols

    # Convert processed arrays back to DataFrames, keeping each split's row index.
    X_train_processed_df = pd.DataFrame(X_train_processed, columns=all_feature_names, index=X_train.index)
    X_val_processed_df = pd.DataFrame(X_val_processed, columns=all_feature_names, index=X_val.index)
    X_test_processed_df = pd.DataFrame(X_test_processed, columns=all_feature_names, index=X_test.index)

    return X_train_processed_df, X_val_processed_df, X_test_processed_df, y_train, y_val, y_test, preprocessor, all_feature_names


In [4]:
if __name__ == "__main__":
    # Run the full preprocessing step with the settings loaded from the config.
    X_train_processed, X_val_processed, X_test_processed, y_train, y_val, y_test, preprocessor_obj, feature_names = preprocess_and_split_data(
        DATASET_PATH, TARGET_VARIABLE, CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TEST_SIZE, RANDOM_STATE, PROCESSED_DATA_DIR, MODEL_DIR
    )

    # Create the output directories up front; to_csv and joblib.dump do not
    # create missing parent directories and would fail on a fresh checkout.
    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Save processed data (artifact management). The index is dropped, so the
    # feature and label files for a split line up by row position.
    processed_outputs = {
        "X_train_processed": X_train_processed,
        "X_val_processed": X_val_processed,
        "X_test_processed": X_test_processed,
        "y_train": y_train,
        "y_val": y_val,
        "y_test": y_test,
    }
    for file_stem, data in processed_outputs.items():
        data.to_csv(os.path.join(PROCESSED_DATA_DIR, f"{file_stem}.csv"), index=False)

    # Save the fitted preprocessor and the feature-name list for later use.
    joblib.dump(preprocessor_obj, os.path.join(MODEL_DIR, "preprocessor.joblib"))
    joblib.dump(feature_names, os.path.join(MODEL_DIR, "feature_names.joblib"))
    print("Data preprocessing and split completed. Processed data and preprocessor saved to 'data/processed/' and 'models/' directories.")

Data preprocessing and split completed. Processed data and preprocessor saved to 'data/processed/' and 'models/' directories.
