# Create Training Pipeline

### Processing and Encoding Pipeline

# Trining Pipelien
### processing

In [10]:
import pandas as pd
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler
from omegaconf import DictConfig

def dtype_conversion(X, cat_cols):
    X = X.copy()
    for col in cat_cols:
        if col in X.columns:
            # Fill NA and convert to category
            X[col] = X[col].fillna('missing').astype('category')
    return X
def read_process_data(
    cfg: DictConfig,
    logger
):
    """Data processing pipeline"""
    logger.info("Starting data processing")
    
    try:
        # 1. Load data
        df = pd.read_csv(os.path.join(cfg.raw_data_path, f'{cfg.file_name}.csv'))
        logger.info(f"Raw data loaded: {df.shape}")
        
        # 2. Validate initial data
        if df[cfg.target].isna().any():
            raise ValueError(f"Target column '{cfg.target}' contains missing values")
        
        # 3. Split data
        train_df, test_df = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df[cfg.target]
        )
        logger.info(f"Train/Test split: {train_df.shape}/{test_df.shape}")

        # 4. Create processing pipeline
        cat_cols = cfg.features.categorical
        num_cols = cfg.features.numerical
        drop_cols = cfg.features.drop
        processing_pipeline = Pipeline([
            ('dtype_conversion', FunctionTransformer(
                func=dtype_conversion,
                kw_args={'cat_cols': cat_cols},
                validate=False
            )),
            
            ('numeric_imputer', MeanMedianImputer(
                imputation_method='median',
                variables=num_cols
            )),
            
            ('encoder', OneHotEncoder(
                drop_last=True,
                variables=cat_cols
            )),
            
            ('scaler', SklearnTransformerWrapper(
                transformer=StandardScaler(),
                variables=num_cols
            )),
            
            ('drop_features', DropFeatures(
                features_to_drop=drop_cols 
            ))
        ])

        # 5. Process data
        X_train = processing_pipeline.fit_transform(train_df)
        X_test = processing_pipeline.transform(test_df)

        # 6. Combine with target (critical fix)
        train_clean = pd.concat([
            X_train,
            train_df[cfg.target].rename(cfg.target)  # Preserve original index
        ], axis=1)
        
        test_clean = pd.concat([
            X_test,
            test_df[cfg.target].rename(cfg.target)  # Preserve original index
        ], axis=1)

        # 7. Validate output
        if len(train_clean) != len(train_df):
            raise ValueError("Row count mismatch in training data")
        if len(test_clean) != len(test_df):
            raise ValueError("Row count mismatch in test data")

        # 8. Save artifacts
        DESTINATION = cfg.processed_data_path
        file_name = cfg.file_name
        os.makedirs(DESTINATION, exist_ok=True)
        train_clean.to_parquet(os.path.join(DESTINATION, f"{file_name}-train.parquet"))
        test_clean.to_parquet(os.path.join(DESTINATION, f"{file_name}-test.parquet"))
        joblib.dump(processing_pipeline, os.path.join(DESTINATION, "pipeline.pkl"))

        logger.info(f"Processing complete. Final shapes: Train {train_clean.shape}, Test {test_clean.shape}")
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}")
        raise

### Training and Tuning the Model

In [2]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.linear_model import LogisticRegression
import pickle
SOURCE = os.path.join("data", "processed")
MODEL_PATH = 'models'
def encode_target(file_name : str, 
                  target_col : str,
                  model_name : str,
                  logger):
    df_train = pd.read_parquet(os.path.join(DESTINATION, f"{file_name}-train.parquet"))
    df_test = pd.read_parquet(os.path.join(DESTINATION, f"{file_name}-test.parquet"))
    X_train , y_train = df_train.drop(columns=[target_col],axis=1) , df_train[target_col]    
    X_test ,y_test = df_test.drop(columns=[target_col],axis=1) , df_test[target_col]

    logger.info("Fitting the encoder/decoder of target variable")
    logger.info(f"Number of classes: {len(y_train.unique())}")
    """Create and fit encoder/decoder for target variable"""
    encoder = LabelEncoder()
    encoder.fit(y_train)
    # Create decoder mapping
    classes = encoder.classes_
    decoder = {i: cls for i, cls in enumerate(classes)}
    target_translator = {
        'encoder': encoder,
        'decoder': decoder,
    }
    logger.info("encoder/decoder of target created successfully")
    # Save the artifacts
    
    if not os.path.exists(os.path.join(MODEL_PATH, model_name)):
        os.makedirs(os.path.join(MODEL_PATH, model_name))
    with open(
        os.path.join(MODEL_PATH, model_name, "model_target_translator.pkl"),
        "wb",
    ) as pkl:
        pickle.dump(target_translator, pkl)
    logger.info("encoder/decoder of target saved")
    return X_train, y_train, X_test, y_test


In [3]:
from functools import partial
import os
import pickle
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

N_FOLDS = 3
MAX_EVALS = 3

# Updated search space with compatible parameters
SPACE = {
    "penalty": hp.choice("penalty", ["l1", "l2", "elasticnet"]),
    "C": hp.loguniform("C", -4, 4),
    "solver": hp.choice("solver", ["saga"]),  # Saga supports all penalties
    "l1_ratio": hp.uniform("l1_ratio", 0, 1)  # Required for elasticnet
}

def objective(params, X, y, n_folds: int = N_FOLDS):
    """Wrapper function for hyperparameter optimization"""
    try:
        # Handle elasticnet specific parameters
        if params["penalty"] == "elasticnet":
            params["l1_ratio"] = params.get("l1_ratio", 0.5)
        else:
            params.pop("l1_ratio", None)
            
        model = LogisticRegression(**params, max_iter=1000)
        scores = cross_validate(
            model, X, y, 
            cv=n_folds, 
            scoring="accuracy",
            error_score="raise"  # Get detailed errors
        )
        return {
            "loss": -np.mean(scores["test_score"]),  # Minimize negative accuracy
            "params": params,
            "status": STATUS_OK
        }
    except Exception as e:
        return {
            "loss": 0,
            "status": STATUS_FAIL,
            "exception": str(e)
        }

def train_model(X, y, model_name: str, logger):
    """Complete training pipeline with error handling"""
    logger.info("Loading target encoder/decoder")
    try:
        with open(os.path.join(MODEL_PATH, model_name, "model_target_translator.pkl"), "rb") as pkl:
            translator = pickle.load(pkl)
        
        y_train_enc = translator['encoder'].transform(y)
        
        logger.info("Starting hyperparameter optimization")
        bayes_trials = Trials()
        
        best = fmin(
            fn=partial(objective, X=X, y=y_train_enc),
            space=SPACE,
            algo=tpe.suggest,
            max_evals=MAX_EVALS,
            trials=bayes_trials,
            show_progressbar=False
        )
        
        # Get best parameters from trials
        best_params = bayes_trials.best_trial["result"]["params"]
        logger.info(f"Best parameters: {best_params}")
        
        # Train final model
        final_model = LogisticRegression(**best_params, max_iter=1000)
        final_model.fit(X, y_train_enc)
        
        # Save artifacts
        os.makedirs(os.path.join(MODEL_PATH, model_name), exist_ok=True)
        with open(os.path.join(MODEL_PATH, model_name, "final_model.pkl"), "wb") as pkl:
            pickle.dump(final_model, pkl)
            
        logger.info("Model trained and saved successfully")
        
        
    except Exception as e:
        logger.error(f"Training failed: {str(e)}")
        raise




### Evaluation Report

In [4]:
import json
import os
import pickle
from sklearn.metrics import classification_report
from skore import EstimatorReport

MODEL_PATH = "models"
REPORT_PATH = "reports"
def evaluate(X_test, y_test, model_name: str, logger):
    """Proper evaluation function with correct encoding"""
    logger.info("Starting model evaluation")
    
    try:
        # Load artifacts
        with open(os.path.join(MODEL_PATH, model_name, "model_target_translator.pkl"), "rb") as pkl:
            translator = pickle.load(pkl)
            
        with open(os.path.join(MODEL_PATH, model_name, "final_model.pkl"), "rb") as pkl:
            model = pickle.load(pkl)
        
        # Encode test labels
        y_test_enc = translator['encoder'].transform(y_test)
        
        # Generate predictions
        y_pred = model.predict(X_test)
        
        # Convert numeric class labels to strings
        class_names = [str(v) for v in translator['decoder'].values()]
        
        # Generate classification report
        evaluation_report = classification_report(
            y_test_enc,
            y_pred,
            target_names=class_names  # Use string labels
        )
        
        logger.info("saving evaluation report")
        if not os.path.exists(os.path.join(REPORT_PATH, model_name)):
            os.makedirs(os.path.join(REPORT_PATH, model_name))
        with open(
            os.path.join(REPORT_PATH, model_name, "evaluation_report.json"), "w"
        ) as js:
            json.dump(evaluation_report, js, indent=4)
        logger.info(f"Evaluation Report:\n{evaluation_report}")

    except Exception as e:
        logger.error(f"Evaluation failed: {str(e)}")
        raise

### Trainer.py

In [None]:
from src.logger import ExecutorLogger
from src.training.process_data import read_process_data
from src.training.evaluate import evaluate
from src.training.train import train_model,encode_target

import hydra
from omegaconf import DictConfig, OmegaConf
@hydra.main(config_path="conf", config_name="config")
def train_pipeline(cfg: DictConfig):
    logger = ExecutorLogger('train pipeline')
    logger.info("Training Started...")
    logger.info("Pipeline Parameters: \n" f"{OmegaConf.to_yaml(cfg)}")
    read_process_data(cfg.pipeline.data, logger)
    X_train, y_train, X_test, y_test = encode_target(file_name ='titanic', 
                  target_col = 'Survived',
                  model_name = 'basemodel',
                  logger = logger)
    # train_model(X = X_train,
    #         y = y_train,
    #         model_name = 'basemodel',
    #         logger = logger)
    # evaluate(X_test, y_test, "basemodel", logger)
    logger.info("Training Completed...")
    

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path="conf", config_name="config")


In [12]:
train_pipeline()

usage: ipykernel_launcher.py [--help] [--hydra-help] [--version]
                             [--cfg {job,hydra,all}] [--resolve]
                             [--package PACKAGE] [--run] [--multirun]
                             [--shell-completion] [--config-path CONFIG_PATH]
                             [--config-name CONFIG_NAME]
                             [--config-dir CONFIG_DIR]
                             [--experimental-rerun EXPERIMENTAL_RERUN]
                             [--info [{all,config,defaults,defaults-tree,plugins,searchpath}]]
                             [overrides ...]
ipykernel_launcher.py: error: unrecognized arguments: --f=/run/user/1000/jupyter/runtime/kernel-v3ef6c8661eb7d03333c75ec79f240d091aa2da93f.json


SystemExit: 2