In [1]:
"""

AUTHORS: ANGELA EDITH SILES

"""

# Imports
import logging
import os
import optuna
from optuna.samplers import TPESampler
import xgboost as xgboost_regressor
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def preprocess_dataframe(pandas_df: pd.DataFrame):
    """Preprocess the dataframe obtained after applying feature engineering.
    
    Args:
        pandas_df (pd.DataFrame): DataFrame obtained after loading.
    
    Returns:
        pd.DataFrame: Preprocessed DataFrame.
    """
    logger.info('Preprocessing data: Item_MRP')
    dataset = pandas_df.drop(columns=['Item_Identifier', 'Outlet_Identifier'])

    # Split the dataset into train and test sets
    df_train = dataset.loc[pandas_df['Set'] == 'train']
    df_test = dataset.loc[pandas_df['Set'] == 'test']

    return df_train, df_test

class TuningHyperParametersPipeline(object):
    """Pipeline for tuning hyperparameters using Optuna and XGBoost."""
    
    def __init__(self, input_path, output_path: str = None):
        self.input_path = input_path
        self.output_path = output_path

    def load_data(self) -> pd.DataFrame:
        """Load the dataframe for processing.
        
        Returns:
            pd.DataFrame: Loaded DataFrame.
        """
        try:
            train_file = 'features.csv'
            train_data = os.path.join(self.input_path, train_file)
            pandas_df = pd.read_csv(train_data)
            logger.info("Loading data from: %s", self.input_path)
        except (FileNotFoundError, PermissionError, OSError) as error_load_file:
            logger.exception("An error occurred while loading data: %s", error_load_file)

        return pandas_df

    def prepare_data_for_training(self, df: pd.DataFrame):
        """Prepare the data for training.
        
        Args:
            df (pd.DataFrame): Dataframe to be trained.
        
        Returns:
            x_train, y_train (pd.DataFrame, pd.Series): Datasets obtained after applying the machine learning model.
        """

        global x_train
        global y_train

        df_train, df_test = preprocess_dataframe(pandas_df=df)

        # Delete columns without data
        df_train.drop(['Set'], axis=1, inplace=True)
        df_test.drop(['Item_Outlet_Sales', 'Set'],
                     axis=1, inplace=True)

        seed = 28

        # Split the dataset into training and validation sets
        X = df_train.drop(columns='Item_Outlet_Sales')
        logger.info('Data prepared for training: X')

        y = df_train['Item_Outlet_Sales']

        x_train, _, y_train, _ = train_test_split(
            X, y, test_size=0.3, random_state=seed)
        return x_train, y_train

    def train_model(self, x_train, y_train):
        """Train the model and evaluate its performance.
        
        Args:
            x_train (np.array): Features to be trained.
            y_train (np.array): Target to be trained.
        """
        seed = 28
        model_trained = xgboost_regressor.XGBRegressor(
            objective='reg:linear', n_estimators=10, random_state=seed)
        try:
            # Train the model
            score_model = cross_val_score(
                model_trained, x_train, y_train, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=10)
            logger.info('Model trained successfully')
            logger.info('Score model: %s', score_model)
            logger.info('Mean: %s, Std: %s', np.mean(score_model), np.std(score_model))
            return score_model
        except Exception as e:
            logger.exception("An error occurred while training the model: %s", str(e))

    def evaluate_score(self, param):
        """Return the score after applying xgboost regressor.
        
        Args:
            param (_type_): Variable used as a parameter to pass to the XGB Regressor function.
        
        Returns:
            float: Metric obtained after model training.
        """
        model = xgboost_regressor.XGBRegressor(**param)
        try:
            root_mean_square_error = np.mean(cross_val_score(
                model, x_train, y_train, cv=4, n_jobs=-1, scoring='neg_root_mean_squared_error'))
            return root_mean_square_error
        except Exception as e:
            logger.exception("An error occurred while returning the score: %s", str(e))

    def objective_function(self, trial):
        """Optuna objective function.
        
        Args:
            trial (_type_): Description.
        """
        # Define hyperparameters
        param = {'sampling_method': 'gradient_based', 'reg_lambda':
                 trial.suggest_uniform('lambda', 7.0, 17.0), 'reg_alpha': trial.suggest_uniform('alpha', 7.0, 17.0), 'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.5),
                 'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 0.9),
                 'n_estimators': trial.suggest_int('n_estimators', 0, 100)}
        return self.evaluate_score(param)

    def run(self):
        """Run the entire pipeline."""
        try:
            data_frame = self.load_data()
            x_trained, y_trained = self.prepare_data_for_training(data_frame)
            model_trained = self.train_model(x_train=x_trained, y_train=y_trained)
            study_object = optuna.create_study(
                direction='minimize', sampler=TPESampler())
            study_object.optimize(self.objective_function, n_trials=200)

            fig = optuna.visualization.plot_parallel_coordinate(study_object)
            fig.show()

            logger.info('Best parameters: %s', study_object.best_params)
        except Exception as e:
            logger.exception("An error occurred in the pipeline: %s", str(e))

if __name__ == "__main__":
    try:
        pipeline = TuningHyperParametersPipeline(
            input_path='../data/',
            output_path='')
        pipeline.run()
    except Exception as e:
        logger.exception("An error occurred: %s", str(e))


INFO:__main__:Loading data from: ../data/
INFO:__main__:Preprocessing data: Item_MRP
INFO:__main__:Data prepared for training: X
INFO:__main__:Model trained successfully
INFO:__main__:Score model: [-1174.45481642 -1163.40561215 -1035.87472539 -1129.76640339
 -1066.90335567 -1177.79769901 -1218.15909424 -1189.71950574
 -1080.64951487 -1176.27624407]
INFO:__main__:Mean: -1141.3006970933468, Std: 57.32010374259351
[32m[I 2023-08-29 05:05:17,829][0m A new study created in memory with name: no-name-1b7d40a0-3ee8-4248-bc77-5e3e1132e9cc[0m
[32m[I 2023-08-29 05:05:18,123][0m Trial 0 finished with value: -1156.8655934588846 and parameters: {'lambda': 13.118277116054756, 'alpha': 7.16945787487016, 'learning_rate': 0.28278279341364504, 'colsample_bytree': 0.692715975829107, 'n_estimators': 60}. Best is trial 0 with value: -1156.8655934588846.[0m
[32m[I 2023-08-29 05:05:18,327][0m Trial 1 finished with value: -1157.94955758038 and parameters: {'lambda': 16.71095031214162, 'alpha': 16.45853

INFO:__main__:Best parameters: {'lambda': 10.909353495340472, 'alpha': 9.992948229922746, 'learning_rate': 0.38035877936121876, 'colsample_bytree': 0.4192615975158712, 'n_estimators': 0}
