In [2]:
import os
from pathlib import Path

In [3]:
%pwd

'c:\\Users\\Hp\\Videos\\classification implementation - machine learning with MLFlow\\research'

In [4]:
# Move from research/ up to the project root exactly once. The guard makes the
# cell idempotent: re-running it no longer climbs a further directory level,
# which would break the relative config/artifact paths used by later cells.
if Path.cwd().name == 'research':
    os.chdir('../')

In [5]:
%pwd

'c:\\Users\\Hp\\Videos\\classification implementation - machine learning with MLFlow'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``,
    which copies the path entries from the ``model_trainer`` section of the
    config file, the hyperparameters from the ``XGBClassifier`` section of the
    params file, and the target column name from the schema file.

    ``frozen=True`` makes every field read-only after construction. The
    annotations are documentation only: dataclasses do not validate or coerce
    the values loaded from YAML.
    """

    # --- locations -------------------------------------------------------
    # Output directory for this stage; created before training and used as the
    # parent folder of the saved model file.
    root_dir: Path
    # CSV file read with pandas to obtain the training rows.
    train_data_path: Path
    # Path to the test-data CSV file.
    test_data_path: Path
    # File name (joined onto root_dir) under which the model is saved.
    model_name: str

    # --- hyperparameters, forwarded as keyword arguments to XGBClassifier ---
    learning_rate: float
    n_estimators: int
    max_depth: int
    subsample: float
    colsample_bytree: float
    gamma: float
    reg_alpha: float
    reg_lambda: float
    min_child_weight: int
    eval_metric: str
    # Loaded from the params file but not forwarded to XGBClassifier by
    # ModelTrainer.train (the corresponding argument is commented out there).
    early_stopping_rounds: int
    tree_method: str
    # NOTE(review): annotated as int; XGBoost documents this as a float ratio -
    # confirm whether fractional values are expected in params.yaml.
    scale_pos_weight: int
    objective: str

    # --- data schema -------------------------------------------------------
    # Name of the label column; dropped from the features and used as y.
    target_column: str

In [7]:
## create configuration manager
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

[2023-09-27 11:58:49,265: INFO: utils: NumExpr defaulting to 8 threads.]


In [8]:
class ConfigurationManager:
    """Reads the project's YAML files and turns them into stage configurations."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the hyperparameter YAML.
            schema_filepath: Location of the data-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``ModelTrainerConfig`` combining the ``model_trainer`` paths, the
            ``XGBClassifier`` hyperparameters and the target column name.
        """
        trainer_section = self.config.model_trainer
        xgb_section = self.params.XGBClassifier
        target_section = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        # Hyperparameters are copied one-to-one from the params file; the tuple
        # order fixes the order in which the attributes are read.
        hyperparameter_names = (
            "learning_rate",
            "n_estimators",
            "max_depth",
            "subsample",
            "colsample_bytree",
            "gamma",
            "reg_alpha",
            "reg_lambda",
            "min_child_weight",
            "eval_metric",
            "early_stopping_rounds",
            "tree_method",
            "scale_pos_weight",
            "objective",
        )

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            **{name: getattr(xgb_section, name) for name in hyperparameter_names},
            target_column=target_section.name,
        )

In [9]:
import pandas as pd
import os
from mlProject import logger
from xgboost import XGBClassifier
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from mlProject.utils.common import feature_processor

In [10]:
class ModelTrainer:
    """Fits the preprocessing + XGBoost pipeline and saves it to disk."""

    def __init__(self,config:ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths, hyperparameters and target column for this stage.
        """
        self.config = config

    def train(self):
        """Train the model pipeline on the training CSV and persist it.

        Reads ``config.train_data_path``, separates the target column from the
        features, fits a ``Pipeline`` made of the project's feature
        preprocessor followed by an ``XGBClassifier``, and writes the fitted
        pipeline to ``<root_dir>/<model_name>`` with joblib.

        The whole pipeline is saved rather than only the classifier: the
        preprocessor is fitted here and exists nowhere else, so a classifier
        saved on its own could not score raw feature rows.

        Returns:
            None. The only result is the file written to disk.
        """
        train_data = pd.read_csv(self.config.train_data_path)

        target_column = self.config.target_column
        train_x = train_data.drop([target_column],axis=1)
        train_y = train_data[[target_column]]

        preprocessor = feature_processor()

        # early_stopping_rounds is intentionally not passed: early stopping
        # needs an evaluation set at fit time, which this pipeline does not supply.
        xgb = XGBClassifier(learning_rate = self.config.learning_rate,
                            n_estimators = self.config.n_estimators,
                            max_depth = self.config.max_depth,
                            subsample = self.config.subsample,
                            colsample_bytree = self.config.colsample_bytree,
                            gamma = self.config.gamma,
                            reg_alpha = self.config.reg_alpha,
                            reg_lambda = self.config.reg_lambda,
                            min_child_weight = self.config.min_child_weight,
                            eval_metric = self.config.eval_metric,
                            tree_method = self.config.tree_method,
                            scale_pos_weight = self.config.scale_pos_weight,
                            objective = self.config.objective,
                            random_state = 42)  # fixed seed for reproducible fits

        model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                         ('model', xgb)
                                         ])

        model_pipeline.fit(train_x,train_y)

        # Persist preprocessor and classifier together so the artifact accepts
        # the same raw columns it was trained on.
        model_path = os.path.join(self.config.root_dir, self.config.model_name)
        joblib.dump(model_pipeline, model_path)


In [11]:
# Smoke-test the training stage end to end: load the configuration, build the
# trainer, then fit and save the model. Exceptions are left to propagate
# untouched so the notebook shows the original traceback (the previous
# `except Exception as e: raise e` wrapper added nothing).
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2023-09-27 11:59:08,424: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-27 11:59:08,434: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-27 11:59:08,439: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-09-27 11:59:08,439: INFO: common: created directory at: artifacts]
[2023-09-27 11:59:08,444: INFO: common: created directory at: artifacts/model_trainer]
