In [13]:
import os

In [14]:
%pwd

'C:\\Users\\pc\\Desktop\\GitHub repos\\End-to-end-ML-project-with-MLflows'

In [15]:
os.chdir("C:/Users/pc/Desktop/GitHub repos/End-to-end-ML-project-with-MLflows")

In [16]:
%pwd

'C:\\Users\\pc\\Desktop\\GitHub repos\\End-to-end-ML-project-with-MLflows'

In [17]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    objective: str
    metric: str           
    boosting_type: str
    num_leaves: int
    learning_rate: float
    feature_fraction: float
    n_estimators: int
    target_column: str

In [18]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_dirs

In [19]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_dirs([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.LightGBM
        target =  self.schema.TARGET_COLUMN

        create_dirs([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            objective = params.objective,
            metric = params.metric ,           
            boosting_type = params.boosting_type,
            num_leaves = params.num_leaves,
            learning_rate = params.learning_rate,
            feature_fraction = params.feature_fraction,
            n_estimators = params.n_estimators,
            target_column = target.name
            
        )

        return model_trainer_config

In [20]:
cm = ConfigurationManager()
config = cm.get_model_trainer_config()

[2025-05-04 23:00:39,255: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-04 23:00:39,259: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-04 23:00:39,261: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-04 23:00:39,263: INFO: common: created directory at: artifacts]
[2025-05-04 23:00:39,265: INFO: common: created directory at: artifacts/model_trainer]


In [21]:
print(config)

ModelTrainerConfig(root_dir='artifacts/model_trainer', train_data_path='artifacts/data_transformation/train.csv', test_data_path='artifacts/data_transformation/test.csv', model_name='model.joblib', objective='regression_l1', metric='mae', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, feature_fraction=0.9, n_estimators=400, target_column='total_available')


In [26]:
import polars as pl
import pandas as pd
from mlProject import logger
import joblib
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

In [37]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        # read the train and test datasets using polars since it's faster
        train_data = pl.read_csv(self.config.train_data_path)
        test_data = pl.read_csv(self.config.test_data_path)

        # convert to pandas dataframe since the model handles them better
        train_data = train_data.to_pandas() 
        test_data = test_data.to_pandas()
        print(train_data['date'].dtype)

        X_train = train_data.drop(['date', self.config.target_column], axis=1)
        X_test = test_data.drop(['date', self.config.target_column], axis=1)
        y_train = train_data[[self.config.target_column]]
        y_test = test_data[[self.config.target_column]]

        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

        # Create LightGBM Dataset objects
        # It automatically detects 'category' dtype columns
        lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False) # Keep raw data if needed later
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)

        # Define model parameters
        params = {
            'objective': self.config.objective,  
            'metric': self.config.metric,              
            'boosting_type': self.config.boosting_type,
            'num_leaves': self.config.num_leaves,
            'learning_rate': self.config.learning_rate,
            'feature_fraction': self.config.feature_fraction,
            'random_state': 42,
            'verbose': -1,  # avoids surpressing training process messages
            'n_estimators': self.config.n_estimators,     
            'n_jobs': -1   # Use all available CPU cores
        }

        # Train the model
        lgbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=1000, # Max rounds
                        valid_sets=[lgb_train, lgb_eval],
                        valid_names=['train', 'eval'],
                        callbacks=[lgb.early_stopping(10), lgb.log_evaluation(period=50)])
        
        return lgbm

In [38]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e

[2025-05-04 23:29:11,807: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-04 23:29:11,811: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-04 23:29:11,815: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-04 23:29:11,817: INFO: common: created directory at: artifacts]
[2025-05-04 23:29:11,819: INFO: common: created directory at: artifacts/model_trainer]
object
X_train shape: (4043329, 26), y_train shape: (4043329, 1)
X_test shape: (212807, 26), y_test shape: (212807, 1)
Training until validation scores don't improve for 10 rounds
[50]	train's l1: 1.84848	eval's l1: 1.68921
[100]	train's l1: 1.47289	eval's l1: 1.48833
[150]	train's l1: 1.40354	eval's l1: 1.45945
[200]	train's l1: 1.38573	eval's l1: 1.45016
[250]	train's l1: 1.37613	eval's l1: 1.44441
[300]	train's l1: 1.37174	eval's l1: 1.4417
[350]	train's l1: 1.36661	eval's l1: 1.43933
[400]	train's l1: 1.36231	eval's l1: 1.43687
Did not meet early stopping. Best iteration