In [1]:
import os
# Move the working directory up one level. Presumably the notebook lives in a
# subfolder and this makes the `src` package and relative paths such as
# `config/config.yaml` resolve from the project root -- confirm layout.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir('../')

In [2]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable bundle of settings for the model-training stage.

    The dataclass is frozen, so fields cannot be reassigned after
    construction. Instances are built by
    ``ConfigurationManager.get_training_config`` and read by ``Training``.
    """
    # Directory for this stage; created by get_training_config before use.
    root_dir: Path
    # Destination file the fitted model is pickled to.
    trained_model_path: Path
    # Pickle file the base model is loaded from.
    updated_base_model_path: Path
    # CSV file read with pandas to obtain the training table.
    training_data: Path
    # Column labels selected from the training table as model inputs.
    feature_columns: list
    # Column label selected from the training table as the target.
    target_column: str
    # Filled from params N_JOBS; not read by Training in the visible code.
    n_jobs: int
    # Seed forwarded to train_test_split; filled from VAL_SPLIT_RANDOM_STATE.
    random_state: int

In [3]:
from src.MatchAnalysis.constants import *
from src.MatchAnalysis.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Load the project YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_training_config(self):
        """Assemble the ``TrainingConfig`` for the training stage.

        Creates the training root directory as a side effect.

        Returns:
            A ``TrainingConfig`` populated from the ``training``,
            ``prepare_base_model`` and ``prepare_data`` config sections and
            from the ``N_JOBS`` / ``VAL_SPLIT_RANDOM_STATE`` params.
        """
        cfg = self.config
        training_section = cfg.training
        base_model_section = cfg.prepare_base_model
        prepared_data_path = cfg.prepare_data.prepared_data_path

        # The stage directory must exist before anything is written into it.
        stage_root = Path(training_section.root_dir)
        create_directories([stage_root])

        return TrainingConfig(
            root_dir=stage_root,
            trained_model_path=Path(training_section.trained_model_path),
            updated_base_model_path=Path(
                base_model_section.updated_base_model_path
            ),
            training_data=Path(prepared_data_path),
            feature_columns=cfg.prepare_data.feature_columns,
            target_column=cfg.prepare_data.target_column,
            n_jobs=self.params.N_JOBS,
            random_state=self.params.VAL_SPLIT_RANDOM_STATE,
        )

In [5]:
import os
import urllib.request as request
from zipfile import ZipFile
import sklearn
import pickle
import time

from sklearn.model_selection import train_test_split

In [6]:
class Training:
    """Fit a pickled base model on prepared CSV data and save the result.

    The methods are meant to be called in this order, each one storing its
    result on the instance for the next step:
    ``get_base_model`` -> ``get_data`` -> ``train_valid_data_split`` ->
    ``train``.
    """

    def __init__(self, config: TrainingConfig):
        """Keep the training configuration for later steps.

        Args:
            config: Paths, column names and split seed for this stage.
        """
        self.config = config

    def get_base_model(self):
        """Load the base model from ``config.updated_base_model_path``.

        The unpickled object is stored on ``self.model``.

        Raises:
            OSError: If the file cannot be opened.
            pickle.UnpicklingError: If the file is not a valid pickle.
        """
        # NOTE(security): unpickling runs arbitrary code contained in the
        # file. Only load model files produced by this pipeline.
        # The context manager closes the handle even if unpickling fails.
        with open(self.config.updated_base_model_path, 'rb') as file:
            self.model = pickle.load(file)

    def get_data(self):
        """Read the training CSV into ``self.training_data``."""
        self.training_data = pd.read_csv(self.config.training_data)

    def train_valid_data_split(self, test_size=0.2):
        """Split the loaded data into training and validation subsets.

        Stores ``X``/``y`` (full feature table and target) and
        ``train_X``, ``valid_X``, ``train_y``, ``valid_y`` on the instance.

        Args:
            test_size: Forwarded to ``train_test_split``; the share of rows
                held out for validation. Defaults to 0.2.
        """
        self.X = self.training_data[self.config.feature_columns]
        self.y = self.training_data[self.config.target_column]

        # The configured seed keeps the split reproducible between runs.
        self.train_X, self.valid_X, self.train_y, self.valid_y = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.config.random_state,
        )

    def train(self):
        """Fit the model on the training subset and pickle it to disk.

        Only ``train_X``/``train_y`` are used here; the validation subset
        is left on the instance untouched.
        """
        self.model.fit(self.train_X, self.train_y)

        self.save_model(
            self.config.trained_model_path,
            self.model
        )

    @staticmethod
    def save_model(path: Path, model):
        """Pickle ``model`` to ``path``, overwriting any existing file.

        Args:
            path: Destination file.
            model: Object to serialize.
        """
        with open(path, 'wb') as f:
            pickle.dump(model, f)

In [7]:
# Run the training stage end to end. Any exception propagates unchanged, so
# the notebook shows the original traceback; a wrapper that only re-raises
# the caught exception would add nothing but an extra traceback frame.
config = ConfigurationManager()
training_config = config.get_training_config()
training = Training(training_config)
training.get_base_model()
training.get_data()
training.train_valid_data_split()
training.train()

[2024-01-30 18:28:14,581: INFO: common] yaml file: config/config.yaml loaded successfully
[2024-01-30 18:28:14,583: INFO: common] yaml file: params.yaml loaded successfully
[2024-01-30 18:28:14,583: INFO: common] Creating directory: artifacts
[2024-01-30 18:28:14,584: INFO: common] Creating directory: artifacts/training
