In [1]:
import os

In [2]:
%pwd

'e:\\Amazon-Data-Science-Book\\research'

In [3]:
# Move from the notebook folder up to the project root so that relative
# paths resolve from there. The guard keeps the cell idempotent: re-running
# it once the kernel is already at the root must not climb one directory
# further up.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'e:\\Amazon-Data-Science-Book'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    """Immutable settings for the model-training stage.

    The field set matches the keyword arguments that
    ``ConfigurationManager.get_training_config`` passes when it builds this
    object, so that call succeeds against this definition.
    """

    # Directory that holds the training-stage outputs.
    root_dir: Path
    # File the fitted model is serialized to.
    trained_model_path: Path
    # CSV file the training data is read from.
    training_data: Path
    # Values taken from params.EPOCHS / params.BATCH_SIZE.
    params_epochs: int
    params_batch_size: int
    # Value taken from params.LEARNING_RATE.
    learning_rate: float
    # TfidfVectorizer settings: vocabulary cap and (min_n, max_n) n-gram range.
    max_features: int
    ngram_range: tuple



@dataclass(frozen=True)
class PrepareCallbacksConfig:
    """Immutable (frozen) record of three filesystem paths.

    NOTE(review): nothing in the visible notebook constructs or reads this
    class; it looks like a leftover from another pipeline stage. Confirm
    whether it is still needed.
    """

    # presumably the working directory of a callbacks stage; TODO confirm
    root_dir: Path
    # presumably the root folder for TensorBoard logs; TODO confirm
    tensorboard_root_log_dir: Path
    # presumably the file a model checkpoint is written to; TODO confirm
    checkpoint_model_filepath: Path

In [6]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories
import tensorflow as tf

In [7]:

class ConfigurationManager:
    """Load the YAML config/params files and build per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path handed to ``read_yaml`` for the main config.
            params_filepath: Path handed to ``read_yaml`` for the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = Path(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        """Assemble the ``TrainingConfig`` for the training stage.

        Creates the training root directory as a side effect.

        Returns:
            A ``TrainingConfig`` built from the ``training`` section of the
            config file and the values in the params file.
        """
        stage = self.config.training
        hyper = self.params

        # The dataset is expected inside the data-ingestion unzip directory.
        unzip_dir = Path(self.config.data_ingestion.unzip_dir)
        dataset_csv = unzip_dir / "amazon_data_science_books_cleaned.csv"

        create_directories([Path(stage.root_dir)])

        fields = dict(
            root_dir=Path(stage.root_dir),
            trained_model_path=Path(stage.trained_model_path),
            training_data=dataset_csv,
            params_epochs=hyper.EPOCHS,
            params_batch_size=hyper.BATCH_SIZE,
            learning_rate=hyper.LEARNING_RATE,
            max_features=hyper.MAX_FEATURES,
            ngram_range=tuple(hyper.NGRAM_RANGE),
        )
        return TrainingConfig(**fields)


In [8]:
import time

In [9]:
import joblib
import logging
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report
class TrainingCallback:
    """Evaluate a fitted model, log the metrics to a file and keep the best model.

    Args:
        model_path: File the best-scoring model is written to with ``joblib``.
        log_path: File that receives this callback's log records.
    """

    def __init__(self, model_path: Path, log_path: Path):
        # Accept str as well as Path; the methods below rely on Path attributes.
        self.model_path = Path(model_path)
        self.log_path = Path(log_path)

        self._setup_logger()
        self.best_accuracy = 0.0

    def _setup_logger(self):
        """Attach a UTF-8 file handler for ``log_path`` to this module's logger.

        ``logging.basicConfig`` is deliberately not used: it does nothing when
        the root logger already has handlers (for example when the
        application configured logging at import time), in which case the log
        file would never be written. A dedicated handler also avoids
        reconfiguring logging for the whole process.
        """
        self.log_path.parent.mkdir(parents=True, exist_ok=True)
        target = os.path.abspath(os.fspath(self.log_path))

        logger = logging.getLogger(__name__)
        # INFO records must pass even if the root logger is left at WARNING.
        logger.setLevel(logging.INFO)

        # Creating several callbacks (or re-running the cell) must not stack
        # duplicate handlers that would write every record more than once.
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target
            for handler in logger.handlers
        )
        if not already_attached:
            # Explicit UTF-8: the "best model" message contains a non-ASCII
            # character that a platform-default code page may not encode.
            file_handler = logging.FileHandler(target, encoding="utf-8")
            file_handler.setFormatter(
                logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            )
            logger.addHandler(file_handler)

        self.logger = logger

    def on_train_end(self, model, X_test, y_test):
        """Score ``model`` on the given data and persist it if it is the best so far.

        Args:
            model: Fitted estimator exposing ``predict``.
            X_test: Features passed to ``model.predict``.
            y_test: True labels compared against the predictions.
        """
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        self.logger.info(f"Validation Accuracy: {acc:.4f}")
        self.logger.info("\n" + classification_report(y_test, y_pred))

        # Strictly greater: a tie with the previous best does not overwrite it.
        if acc > self.best_accuracy:
            self.best_accuracy = acc
            # joblib.dump does not create missing parent directories.
            self.model_path.parent.mkdir(parents=True, exist_ok=True)
            joblib.dump(model, self.model_path)
            self.logger.info("✅ Best model saved")

In [10]:
import os
import time
import urllib.request as request
from zipfile import ZipFile

import tensorflow as tf


In [11]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable (frozen) settings for the training stage.

    The fields are the keyword arguments that
    ``ConfigurationManager.get_training_config`` passes when building this
    object.
    """

    # Training-stage directory (created by get_training_config).
    root_dir: Path
    # File the fitted model is dumped to by Training.train.
    trained_model_path: Path
    # CSV file read by Training.train.
    training_data: Path
    # Populated from params.EPOCHS / params.BATCH_SIZE / params.LEARNING_RATE.
    # NOTE(review): Training.train in this notebook does not read these three.
    params_epochs: int
    params_batch_size: int
    learning_rate: float
    # Passed to TfidfVectorizer as max_features and ngram_range.
    max_features: int
    ngram_range: tuple


In [12]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


class Training:
    """Fit a TF-IDF + logistic-regression text classifier from a CSV file.

    Args:
        config: Object exposing ``training_data``, ``trained_model_path``,
            ``max_features`` and ``ngram_range``. Other attributes such as
            epochs, batch size or learning rate are not used by this class.
    """

    def __init__(self, config):
        self.config = config

    def train(self, text_column="description", label_column="label"):
        """Train the pipeline, report hold-out accuracy and save the model.

        Args:
            text_column: Name of the CSV column holding the input text.
            label_column: Name of the CSV column holding the target label.

        Raises:
            KeyError: If either column is missing from the CSV. The message
                lists the columns that are available.
        """
        data_raw = pd.read_csv(self.config.training_data)

        # Fail early with an actionable message instead of a bare KeyError.
        missing = [
            name for name in (text_column, label_column)
            if name not in data_raw.columns
        ]
        if missing:
            raise KeyError(
                f"Column(s) {missing} not found in {self.config.training_data}; "
                f"available columns: {list(data_raw.columns)}"
            )

        # TfidfVectorizer rejects NaN documents, and rows with a missing label
        # cannot be stratified, so drop incomplete rows before splitting.
        data_clean = data_raw.dropna(subset=[text_column, label_column])
        X = data_clean[text_column]
        y = data_clean[label_column]

        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
            stratify=y
        )

        model = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features=self.config.max_features,
                ngram_range=self.config.ngram_range
            )),
            ("classifier", LogisticRegression(
                max_iter=1000,
                n_jobs=-1
            ))
        ])

        model.fit(X_train, y_train)

        # Use the held-out 20% so the split is not computed for nothing.
        holdout_accuracy = model.score(X_test, y_test)
        print(f"Hold-out accuracy: {holdout_accuracy:.4f}")

        # joblib.dump does not create missing parent directories.
        Path(self.config.trained_model_path).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(model, self.config.trained_model_path)

        print("✅ Model trained and saved successfully")


In [13]:
# NOTE(review): these imports rebind ConfigurationManager and Training to the
# packaged versions under src/, so the classes defined earlier in this
# notebook are not the ones executed below. The recorded output includes a
# "CSV columns: ..." line that the notebook's Training.train never prints.
from src.cnnClassifier.config.configuration import ConfigurationManager
from src.cnnClassifier.components.training import Training

# Build the training-stage configuration from the YAML files.
config = ConfigurationManager()
training_config = config.get_training_config()

# Run the training stage with that configuration.
trainer = Training(config=training_config)
trainer.train()



[2025-12-19 11:19:13,462: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-12-19 11:19:13,472: INFO: common: yaml file: params.yaml loaded successfully]
[2025-12-19 11:19:13,487: INFO: common: created directory at: artifacts]
[2025-12-19 11:19:13,489: INFO: common: created directory at: artifacts/training]
CSV columns: ['title', 'author', 'price', 'pages', 'avg_reviews', 'n_reviews', 'star5', 'star4', 'star3', 'star2', 'star1', 'dimensions', 'weight', 'language', 'publisher', 'ISBN_13', 'link', 'complete_link']
✅ Model trained and saved successfully
