In [1]:
# Show the kernel's current working directory (IPython line magic).
%pwd

'c:\\Users\\NARINDER\\Desktop\\Local Chatbot\\research'

In [2]:
import os
# The notebook starts inside research/, while the `src` package and the
# relative config paths used below are resolved from the parent directory.
# Step up only while still inside research/, so that re-running this cell
# does not climb one more level each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [3]:
# Re-check the working directory after the chdir cell; it should now be the project root.
%pwd

'c:\\Users\\NARINDER\\Desktop\\Local Chatbot'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class PrepareBaseModelConfig:
    """Immutable bundle of settings for the base-model preparation step."""

    # Directory belonging to this step.
    root_path: Path
    # Directory expected to hold the base model.
    base_model_path: Path
    # Directory that receives the prepared ("updated") model.
    updated_base_model_path: Path
    # Model identifier used when the model has to be downloaded.
    params_model_name: str


In [None]:
from src.chatbot.constants import *
from src.chatbot.utils.common import read_yaml, create_directory

class ConfigurationManager:
    """
    Load the configuration and parameter YAML files and build the
    per-step configuration objects from them.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """
        Read both YAML files and create the artifact root directory.

        Args:
            config_filepath: location of the configuration YAML.
            params_filepath: location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        # `artifact_root` already holds the directory path. Indexing the config
        # with that value (config[config.artifact_root]) would look up an entry
        # named after the path instead of handing the path to create_directory.
        create_directory([self.config.artifact_root])

    def get_prepare_base_model_config(self) -> PrepareBaseModelConfig:
        """
        Build the settings object for the base-model preparation step.

        Creates the step's root directory, then wraps the configured paths in
        `Path` objects and takes the model name from the parameters file.

        Returns:
            PrepareBaseModelConfig populated from the `prepare_base_model`
            section of the config and `model_name` from the params.
        """
        config = self.config.prepare_base_model
        create_directory([config.root_dir])
        prepare_base_model_config = PrepareBaseModelConfig(
            root_path=Path(config.root_dir),
            base_model_path=Path(config.base_model_path),
            updated_base_model_path=Path(config.updated_base_model_path),
            params_model_name=self.params.model_name,
        )
        return prepare_base_model_config

In [9]:
from pathlib import Path
import shutil
import json
from typing import Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_cpp import Llama
from src.chatbot import logger

class PrepareBaseModel:
    """
    Make a language model available locally for the rest of the pipeline.

    - get_base_model: ensures `base_model_path` holds a model (reuse or download).
    - update_base_model: ensures `updated_base_model_path` holds a model
      (reuse, copy from the base path, or download as a fallback).
    - save_model: saves tokenizer and model in HF format.
    """

    META_NAME = "prepare_metadata.json"

    # Weight-file patterns that mark a directory as holding a model.
    # "*.bin" already covers "pytorch_model*.bin"; "*.safetensors" is the other
    # Hugging Face weight format; "*.gguf" is the format llama_cpp loads, so a
    # directory filled by get_base_model is recognised on the next run.
    _WEIGHT_PATTERNS = ("*.bin", "*.safetensors", "*.gguf")

    # Annotations naming types that are not defined in this cell are quoted so
    # that defining the class does not require them to be in scope.
    def __init__(self, config: "PrepareBaseModelConfig"):
        """
        Store the configured locations and create the step's root directory.

        Args:
            config: object exposing `root_path`, `base_model_path`,
                `updated_base_model_path` and `params_model_name`.
        """
        self.config = config
        self.base_model_path: Path = Path(config.base_model_path)
        self.updated_base_model_path: Path = Path(config.updated_base_model_path)
        self.params_model_name: str = config.params_model_name
        Path(config.root_path).mkdir(parents=True, exist_ok=True)

    def _is_valid_model_dir(self, p: Path) -> bool:
        """
        Heuristically decide whether `p` is a directory that holds a model.

        Returns True when `p` is an existing directory that contains
        `config.json`, a weight file matching `_WEIGHT_PATTERNS`, or any
        `tokenizer*` / `vocab*` file. Only the top level of `p` is inspected.
        """
        if not p.is_dir():
            return False
        if (p / "config.json").exists():
            return True
        if any(any(p.glob(pattern)) for pattern in self._WEIGHT_PATTERNS):
            return True
        # A directory holding only tokenizer/vocab files also counts.
        # ("tokenizer*" covers tokenizer_config.json.)
        return any(p.glob("tokenizer*")) or any(p.glob("vocab*"))

    def _write_metadata(self, source: str, status: str, extra: Optional[dict] = None):
        """
        Write a small JSON record next to the updated model.

        The record holds `source` and `status` plus any `extra` entries and is
        stored as `META_NAME` inside `updated_base_model_path`. Failures are
        logged and not raised: the metadata is best-effort.
        """
        meta = {"source": source, "status": status}
        if extra:
            meta.update(extra)
        try:
            self.updated_base_model_path.mkdir(parents=True, exist_ok=True)
            with open(self.updated_base_model_path / self.META_NAME, "w", encoding="utf-8") as f:
                json.dump(meta, f, indent=2)
        except Exception:
            logger.exception("Failed to write metadata to %s", self.updated_base_model_path)

    def _copy_dir(self, src: Path, dst: Path):
        """
        Replace `dst` with a copy of the directory `src`.

        An existing `dst` directory is removed first. When both paths resolve
        to the same location nothing is done, because removing `dst` would
        delete the source before it could be copied.
        """
        if Path(src).resolve() == Path(dst).resolve():
            return
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst)

    def _download_from_hub(self, model_name: str, dst: Path):
        """
        Load tokenizer and causal-LM `model_name` with transformers and save
        both into `dst` (created if missing).
        """
        dst.mkdir(parents=True, exist_ok=True)
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        tok.save_pretrained(dst)
        model.save_pretrained(dst)

    def get_base_model(self) -> Path:
        """
        Ensure there is a base model available at `self.base_model_path`.
        Behavior:
          - If base_model_path is valid, return it.
          - Else if params_model_name provided, download model into base_model_path and return it.
          - Else raise FileNotFoundError.

        Returns:
            The resolved `base_model_path`.

        Raises:
            FileNotFoundError: no valid directory exists and the download was
                not possible; a failed download is attached as `__cause__`.
        """
        # 1. If base exists and valid -> return
        if self._is_valid_model_dir(self.base_model_path):
            logger.info("Base model already present at %s", self.base_model_path)
            return self.base_model_path.resolve()

        download_error = None

        # 2. Try to download from hub if model name provided
        if self.params_model_name:
            logger.info("Downloading base model '%s' into %s", self.params_model_name, self.base_model_path)
            try:
                # Ensure directory exists
                self.base_model_path.mkdir(parents=True, exist_ok=True)

                # Use llama_cpp to fetch the model; the loaded object itself is
                # not needed here, only the files placed in local_dir.
                # `params_model_filename` is not assigned anywhere in this
                # class, so it is None unless a caller sets it on the instance.
                Llama.from_pretrained(
                    repo_id=self.params_model_name,
                    filename=getattr(self, "params_model_filename", None),
                    local_dir=str(self.base_model_path),
                )

                logger.info("Successfully loaded base model via llama_cpp into %s", self.base_model_path)
                return self.base_model_path.resolve()
            except Exception as exc:
                logger.exception("Failed to download base model '%s'", self.params_model_name)
                download_error = exc

        # 3. Nothing worked -> error (keep the download failure as the cause)
        msg = f"No valid base model found at {self.base_model_path} and no hub model succeeded."
        logger.error(msg)
        raise FileNotFoundError(msg) from download_error

    def update_base_model(self) -> Path:
        """
        Ensure updated_base_model_path contains a usable model.
        Behavior:
          - If updated path already valid -> noop and return.
          - Else, attempt to copy from base_model_path -> updated_base_model_path.
          - If copy fails and params_model_name present, download to updated path.
          - On success write metadata and return updated path.

        Returns:
            The resolved `updated_base_model_path`.

        Raises:
            FileNotFoundError: neither the copy nor the fallback download
                succeeded; the last failure is attached as `__cause__`.
        """
        # If already valid, return early
        if self._is_valid_model_dir(self.updated_base_model_path):
            logger.info("Updated model already present at %s", self.updated_base_model_path)
            self._write_metadata(source=str(self.updated_base_model_path), status="already_present")
            return self.updated_base_model_path.resolve()

        last_error = None

        # Try copying from base_model_path
        try:
            base_path = self.get_base_model()  # will raise if not available
            logger.info("Copying base model from %s to %s", base_path, self.updated_base_model_path)
            self._copy_dir(base_path, self.updated_base_model_path)
            self._write_metadata(source=str(base_path), status="copied")
            return self.updated_base_model_path.resolve()
        except Exception as e:
            logger.warning("Copy failed: %s", e)
            last_error = e

        # Try downloading directly into updated path as fallback
        if self.params_model_name:
            try:
                logger.info("Downloading model '%s' into %s as fallback", self.params_model_name, self.updated_base_model_path)
                self._download_from_hub(self.params_model_name, self.updated_base_model_path)
                self._write_metadata(source=self.params_model_name, status="downloaded")
                return self.updated_base_model_path.resolve()
            except Exception as e:
                logger.exception("Failed to download model '%s' into %s", self.params_model_name, self.updated_base_model_path)
                last_error = e

        # Final failure
        msg = (
            f"Could not prepare updated base model. Checked copy from {self.base_model_path} "
            f"and download of {self.params_model_name} into {self.updated_base_model_path}."
        )
        logger.error(msg)
        raise FileNotFoundError(msg) from last_error

    @staticmethod
    def save_model(path: Path, model: "AutoModelForCausalLM", tokenizer: "AutoTokenizer"):
        """
        Save tokenizer and model in HF format to the given path.

        The directory is created if it does not exist.
        """
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        tokenizer.save_pretrained(path)
        model.save_pretrained(path)

In [None]:
# Build the step configuration, then make sure both the base model and the
# updated model are available locally. There is deliberately no try/except:
# a handler that only re-raises adds nothing, and any failure should surface
# with its original traceback.
config = ConfigurationManager()
prepare_base_model_config = config.get_prepare_base_model_config()
prepare_base_model = PrepareBaseModel(config=prepare_base_model_config)
prepare_base_model.get_base_model()
prepare_base_model.update_base_model()

[2025-10-21 11:59:03,071: INFO: common]: Reading YAML file from config\config.yaml
[2025-10-21 11:59:03,073: INFO: common]: Reading YAML file from params.yaml
[2025-10-21 11:59:03,074: INFO: common]: Directory created at: artifacts


TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union