In [1]:
import os
# Move from the notebook's folder to its parent so project-relative paths resolve.
# The starting directory is remembered on first execution so that re-running this
# cell in the same kernel does not keep climbing one directory higher each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [2]:
# Show the current working directory to confirm the chdir in the previous cell took effect.
%pwd

'c:\\Users\\anuru\\Desktop\\MyPro\\Medical-Chatbot-RAG-System'

### Entities

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage (read by DataIngestion)."""
    # Stage directory; ConfiguraitonManager.get_data_ingestion_config passes it to create_directories.
    root_dir: Path
    # Destination directory that DataIngestion.copy_data copies PDF files into.
    data_path: Path
    # Source directory that DataIngestion.copy_data lists for files to copy.
    original_data_path: Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation stage (read by DataValidation)."""
    # Stage directory; ConfiguraitonManager.get_data_validation_config passes it to create_directories.
    root_dir: Path
    # Destination directory that DataValidation.data_validation copies files into.
    data_path: Path
    # Source directory that DataValidation.data_validation lists for files to copy.
    data_ingestion_path: Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage (read by DataTransformation)."""
    # Stage directory; ConfiguraitonManager.get_data_transformation_config passes it to create_directories.
    root_dir: Path
    # Directory handed to PyPDFDirectoryLoader in DataTransformation.load_pdf_documents.
    data_validation_path: Path
    # Location the vector store is dumped to and later loaded from.
    vectorstore_path: Path
    # Passed as model_name to HuggingFaceEmbeddings.
    embedding_model_name: str
    # chunk_size argument for RecursiveCharacterTextSplitter (taken from params CHUNK_SIZE).
    chunk_size: int
    # chunk_overlap argument for RecursiveCharacterTextSplitter (taken from params CHUNK_OVERLAP).
    chunk_overlap: int

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model stage (read by ModelTrainer)."""
    # Stage directory; ConfiguraitonManager.get_model_trainer_config passes it to create_directories.
    root_dir: Path
    # NOTE(review): not read by ModelTrainer in this notebook; the store location it actually
    # uses comes from the data-transformation config. Confirm whether this field is still needed.
    vectorstore_path: Path
    # Passed to init_chat_model to select the chat model.
    model_name: str
    

### Configuration Manager

In [4]:
from MedicalChatbot.constants import *
from MedicalChatbot.utils import read_yaml, create_directories

In [5]:
class ConfiguraitonManager:
    """Read the YAML config and params files and build per-stage config objects.

    Each ``get_*_config`` method reads one section of the config file, asks
    ``create_directories`` to prepare that stage's ``root_dir``, and returns
    the matching frozen dataclass.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and prepare the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the settings for the data-ingestion stage."""
        section = self.config.data_ingestion
        create_directories([section.root_dir])
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            original_data_path=Path(section.original_data_path),
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Return the settings for the data-validation stage."""
        section = self.config.data_validation
        create_directories([section.root_dir])
        return DataValidationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            data_ingestion_path=Path(section.data_ingestion_path),
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Return the settings for the data-transformation stage.

        Chunking values come from the params file; everything else comes from
        the ``data_transformation`` section of the config file.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_validation_path=Path(section.data_validation_path),
            vectorstore_path=Path(section.vectorstore_path),
            embedding_model_name=section.embedding_model_name,
            chunk_size=self.params.CHUNK_SIZE,
            chunk_overlap=self.params.CHUNK_OVERLAP,
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Return the settings for the model stage."""
        section = self.config.model_trainer
        create_directories([section.root_dir])
        return ModelTrainerConfig(
            root_dir=Path(section.root_dir),
            vectorstore_path=Path(section.vectorstore_path),
            model_name=section.model_name,
        )



### Components

In [6]:
import os
from MedicalChatbot.logging import logger
import shutil

In [21]:
class DataIngestion:
    """Copy the source PDF files into the data-ingestion artifact directory."""

    def __init__(self, config: DataIngestionConfig):
        """Store the stage configuration.

        Args:
            config: Paths for the ingestion stage.
        """
        self.config = config

    def download_data(self):
        """Placeholder step: nothing is downloaded; a success message is printed and logged."""
        print("Data downloaded successfully")
        logger.info("Data downloaded successfully...")

    def copy_data(self):
        """Copy every regular ``*.pdf`` file from the source directory to the data directory.

        The extension check is case-insensitive. Entries without a ``.pdf``
        extension and entries that are not regular files (for example a
        directory whose name ends in ``.pdf``) are skipped.

        Raises:
            OSError: If the source directory cannot be listed or a copy fails.
        """
        source_dirpath = self.config.original_data_path
        destination_dirpath = self.config.data_path
        create_directories([destination_dirpath])
        for file in os.listdir(source_dirpath):
            source_file = os.path.join(source_dirpath, file)
            # splitext only reports a real extension, so a file literally named
            # "pdf" (no dot) is no longer mistaken for a PDF document.
            has_pdf_extension = os.path.splitext(file)[1].lower() == ".pdf"
            # shutil.copy cannot copy directories; skip anything that is not a file.
            if has_pdf_extension and os.path.isfile(source_file):
                shutil.copy(source_file, destination_dirpath)
        logger.info("Data copied successfully...")


class DataValidation:
    """Promote ingested files into the data-validation artifact directory."""

    def __init__(self, config: DataValidationConfig):
        """Store the stage configuration.

        Args:
            config: Paths for the validation stage.
        """
        self.config = config

    def data_validation(self):
        """Copy every entry of the ingestion directory into the validation data directory.

        No content checks are performed here; each listed entry is copied as is.
        """
        ingested_dir = self.config.data_ingestion_path
        validated_dir = self.config.data_path
        create_directories([validated_dir])
        for entry_name in os.listdir(ingested_dir):
            entry_path = os.path.join(ingested_dir, entry_name)
            shutil.copy(entry_path, validated_dir)
        logger.info("Data validation completed...")


from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

class DataTransformation:
    """Load PDFs, split them into chunks, and build or reload the vector store."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Paths, embedding model name and chunking parameters.
        """
        self.config = config
        # Embedding model instance, built on first use and then reused.
        self._embedding = None

    def _get_embedding_model(self):
        """Return the HuggingFace embedding model, constructing it only once per instance."""
        # getattr keeps this safe for instances created without running __init__.
        if getattr(self, "_embedding", None) is None:
            self._embedding = HuggingFaceEmbeddings(model_name=self.config.embedding_model_name)
        logger.info("Embedding model retrieved...")
        return self._embedding

    def _get_inmemory_vectorstore(self, embedding):
        """Create a new, empty in-memory vector store backed by ``embedding``."""
        vector_store = InMemoryVectorStore(embedding)
        logger.info("Vector store retrieved...")
        return vector_store

    def load_pdf_documents(self):
        """Load the PDFs found in the validated-data directory.

        Returns:
            The documents produced by ``PyPDFDirectoryLoader.load()``.
        """
        loader = PyPDFDirectoryLoader(self.config.data_validation_path)
        docs = loader.load()
        logger.info("Documents are loaded...")
        return docs

    def split_pdf_loaded_documents(self, docs):
        """Split loaded documents into chunks using the configured size and overlap.

        Args:
            docs: Documents as returned by ``load_pdf_documents``.

        Returns:
            The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap,
            add_start_index=True
        )
        all_splits = text_splitter.split_documents(docs)
        logger.info("PDF loaded docs are splitted...")
        return all_splits

    def create_and_save_vectorstore(self, all_splits):
        """Embed the chunks into a new vector store and dump it to ``vectorstore_path``.

        Args:
            all_splits: Chunks as returned by ``split_pdf_loaded_documents``.

        Returns:
            The populated in-memory vector store.
        """
        embeddings = self._get_embedding_model()
        vector_store = self._get_inmemory_vectorstore(embeddings)
        # The returned document ids are not needed here.
        vector_store.add_documents(all_splits)
        vector_store.dump(self.config.vectorstore_path)
        logger.info("Vector store created and stored...")
        return vector_store

    def get_saved_vectorsote(self):
        """Load the vector store previously dumped to ``vectorstore_path``.

        Returns:
            The vector store loaded from disk, bound to the embedding model.
        """
        vector_store = InMemoryVectorStore.load(self.config.vectorstore_path, self._get_embedding_model())
        logger.info("Vector store retrieved...")
        return vector_store

    # Correctly spelled alias; the misspelled name above is kept for existing callers.
    get_saved_vectorstore = get_saved_vectorsote
    

from langchain.agents.middleware import dynamic_prompt, ModelRequest
from langchain.agents import create_agent
from langchain.chat_models import init_chat_model

class ModelTrainer:
    """Answer user queries with a chat model grounded on retrieved document chunks.

    No training happens in this class: it initialises a chat model, retrieves
    similar chunks from the saved vector store, and sends them to an agent as
    the system message.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Initialise the chat model and the data-transformation helper.

        Args:
            config: Settings for this stage; ``model_name`` selects the chat model.
        """
        self.config = config
        self.data_transformation = self._get_data_transformation_object()
        self.model = self._get_model()
        # Vector store loaded from disk on the first query and reused afterwards.
        self._vector_store = None

    def _get_model(self):
        """Initialise the chat model named in the configuration."""
        model = init_chat_model(self.config.model_name)
        logger.info("LLM model is retrieved...")
        return model

    def _get_data_transformation_object(self):
        """Build a DataTransformation from a freshly created configuration manager."""
        config_manager = ConfiguraitonManager()
        data_transformation_obj = DataTransformation(config_manager.get_data_transformation_config())
        return data_transformation_obj

    def _get_vector_store(self):
        """Return the saved vector store, loading it from disk only once per instance.

        Loading re-reads the dumped store and needs the embedding model, so doing
        it for every query is wasteful. Create a new ModelTrainer to pick up a
        vector store that was rebuilt on disk.
        """
        # getattr keeps this safe for instances created without running __init__.
        if getattr(self, "_vector_store", None) is None:
            self._vector_store = self.data_transformation.get_saved_vectorsote()
        return self._vector_store

    def generate_system_message(self, user_query):
        """Build the system prompt containing the chunks most similar to the query.

        Args:
            user_query: The user's question, used for the similarity search.

        Returns:
            The system message text with the retrieved chunk contents appended.
        """
        vector_store = self._get_vector_store()
        retrieved_docs = vector_store.similarity_search(user_query)
        docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
        system_message = (
            "You are a helpful assistant. Use the following context in your response:"
            f"\n\n{docs_content}"
        )
        logger.info("System prompt is generated...")
        return system_message

    def get_agent(self):
        """Create an agent around the chat model."""
        agent = create_agent(self.model)
        logger.info("Agent is created...")
        return agent

    def get_result(self, query, agent):
        """Run the agent on the query and return the text of its last message.

        Args:
            query: The user's question.
            agent: An agent as returned by ``get_agent``.

        Returns:
            The ``content`` of the final message in the agent's response.
        """
        response = agent.invoke({"messages": [
            {"role": "system", "content": self.generate_system_message(query)},
            {"role": "user", "content": query}
        ]})

        client_result = response["messages"][-1].content
        return client_result

    

### Pipeline

In [8]:
import sys
from MedicalChatbot.exception import CustomException

In [9]:
# Stage 1 - data ingestion: build the stage config and copy the source PDFs
# into the ingestion artifact directory.
try:
    config = ConfiguraitonManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(data_ingestion_config)
    data_ingestion.copy_data()
except Exception as e:
    # Re-raise as the project's CustomException, passing the original error and sys.
    raise CustomException(e,sys)

[2026-02-10 11:17:17,866: INFO: __init__: Directory: artifacts is created:]
[2026-02-10 11:17:17,869: INFO: __init__: Directory: artifacts/data_ingestion is created:]
[2026-02-10 11:17:17,871: INFO: __init__: Directory: artifacts\data_ingestion\data is created:]
[2026-02-10 11:17:17,887: INFO: 1233119677: Data copied successfully...:]


In [10]:
# Stage 2 - data validation: build the stage config and copy the ingested files
# into the validation artifact directory.
try:
    config = ConfiguraitonManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(data_validation_config)
    data_validation.data_validation()
except Exception as e:
    # Re-raise as the project's CustomException, passing the original error and sys.
    raise CustomException(e,sys)

[2026-02-10 11:17:17,915: INFO: __init__: Directory: artifacts is created:]
[2026-02-10 11:17:17,919: INFO: __init__: Directory: artifacts/data_validation is created:]
[2026-02-10 11:17:17,922: INFO: __init__: Directory: artifacts\data_validation\train is created:]
[2026-02-10 11:17:17,984: INFO: 1233119677: Data validation completed...:]


In [11]:
# Stage 3 - data transformation: load the validated PDFs, split them into chunks,
# and make sure a persisted vector store exists for the query stage.
try:
    config = ConfiguraitonManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)
    docs = data_transformation.load_pdf_documents()
    all_splits = data_transformation.split_pdf_loaded_documents(docs)
    # Embedding every chunk is expensive, so build the store only when it has not
    # been persisted yet. Without this, a fresh checkout never creates the store
    # and the query stage fails when it tries to load it. Delete the saved store
    # to force a rebuild after the PDFs change.
    if not data_transformation_config.vectorstore_path.exists():
        vector_store = data_transformation.create_and_save_vectorstore(all_splits)
except Exception as e:
    # Re-raise as the project's CustomException, passing the original error and sys.
    raise CustomException(e,sys)

[2026-02-10 11:17:18,008: INFO: __init__: Directory: artifacts is created:]
[2026-02-10 11:17:18,013: INFO: __init__: Directory: artifacts/data_transformation is created:]
[2026-02-10 11:17:40,195: INFO: 1233119677: Documents are loaded...:]
[2026-02-10 11:17:40,297: INFO: 1233119677: PDF loaded docs are splitted...:]


In [22]:
# Stage 4 - question answering: build the model stage, create the agent, and
# answer one sample query using context retrieved from the saved vector store.
try:
    config = ConfiguraitonManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(model_trainer_config)
    agent = model_trainer.get_agent()

    # Sample question used to exercise the retrieval + chat-model flow end to end.
    user_query = "What is IBD ?"
    result = model_trainer.get_result(user_query,agent)
    print(result)
except Exception as e:
    # Re-raise as the project's CustomException, passing the original error and sys.
    raise CustomException(e,sys)

[2026-02-10 11:32:46,533: INFO: __init__: Directory: artifacts is created:]
[2026-02-10 11:32:46,534: INFO: __init__: Directory: artifacts/model_trainer is created:]
[2026-02-10 11:32:46,541: INFO: __init__: Directory: artifacts is created:]
[2026-02-10 11:32:46,543: INFO: __init__: Directory: artifacts/data_transformation is created:]
[2026-02-10 11:32:47,018: INFO: 2818467830: LLM model is retrieved...:]
[2026-02-10 11:32:47,021: INFO: 2818467830: Agent is created...:]
[2026-02-10 11:32:47,023: INFO: SentenceTransformer: Use pytorch device_name: cpu:]
[2026-02-10 11:32:47,025: INFO: SentenceTransformer: Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2:]
[2026-02-10 11:32:47,717: INFO: _client: HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect":]
[2026-02-10 11:32:47,921: INFO: _client: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transf

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mMPNetModel LOAD REPORT[0m from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


[2026-02-10 11:32:51,571: INFO: _client: HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect":]
[2026-02-10 11:32:51,710: INFO: _client: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-mpnet-base-v2/e8c3b32edf5434bc2275fc9bab85f82640a19130/config.json "HTTP/1.1 200 OK":]
[2026-02-10 11:32:52,007: INFO: _client: HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/tokenizer_config.json "HTTP/1.1 307 Temporary Redirect":]
[2026-02-10 11:32:52,137: INFO: _client: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-mpnet-base-v2/e8c3b32edf5434bc2275fc9bab85f82640a19130/tokenizer_config.json "HTTP/1.1 200 OK":]
[2026-02-10 11:32:52,532: INFO: _client: HTTP Request: GET https://huggingface.co/api/models/sentence-transformers/all-mpnet-base-v2/tree/main/additional_chat_templates?r

In [23]:
# Inspect the concrete type of the object returned by get_agent() in the previous cell.
type(agent)

langgraph.graph.state.CompiledStateGraph