### Data Ingestion


In [2]:
from langchain_core.documents import Document

In [3]:
doc = Document(
    page_content="Main text content i am using to make the RAG",
    metadata={
        "source": "example.txt",
        "author": "Akshat",
        "page": 1,
        "date_created": "2025-10-01",
        })

In [4]:
doc

Document(metadata={'source': 'example.txt', 'author': 'Akshat', 'page': 1, 'date_created': '2025-10-01'}, page_content='Main text content i am using to make the RAG')

In [5]:
## creating a txt file to test the document class

import os
os.makedirs("../data/text_files", exist_ok=True)

In [6]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [8]:
### Reading the files using textLoaded 

from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
data = loader.load()
print(data)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [11]:
### Directory loader

from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader("../data/text_files", glob="**/*.txt", loader_cls=TextLoader, loader_kwargs={"encoding":"utf-8"}, show_progress=True)
dir_data = dir_loader.load()
print(dir_data)

100%|██████████| 2/2 [00:00<00:00, 1294.34it/s]

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '), Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popul




In [14]:
### Directory loader

from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader

dir_loader = DirectoryLoader("../data/pdf", glob="**/*.pdf", loader_cls=PyMuPDFLoader,show_progress=True)
pdf_data = dir_loader.load()
print(pdf_data)

100%|██████████| 2/2 [00:00<00:00, 19.00it/s]

[Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_path': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'total_pages': 17, 'format': 'PDF 1.4', 'title': 'Akshat Tyagi Final Internship Report 8th Sem', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Professional and Practical Internship Report \nSubmitted in partial fulfillment of the requirements for the award of \nthe \nBachelor of Technology in \nSchool of Computer Science, Engineering & Technology \nBennett University \n \n \nBy \n \nAkshat Tyagi \nRoll Number: E21CSEU0165'), Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\Akshat Tyagi Final Internship Report 8th Sem.pdf', 'file_path': '..\\data\\pdf\\Akshat Ty




In [17]:
type(pdf_data[0])

langchain_core.documents.base.Document

### Embeddings and vector store db

In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
class EmbeddingManager:
    """Manages embeddings and similarity search using ChromaDB and Sentence Transformers."""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        """Load the sentence transformer model."""
        try:
            print(f"Loading model '{self.model_name}'...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model '{self.model_name}' loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model '{self.model_name}': {e}")
            raise
        
    def generate_embedding(self, text: List[str]) -> np.ndarray:
        """Generate embedding for a given text."""
        if not self.model:
            raise ValueError("Model not loaded.")

        print(f"Generating embeddings for {len(text)} texts...")
        embeddings= self.model.encode(text, show_progress_bar=True)
        print("Embeddings generated.")
        return embeddings
    
    def get_embedding_dimension(self) -> int:
        """Get the dimension of the embeddings."""
        if not self.model:
            raise ValueError("Model not loaded.")
        return self.model.get_sentence_embedding_dimension()
    
    
    
## Initialize the EmbeddingManager
embedding_manager = EmbeddingManager()
embedding_manager

Loading model 'all-MiniLM-L6-v2'...
Model 'all-MiniLM-L6-v2' loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x23da4af4440>