### Data Ingestion

In [2]:
### Document data structure
# A Document pairs a page_content string with a metadata dict.
from langchain_core.documents import Document

# Minimal instance; kept as the cell's last expression so its repr is displayed.
Document(metadata={}, page_content="ok")

Document(metadata={}, page_content='ok')

In [3]:
# Hand-built Document: free-form text plus a few descriptive metadata fields.
doc = Document(
    metadata={
        "source": "example.txt",
        "pages": 1,
        "author": "Anshuman",
        "date_created": "2026-01-27",
    },
    page_content="this is the main content I am using to create a RAG",
)
doc  # last expression -> the notebook shows the repr

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Anshuman', 'date_created': '2026-01-27'}, page_content='this is the main content I am using to create a RAG')

In [4]:
### Prepare the folder that will hold the sample .txt files
import os

# exist_ok=True: re-running this cell is harmless once the folder exists.
os.makedirs("../data/text_files", exist_ok=True)

In [None]:
import os

# Map each sample file path to the text written into it.
# The second entry matches the machine_learning_intro.txt document that the
# recorded DirectoryLoader output lists; without it a fresh run of the
# notebook only produces a single document.
sample_text = {
    '../data/text_files/python_intro.txt': """python programming introduction""",
    '../data/text_files/machine_learning_intro.txt': """Machine learning introduction""",
}
for filepath, content in sample_text.items():
    # Create the parent folder here as well so this cell does not fail with
    # FileNotFoundError when it is executed before the makedirs cell.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

print("Sample txt file created!!!")

Sample txt file created!!!


In [5]:
### TextLoader: read a single text file into LangChain documents
from langchain_community.document_loaders import TextLoader

loader = TextLoader(
    "../data/text_files/python_intro.txt",
    encoding="utf-8",
)
# load() gives back a list of Document objects (see the printed result).
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='python programming introduction')]


In [6]:
### DirectoryLoader: load every matching text file under a folder
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "../data/text_files",
    show_progress=False,
    loader_cls=TextLoader,                # loader class applied to each matched file
    loader_kwargs={"encoding": "utf-8"},  # keyword arguments for loader_cls
    glob="**/*.txt",                      # pattern selecting the files to load
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='python programming introduction'),
 Document(metadata={'source': '../data/text_files/machine_learning_intro.txt'}, page_content='Machine learning introduction')]

In [2]:
### PDF files via DirectoryLoader
from langchain_community.document_loaders import (
    DirectoryLoader,
    PyMuPDFLoader,
    PyPDFLoader,
)

# Load all the PDF files from the directory.
# Note: this rebinds dir_loader, replacing the text-file loader created earlier.
dir_loader = DirectoryLoader(
    "../data/pdf_files",
    show_progress=False,
    loader_cls=PyMuPDFLoader,  # loader class applied to each matched PDF
    glob="**/*.pdf",           # pattern selecting the files to load
)

pdf_documents = dir_loader.load()
# NOTE(review): echoing the list prints every page's full text; the recorded
# output includes personal contact details from the PDF, so consider clearing
# the output before sharing the notebook.
pdf_documents

[Document(metadata={'producer': 'xdvipdfmx (20250410)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-10-03T14:47:38+00:00', 'source': '../data/pdf_files/AnshResume.pdf', 'file_path': '../data/pdf_files/AnshResume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20251003144738Z', 'page': 0}, page_content='Anshuman\nAspiring Full Stack Developer\nanshuman302004@gmail.com | +91 9871980990 | Faridabad, Haryana, India\nlinkedin.com/in/heyansh | github.com/Ansh30a | Portfolio\nSUMMARY\nHighly motivated MERN Stack Developer skilled in building scalable full-stack applications using MongoDB,\nExpress.js, React, and Node.js. Experienced in developing RESTful APIs and integrating third-party services.\nQuick learner with strong problem-solving and debugging skills.\nEXPERIENCE\nMERN Stack Developer Intern\nJun 2025 – Jul 2025\nCodec Technologies\n• Developed full-stack

In [11]:
# Confirm the PDF loader yields Document objects, like the text loaders above.
type(pdf_documents[0])

langchain_core.documents.base.Document

### Embedding and VectorStoreDB

In [7]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    def __init__(self, model_name: str = "all-miniLM-L6-v2"):
        """
        Initialise the embedding manager

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    
    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise