In [5]:
from langchain.document_loaders import DirectoryLoader, TextLoader, CSVLoader, PyPDFLoader 
from pathlib import Path

In [6]:
txt_file_loader = DirectoryLoader(
    "./files", 
    glob="**/*.txt", 
    loader_cls=TextLoader, 
    loader_kwargs={"encoding": "utf-8"}, 
    show_progress=False
)

In [7]:
documents = txt_file_loader.load()

In [8]:
documents

[Document(metadata={'source': 'files/Token.txt'}, page_content='Understanding Tokens and How AI Models Determine Output Length\nWhat is a Token?\nIn the context of Large Language Models (LLMs), a token is the fundamental unit of text that the model processes. A common misconception is that one token equals one word, but this is not always the case.\n\nA token can be:\n\nA full word (e.g., "apple", "run")\n\nA part of a word (e.g., "token", "iza", "tion" for the word "tokenization")\n\nPunctuation (e.g., ".", "!", "?")\n\nA space (e.g., " ")\n\nA special marker (e.g., [CLS] for "classification" or [EOS] for "end of sentence")\n\nThe process of converting a plain text string (like "Hello, world!") into a sequence of tokens is called tokenization. Models use a specific "tokenizer" with a fixed vocabulary. This is why the word "unbelievable" might be split into "un", "believe", and "able". This approach allows the model to handle a vast vocabulary and even words it has never seen before by

In [9]:
csv_file_loader = DirectoryLoader(
    "./files",
    glob = "**/*.csv",
    loader_cls=CSVLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False
)

In [10]:
csv_files = csv_file_loader.load()

In [11]:
csv_files

[Document(metadata={'source': 'files/basic-data.csv', 'row': 0}, page_content='ID: 1\nName: Name_1\nAge: 62\nCountry: Country_1\nEmail: email_1@example.com'),
 Document(metadata={'source': 'files/basic-data.csv', 'row': 1}, page_content='ID: 2\nName: Name_2\nAge: 48\nCountry: Country_2\nEmail: email_2@example.com'),
 Document(metadata={'source': 'files/basic-data.csv', 'row': 2}, page_content='ID: 3\nName: Name_3\nAge: 61\nCountry: Country_3\nEmail: email_3@example.com'),
 Document(metadata={'source': 'files/basic-data.csv', 'row': 3}, page_content='ID: 4\nName: Name_4\nAge: 32\nCountry: Country_4\nEmail: email_4@example.com'),
 Document(metadata={'source': 'files/basic-data.csv', 'row': 4}, page_content='ID: 5\nName: Name_5\nAge: 69\nCountry: Country_5\nEmail: email_5@example.com'),
 Document(metadata={'source': 'files/basic-data.csv', 'row': 5}, page_content='ID: 6\nName: Name_6\nAge: 32\nCountry: Country_6\nEmail: email_6@example.com'),
 Document(metadata={'source': 'files/basic-dat

In [12]:
pdf_files_loader = DirectoryLoader(
    "./files",
    glob = "**/*.pdf",
    loader_cls=PyPDFLoader,
    # loader_kwargs={"encoding": "utf-8"},
    show_progress=False
)

In [13]:
pdf_files = pdf_files_loader.load()

In [14]:
pdf_files

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

In [15]:
pdf_files[0].metadata['source_file'] = './files'

In [16]:
path = Path("./files")
list(path.glob("**/*.pdf"))

[PosixPath('files/NIPS-2017-attention-is-all-you-need-Paper.pdf'),
 PosixPath('files/Cormen Introduction to Algorithms.pdf')]

In [50]:
def process_all_docs(directory):
    documents = [];
    path = Path(directory)
    
    ### for PDF
    pdf_files = list(path.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            document = loader.load()
            
            for doc in document:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
                
            documents.extend(document)
            print(f"Loaded {len(document)} pages")
        except Exception as e:
            print(f"Error: {e}")
    
    ### for CSV

In [47]:
documents = []
# function part 
csv_files = list(path.glob("**/*.csv"))
print(f"Found {len(csv_files)}")
for csv_file in csv_files:
    print(f"\nProcessing: {csv_file.name}")
    try:
        loader = CSVLoader(str(csv_file))
        document = loader.load()
        # print(document)
        for doc in document:
            # print(doc,"\n")
            doc.metadata['source_file'] = csv_file.name
            doc.metadata['file_type'] = 'csv'
            
        documents.extend(document)
        print(f"Loaded {len(document)} pages")
    except Exception as e:
        print(f"Error: {e}")


Found 1

Processing: basic-data.csv
Loaded 100 pages


In [55]:
documents =[]

text_files = list(path.glob("**/*.txt"))
print(f"Found {len(text_files)}")
for text_file in text_files:
    print(f"\nProcessing: {text_file.name}")
    try:
        loader = TextLoader(str(text_file))
        document = loader.load()
        for doc in document:
            doc.metadata['source_file'] = text_file.name
            doc.metadata['file_type'] = 'txt'
        print(text_file,document)
        documents.extend(document);
        print(f"Loaded {len(document)}")
    except Exception as e:
        print(f"Error: {e}")

Found 2

Processing: Token.txt
files/Token.txt [Document(metadata={'source': 'files/Token.txt', 'source_file': 'Token.txt', 'file_type': 'txt'}, page_content='Understanding Tokens and How AI Models Determine Output Length\nWhat is a Token?\nIn the context of Large Language Models (LLMs), a token is the fundamental unit of text that the model processes. A common misconception is that one token equals one word, but this is not always the case.\n\nA token can be:\n\nA full word (e.g., "apple", "run")\n\nA part of a word (e.g., "token", "iza", "tion" for the word "tokenization")\n\nPunctuation (e.g., ".", "!", "?")\n\nA space (e.g., " ")\n\nA special marker (e.g., [CLS] for "classification" or [EOS] for "end of sentence")\n\nThe process of converting a plain text string (like "Hello, world!") into a sequence of tokens is called tokenization. Models use a specific "tokenizer" with a fixed vocabulary. This is why the word "unbelievable" might be split into "un", "believe", and "able". This a

In [31]:
process_all_docs("./files")

Found 2 PDF files to process

Processing: NIPS-2017-attention-is-all-you-need-Paper.pdf
Loaded 11

Processing: Cormen Introduction to Algorithms.pdf
Loaded 1313
Found 1


In [58]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [59]:
class EmbeddingManager:
    """Handles document embedding generation using Sentence Transformer"""
    
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self.load_model()
        
    def load_model(self):
        """Load the Sentence Transformer model"""
        try:
            print(f"Loading embedding model: ", {self.model_name})
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error: {e}")
            raise # re-raising the error
    
    def generate_embeddings(self, texts):
        """
        Generate embeddings for a list texts

        Args:
            texts: List of text string to embed
        
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

In [60]:
em = EmbeddingManager()


Loading embedding model:  {'all-MiniLM-L6-v2'}
Model loaded successfully. Embedding dimension: 384


In [18]:
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
load_dotenv()

class GeminiLLM:
    def __init__(self, api_key=None, model_name="gemini-2.5-flash-lite"):
        """
        Initialize Google Gemini LLM

        Args:
            api_key (str, optional): Google Gemini API Key Defaults to "None"
            model_name (str, optional): Good model  Defaults to "gemini-2.5-flash-lite".
        """
        self.model = model_name
        self.api_key = api_key or os.environ.get("GOOGLE_API_KEY")
        if not self.api_key:
            raise ValueError("Google Gemini API Key is required. Set GOOGLE_API_KEY environment variable")
        
        self.llm = ChatGoogleGenerativeAI(
            google_api_key=self.api_key,
            temperature=0.1,
            model=self.model,
            model_kwargs={"max_token": 1024}
        )
        
        print(f"Initialized Google Gemini LLM with model: {self.model}")
    
    def query(self, query):
        response = self.llm.invoke(query)
        print(response)

In [19]:
gemini_llm = GeminiLLM()

Initialized Google Gemini LLM with model: gemini-2.5-flash-lite


In [20]:
gemini_llm.query("tell me about programming in computer")

content='Programming in computers is the process of **creating instructions that tell a computer what to do**. These instructions are written in a **programming language**, which is a set of rules and syntax that humans can understand and that computers can translate into machine code they can execute.\n\nThink of it like giving a recipe to a chef. The recipe is the program, the ingredients are the data, and the chef is the computer. The chef follows the instructions precisely to create the final dish.\n\nHere\'s a breakdown of key aspects of programming in computers:\n\n**1. What is a Program?**\n\n*   A program is a sequence of instructions designed to perform a specific task or set of tasks.\n*   These tasks can range from simple calculations to complex operations like running an operating system, browsing the web, playing games, or controlling robots.\n\n**2. Programming Languages:**\n\n*   **High-Level Languages:** These are closer to human language and are easier to read and writ

In [6]:
from src.rag_retriever import RAGRetriever
import time

In [None]:
class RAGPipeline:
    def __init__(self, retriever):
        self.retriever = retriever
        self.llm = GeminiLLM()
        self.history = []
        
    def query(self, question, top_k=5, min_score=0.2, stream=False, summarize=False):
        results = self.retriever.retrieve(query = question, top_k=top_k, score_threshold=min_score)
        if not results:
            answer = "No relevant context found." 
            sources = []
            context = "" 
        else :
            context = "\n\n".join([doc["content"] for doc in results])
            sources = [{
                "source": doc["metadata"].get("source_file", doc["metadata"].get("source", "unknown")),
                "page": doc["metadata"].get("page", "unknown"),
                "score": doc["similarity_score"],
                "preview": doc["content"][:120] + "...."
            } for doc in results]
            
            # Streaming answer simulation
            
            prompt= f"""Use the following context to answer the question concisely. \nContext:\n{context}\n\nQuestion:{question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end="", flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer=response.content
            print(answer)
        # Add citations to answer
        