## Naive Rag : dense retrieval

### lets use ollama

In [None]:
!pip install ollama chromadb PyPDF2

In [1]:
import ollama
import chromadb
import PyPDF2
import os
import re
import uuid



In [2]:
def extract_pdf_text(pdf_path):
    """Extract text from a PDF file - simple version"""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
    except:
        print(f"Could not read {pdf_path}")
    return text

pdf_file = "1706.03762v7.pdf"  
if os.path.exists(pdf_file):
    raw_text = extract_pdf_text(pdf_file)
    print(f"Extracted {len(raw_text)} characters from PDF")
else:
    # Use sample text for testing
    raw_text = """
    Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data.
    Deep learning uses neural networks with multiple layers to model complex patterns in data.
    Natural language processing enables computers to understand and generate human language.
    Computer vision allows machines to interpret and understand visual information from images.
    Artificial intelligence is transforming many industries through automation and intelligent decision making.
    """
    print("Using sample text since no PDF found")

Extracted 39487 characters from PDF


In [3]:
def clean_text(text):
    """Clean up the text"""
    # Remove extra spaces and weird characters
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()

def split_into_chunks(text, chunk_size=200):
    """Split text into smaller chunks"""
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    
    return chunks

# Clean and split our text
clean_text_content = clean_text(raw_text)
text_chunks = split_into_chunks(clean_text_content)

print(f"Created {len(text_chunks)} chunks")
for i, chunk in enumerate(text_chunks[:3]):  # Show first 3 chunks
    print(f"\nChunk {i+1}: {chunk[:100]}...")

Created 30 chunks

Chunk 1: Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and...

Chunk 2: fraction of the training costs of the best models from the literature. We show that the Transformer ...

Chunk 3: 7 neural networks in particular, have been firmly established as state of the art approaches in sequ...


In [4]:
def get_embedding(text):
    """Get embedding for one piece of text"""
    try:
        response = ollama.embeddings(model="nomic-embed-text", prompt=text)
        return response['embedding']
    except:
        print(f"Error getting embedding for: {text[:50]}...")
        return []

# Get embeddings for all chunks
print("Getting embeddings from Ollama...")
embeddings = []

for i, chunk in enumerate(text_chunks):
    print(f"Processing chunk {i+1}/{len(text_chunks)}")
    embedding = get_embedding(chunk)
    embeddings.append(embedding)

print(f"Got {len(embeddings)} embeddings")
print(f"Each embedding has {len(embeddings[0])} dimensions")

Getting embeddings from Ollama...
Processing chunk 1/30
Processing chunk 2/30
Processing chunk 3/30
Processing chunk 4/30
Processing chunk 5/30
Processing chunk 6/30
Processing chunk 7/30
Processing chunk 8/30
Processing chunk 9/30
Processing chunk 10/30
Processing chunk 11/30
Processing chunk 12/30
Processing chunk 13/30
Processing chunk 14/30
Processing chunk 15/30
Processing chunk 16/30
Processing chunk 17/30
Processing chunk 18/30
Processing chunk 19/30
Processing chunk 20/30
Processing chunk 21/30
Processing chunk 22/30
Processing chunk 23/30
Processing chunk 24/30
Processing chunk 25/30
Processing chunk 26/30
Processing chunk 27/30
Processing chunk 28/30
Processing chunk 29/30
Processing chunk 30/30
Got 30 embeddings
Each embedding has 768 dimensions


In [None]:
# Create ChromaDB client and collection
client = chromadb.PersistentClient(path="./simple_chroma_db")

# # Delete collection if it exists (fresh start)
# try:
#     client.delete_collection("simple_rag")
# except:
#     pass


# Create new collection
collection = client.create_collection(
    name="simple_rag")

print("Created ChromaDB collection")



Created ChromaDB collection


In [15]:
# Generate IDs and metadata for each chunk
chunk_ids = [f"chunk_{i}" for i in range(len(text_chunks))]
chunk_metadata = [{"chunk_number": i, "source": "document"} for i in range(len(text_chunks))]

# Add everything to ChromaDB
collection.add(
    documents=text_chunks,
    embeddings=embeddings,
    ids=chunk_ids,
    metadatas=chunk_metadata
)

print(f"Added {len(text_chunks)} chunks to ChromaDB")
print(f"Collection now has {collection.count()} documents")

Added 30 chunks to ChromaDB
Collection now has 30 documents


In [39]:
def search_documents(query, n_results=3):
    """Search for similar documents"""
    # Get embedding for the query
    query_embedding = get_embedding(query)
    
    # Search ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['documents', 'distances', 'metadatas']
    )
    print(results)    
    return results
    

# Test the search
test_query = "What is attention is all you need ?"
search_results = search_documents(test_query)

print(f"Search results for: '{test_query}'")
print("-" * 50)

for i in range(len(search_results['documents'][0])):
    doc = search_results['documents'][0][i]
    distance = search_results['distances'][0][i]
    similarity = 1 - distance  # Convert distance to similarity
    
    print(f"\nResult {i+1} (Similarity: {similarity:.3f}):")
    print(f"{doc}")

Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


{'ids': [['chunk_0', 'chunk_1']], 'distances': [[0.5056214568667006, 0.6276617448829913]], 'metadatas': [[{'chunk_number': 0, 'source': 'document'}, {'chunk_number': 1, 'source': 'document'}]], 'embeddings': None, 'documents': [['Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data without being explicitly programmed. There are three main types of machine learning supervised learning uses labeled data to train models, unsupervised learning finds patterns in unlabeled data, and reinforcement learning learns through trial and error with rewards. Deep learning uses neural networks with multiple layers to model complex patterns in data. Each layer learns increasingly abstract features. Natural language processing enables computers to understand and generate human language through techniques like tokenization, parsing, and semantic analysis. Computer vision allows machines to interpret visual information from images using convolutional 

In [None]:
def generate_answer(query, search_results):
    """Generate answer using retrieved documents"""
    # Get the documents from search results
    documents = search_results['documents'][0]
    
    # Create context from retrieved documents
    context = "\n\n".join([f"Context {i+1}: {doc}" for i, doc in enumerate(documents)])
    
    # Create the prompt
    prompt = f"""Based on the following contexts, answer the question.
    If you can't find the answer in the contexts, say so.

Contexts:
{context}

Question: {query}

Answer:"""
    
    # Get response from Ollama
    response = ollama.chat(
        model="llama3.2:1b",  # or whatever model you have
        messages=[{"role": "system", "content": 'you are a ML expert',
                    "role": "user", "content": prompt}]
    )
    
    return response['message']['content']

# Generate answer for our test query
answer = generate_answer(test_query, search_results)

print(f"Question: {test_query}")
print(f"Answer: {answer}")

Question: What is attention is all you need ?
Answer: Based on the contexts provided, it appears that "attention" is often mentioned in relation to self-attention mechanisms, which are a type of neural network layer used for various tasks such as processing sequential data (e.g., natural language text) and understanding context.

In many of these contexts, attention is described as being necessary or crucial for certain tasks. For example, in Context 1, "but its application should be just - this is what we are missing , in my opinion . EOS pad The Law will never be perfect , but its application should be just - this is what we are missing , in my opinion ." suggests that the law (EOS pad) requires attention to be applied. Similarly, in Context 2, "by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions." implies that self-attention is necessary for achieving desired results.

In Context 3, "hea

Task : add multiple documents

## Chain Of Thoughts With Rag

In [None]:
!pip install ollama chromadb PyPDF2

In [None]:
import ollama
import chromadb
import PyPDF2
import os
import re
import uuid

In [24]:
def extract_pdf_text(pdf_path):
    """Extract text from a PDF file - simple version"""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
    except:
        print(f"Could not read {pdf_path}")
    return text

# Test it (replace with your PDF path or skip if no PDF)
pdf_file = "your_document.pdf"  # Change this path
if os.path.exists(pdf_file):
    raw_text = extract_pdf_text(pdf_file)
    print(f"Extracted {len(raw_text)} characters from PDF")
else:
    # Use sample text for testing - more detailed for CoT
    raw_text = """
    Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data without being explicitly programmed.
    There are three main types of machine learning: supervised learning uses labeled data to train models, unsupervised learning finds patterns in unlabeled data, and reinforcement learning learns through trial and error with rewards.
    Deep learning uses neural networks with multiple layers to model complex patterns in data. Each layer learns increasingly abstract features.
    Natural language processing enables computers to understand and generate human language through techniques like tokenization, parsing, and semantic analysis.
    Computer vision allows machines to interpret visual information from images using convolutional neural networks and feature detection algorithms.
    Artificial intelligence is transforming industries through automation, predictive analytics, and intelligent decision making systems.
    Neural networks are inspired by biological neurons and consist of interconnected nodes that process information through weighted connections.
    Training a neural network involves adjusting weights through backpropagation to minimize prediction errors on training data.
    """
    print("Using detailed sample text since no PDF found")

Using detailed sample text since no PDF found


In [25]:
def clean_text(text):
    """Clean up the text"""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()

def split_into_chunks(text, chunk_size=150):
    """Split text into smaller chunks - smaller for CoT"""
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    
    return chunks

# Clean and split our text
clean_text_content = clean_text(raw_text)
text_chunks = split_into_chunks(clean_text_content)

print(f"Created {len(text_chunks)} chunks")
for i, chunk in enumerate(text_chunks[:3]):
    print(f"\nChunk {i+1}: {chunk[:100]}...")

Created 2 chunks

Chunk 1: Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn fr...

Chunk 2: through backpropagation to minimize prediction errors on training data....


In [26]:
def get_embedding(text):
    """Get embedding for one piece of text"""
    try:
        response = ollama.embeddings(model="nomic-embed-text", prompt=text)
        return response['embedding']
    except:
        print(f"Error getting embedding for: {text[:50]}...")
        return []

# Get embeddings for all chunks
print("Getting embeddings from Ollama...")
embeddings = []

for i, chunk in enumerate(text_chunks):
    print(f"Processing chunk {i+1}/{len(text_chunks)}")
    embedding = get_embedding(chunk)
    embeddings.append(embedding)

print(f"Got {len(embeddings)} embeddings")

Getting embeddings from Ollama...
Processing chunk 1/2
Processing chunk 2/2
Got 2 embeddings


In [None]:
# Create ChromaDB client and collection
client = chromadb.PersistentClient(path="./cot_chroma_db")

# # Delete collection if it exists (fresh start)
# try:
#     client.delete_collection("cot_rag")
# except:
#     pass

# Create new collection
collection = client.create_collection(
    name="cot_rag")

print("Created ChromaDB collection for CoT RAG")

Created ChromaDB collection for CoT RAG


In [28]:
# Generate IDs and metadata
chunk_ids = [f"chunk_{i}" for i in range(len(text_chunks))]
chunk_metadata = [{"chunk_number": i, "source": "document"} for i in range(len(text_chunks))]

# Add everything to ChromaDB
collection.add(
    documents=text_chunks,
    embeddings=embeddings,
    ids=chunk_ids,
    metadatas=chunk_metadata
)

print(f"Added {len(text_chunks)} chunks to ChromaDB")
print(f"Collection now has {collection.count()} documents")

Added 2 chunks to ChromaDB
Collection now has 2 documents


In [31]:
def search_documents(query, n_results=5):
    """Search for relevant documents"""
    query_embedding = get_embedding(query)
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['documents', 'distances', 'metadatas']
    )
    
    return results


In [34]:
def call_llm(prompt, model="llama3.2:1b"):
    """Make a single LLM call"""
    try:
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        return response['message']['content']
    except Exception as e:
        return f"Error: {e}"

# Test basic functions
test_search = search_documents("What is machine learning?")
print(f"Found {len(test_search['documents'][0])} relevant documents")

Number of requested results 5 is greater than number of elements in index 2, updating n_results = 2


Found 2 relevant documents


In [35]:
def step1_analyze_question(question):
    """Step 1: Analyze what the question is really asking"""
    
    prompt = f"""Analyze this question and break it down:

Question: {question}

Please identify:
1. What is the main topic being asked about?
2. What specific aspects or details are being requested?
3. What type of answer would be most helpful (definition, explanation, comparison, process, etc.)?
4. Are there any sub-questions or related concepts to address?

Provide a clear analysis:"""

    print("🔍 STEP 1: Analyzing the question...")
    analysis = call_llm(prompt)
    print(f"Analysis: {analysis}")
    return analysis

# Test Step 1
test_question = "How does deep learning differ from traditional machine learning?"
question_analysis = step1_analyze_question(test_question)

🔍 STEP 1: Analyzing the question...
Analysis: Analysis of the question:

1. The main topic being asked about is "deep learning" and its differences from traditional machine learning.
2. The specific aspects or details being requested are the fundamental differences between deep learning and traditional machine learning.
3. A most helpful type of answer would be one that provides a clear explanation, definition, comparison, or process-based analysis to facilitate understanding.

Breakdown of the question:

- "Deep learning" refers to a subset of machine learning techniques that use neural networks with multiple layers to learn complex patterns in data. It's an area of artificial intelligence (AI) where computers are trained by mimicking how humans think.
- Traditional machine learning, on the other hand, focuses on simpler algorithms and models designed to make predictions or classify data without explicitly learning a pattern through experience.

Helpful answer types:

1. Definition: A

In [36]:
def step2_evaluate_sources(question, question_analysis, search_results):
    """Step 2: Evaluate which sources are most relevant"""
    
    # Prepare sources text
    sources_text = ""
    for i, doc in enumerate(search_results['documents'][0]):
        sources_text += f"Source {i+1}: {doc}\n\n"
    
    prompt = f"""Based on the question analysis, evaluate these sources:

Original Question: {question}

Question Analysis: {question_analysis}

Available Sources:
{sources_text}

For each source, determine:
1. How relevant is it to answering the question?
2. What specific information does it provide?
3. Which sources should be prioritized?
4. Are there any gaps in the available information?

Provide your source evaluation:"""

    print("📊 STEP 2: Evaluating sources...")
    evaluation = call_llm(prompt)
    print(f"Source Evaluation: {evaluation}")
    return evaluation

# Test Step 2
search_results = search_documents(test_question)
source_evaluation = step2_evaluate_sources(test_question, question_analysis, search_results)

Number of requested results 5 is greater than number of elements in index 2, updating n_results = 2


📊 STEP 2: Evaluating sources...
Source Evaluation: Based on the analysis of the original question and its breakdown, here's an evaluation of each source:

1. **Source 1**:
	* Relevance: 6/10 (While it provides some general information about machine learning, it doesn't specifically address the differences between deep learning and traditional machine learning.)
	* Specificity: 4/10 (It mentions supervised, unsupervised, and reinforcement learning, but doesn't provide an in-depth analysis of these concepts or their applications.)
	* Information provided: The source provides a brief overview of machine learning as a subset of AI.
	* Priority: Should be prioritized for its general information on machine learning.
2. **Source 2**:
	* Relevance: 9/10 (This source explicitly mentions deep learning and traditional machine learning, providing a clear comparison between the two.)
	* Specificity: 8/10 (It provides detailed explanations of neural networks, backpropagation, and their applications 

In [37]:
def step3_synthesize_information(question, question_analysis, source_evaluation, search_results):
    """Step 3: Synthesize information from sources"""
    
    sources_text = ""
    for i, doc in enumerate(search_results['documents'][0]):
        sources_text += f"Source {i+1}: {doc}\n\n"
    
    prompt = f"""Now synthesize the information to build toward an answer:

Question: {question}
Question Analysis: {question_analysis}
Source Evaluation: {source_evaluation}

Sources:
{sources_text}

Based on your analysis and evaluation:
1. What are the key points from the most relevant sources?
2. How do these points relate to each other?
3. What patterns or connections can you identify?
4. What is the logical flow for presenting this information?

Provide your information synthesis:"""

    print("🔗 STEP 3: Synthesizing information...")
    synthesis = call_llm(prompt)
    print(f"Information Synthesis: {synthesis}")
    return synthesis

# Test Step 3
info_synthesis = step3_synthesize_information(test_question, question_analysis, source_evaluation, search_results)

🔗 STEP 3: Synthesizing information...
Information Synthesis: The main difference between deep learning and traditional machine learning lies in their approaches to modeling complex patterns in data.

Traditional machine learning relies on simpler algorithms and models that focus on making predictions or classifications without explicitly learning a pattern through experience. This approach is often limited by the availability of large amounts of training data, which can be difficult to obtain for small datasets. In contrast, deep learning uses neural networks with multiple layers to model complex patterns in data, allowing it to handle high-dimensional data and make accurate predictions.

The key points that emerge from this analysis are:

* Traditional machine learning models rely on simpler algorithms and models, whereas deep learning models use neural networks with many interconnected nodes (neurons) to model complex patterns.
* Deep learning can handle high-dimensional data, wherea

In [38]:
def step4_construct_answer(question, question_analysis, source_evaluation, synthesis):
    """Step 4: Construct the final answer"""
    
    prompt = f"""Now construct the final comprehensive answer:

Original Question: {question}

Your Previous Analysis:
- Question Analysis: {question_analysis}
- Source Evaluation: {source_evaluation}  
- Information Synthesis: {synthesis}

Based on all your previous thinking, provide a complete, well-structured answer that:
1. Directly addresses the original question
2. Uses the most relevant information you identified
3. Follows the logical flow you developed
4. Is clear and comprehensive

Final Answer:"""

    print("✅ STEP 4: Constructing final answer...")
    final_answer = call_llm(prompt)
    print(f"Final Answer: {final_answer}")
    return final_answer

# Test Step 4
final_answer = step4_construct_answer(test_question, question_analysis, source_evaluation, info_synthesis)

✅ STEP 4: Constructing final answer...
Final Answer: ## Step 1: Identify the key differences between deep learning and traditional machine learning.
The key differences between deep learning and traditional machine learning are their approach to handling complex data, the complexity of an approach, and the modeling capabilities.

## Step 2: Determine the relevance of each point in understanding the difference between deep learning and traditional machine learning.
Traditional machine learning relies on simpler algorithms and models that focus on making predictions or classifications without explicitly learning a pattern through experience. In contrast, deep learning uses neural networks with multiple layers to model complex patterns in data.

## Step 3: Explain how these key points relate to each other.
The simplicity or complexity of an approach is reflected in the number of layers and interconnected nodes used in the model. The ability of a machine learning approach to handle complex

## Keyword search

What is BM25? 🤔

BM25 = "Best Matching 25" - it's a keyword scoring algorithm that figures out how relevant a document is to your search query.

The Simple Goal 🎯
BM25's job: Given a search query like "machine learning", score every document from 0 to infinity based on how well it matches. Higher score = better match.

How BM25 Thinks 🧠
BM25 asks 3 simple questions about each document:

1. Term Frequency (TF) - "How often does this word appear?"
2. Document Frequency (DF) - "How rare is this word?"
3. Document Length - "Is this document too long or too short?"

BM25 Score = TF × IDF × Length Penalty




BM25

In [None]:
!pip install ollama langchain langchain-community PyPDF2 rank-bm25

In [41]:
import ollama
import PyPDF2
import os
import re
from langchain.retrievers import BM25Retriever
from langchain.schema import Document
from rank_bm25 import BM25Okapi

In [42]:
def extract_pdf_text(pdf_path):
    """Extract text from a PDF file"""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
    except:
        print(f"Could not read {pdf_path}")
    return text

# Test it
pdf_file = "your_document.pdf"  # Change this path
if os.path.exists(pdf_file):
    raw_text = extract_pdf_text(pdf_file)
    print(f"Extracted {len(raw_text)} characters from PDF")
else:
    # Sample text with good keywords for BM25 testing
    raw_text = """
    Machine learning algorithms learn patterns from data to make predictions without explicit programming.
    Supervised learning uses labeled training data with input-output pairs to train predictive models.
    Unsupervised learning discovers hidden patterns and structures in unlabeled data without target variables.
    Reinforcement learning agents learn optimal actions through trial and error using reward signals.
    Deep learning neural networks have multiple hidden layers for learning complex hierarchical representations.
    Convolutional neural networks excel at image recognition tasks using convolution and pooling operations.
    Recurrent neural networks process sequential data like text and time series using memory cells.
    Natural language processing combines linguistics and machine learning for text understanding and generation.
    Computer vision algorithms analyze digital images and videos to extract meaningful information.
    Classification algorithms predict discrete categories while regression algorithms predict continuous values.
    Clustering algorithms group similar data points together without labeled examples.
    Decision trees create interpretable models using if-then rules for classification and regression.
    Random forests combine multiple decision trees to improve accuracy and reduce overfitting.
    Support vector machines find optimal decision boundaries for classification problems.
    """
    print("Using sample text with good keywords for BM25")

Using sample text with good keywords for BM25


In [43]:
def clean_text(text):
    """Clean up the text"""
    # Keep more words for BM25 - it needs keywords
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()

def split_into_chunks(text, chunk_size=100):
    """Split text into chunks - smaller for better keyword matching"""
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    
    return chunks

# Clean and split our text
clean_text_content = clean_text(raw_text)
text_chunks = split_into_chunks(clean_text_content)

print(f"Created {len(text_chunks)} chunks for BM25")
for i, chunk in enumerate(text_chunks[:3]):
    print(f"\nChunk {i+1}: {chunk}")

Created 2 chunks for BM25

Chunk 1: Machine learning algorithms learn patterns from data to make predictions without explicit programming. Supervised learning uses labeled training data with input-output pairs to train predictive models. Unsupervised learning discovers hidden patterns and structures in unlabeled data without target variables. Reinforcement learning agents learn optimal actions through trial and error using reward signals. Deep learning neural networks have multiple hidden layers for learning complex hierarchical representations. Convolutional neural networks excel at image recognition tasks using convolution and pooling operations. Recurrent neural networks process sequential data like text and time series using memory cells. Natural language processing combines linguistics and machine learning

Chunk 2: for text understanding and generation. Computer vision algorithms analyze digital images and videos to extract meaningful information. Classification algorithms predic

In [44]:
def create_langchain_documents(text_chunks):
    """Convert text chunks to LangChain Document objects"""
    documents = []
    
    for i, chunk in enumerate(text_chunks):
        doc = Document(
            page_content=chunk,
            metadata={
                "chunk_id": i,
                "source": "document",
                "chunk_number": i
            }
        )
        documents.append(doc)
    
    return documents

# Create LangChain documents
langchain_docs = create_langchain_documents(text_chunks)

print(f"Created {len(langchain_docs)} LangChain documents")
print(f"Sample document: {langchain_docs[0].page_content[:100]}...")
print(f"Sample metadata: {langchain_docs[0].metadata}")

Created 2 LangChain documents
Sample document: Machine learning algorithms learn patterns from data to make predictions without explicit programmin...
Sample metadata: {'chunk_id': 0, 'source': 'document', 'chunk_number': 0}


In [45]:
# Create BM25 retriever using LangChain
bm25_retriever = BM25Retriever.from_documents(langchain_docs)

# Set how many documents to retrieve
bm25_retriever.k = 4  # Return top 4 most relevant documents

print("BM25 Retriever created!")
print(f"Configured to return top {bm25_retriever.k} documents")
print(f"Total documents indexed: {len(langchain_docs)}")

BM25 Retriever created!
Configured to return top 4 documents
Total documents indexed: 2


In [46]:
def test_bm25_search(query):
    """Test BM25 search with a query"""
    print(f"🔍 BM25 Search for: '{query}'")
    print("-" * 50)
    
    # Search using BM25
    results = bm25_retriever.get_relevant_documents(query)
    
    print(f"Found {len(results)} relevant documents:")
    
    for i, doc in enumerate(results):
        print(f"\nResult {i+1}:")
        print(f"Content: {doc.page_content}")
        print(f"Metadata: {doc.metadata}")
        print("-" * 30)
    
    return results

# Test BM25 search
test_query = "machine learning algorithms"
search_results = test_bm25_search(test_query)

🔍 BM25 Search for: 'machine learning algorithms'
--------------------------------------------------
Found 2 relevant documents:

Result 1:
Content: Machine learning algorithms learn patterns from data to make predictions without explicit programming. Supervised learning uses labeled training data with input-output pairs to train predictive models. Unsupervised learning discovers hidden patterns and structures in unlabeled data without target variables. Reinforcement learning agents learn optimal actions through trial and error using reward signals. Deep learning neural networks have multiple hidden layers for learning complex hierarchical representations. Convolutional neural networks excel at image recognition tasks using convolution and pooling operations. Recurrent neural networks process sequential data like text and time series using memory cells. Natural language processing combines linguistics and machine learning
Metadata: {'chunk_id': 0, 'source': 'document', 'chunk_number': 0

  warn_deprecated(


In [48]:
def call_ollama(prompt):
    """Simple Ollama call"""
    try:
        response = ollama.chat(
            model="llama3.2:1b",
            messages=[{"role": "user", "content": prompt}]
        )
        return response['message']['content']
    except Exception as e:
        return f"Error: {e}"

def generate_answer_from_bm25(query, bm25_results):
    """Generate answer using BM25 retrieved documents"""
    
    # Combine retrieved documents
    context = ""
    for i, doc in enumerate(bm25_results):
        context += f"Source {i+1}: {doc.page_content}\n\n"
    
    # Create prompt
    prompt = f"""Based on the following sources found through keyword search, answer the question.

Sources:
{context}

Question: {query}

Answer:"""
    
    # Get answer from Ollama
    answer = call_ollama(prompt)
    return answer

# Test answer generation
answer = generate_answer_from_bm25(test_query, search_results)
print(f"\nQuestion: {test_query}")
print(f"BM25 Answer: {answer}")


Question: machine learning algorithms
BM25 Answer: Based on the provided sources, machine learning algorithms are categorized into three main types:

1. **Supervised Learning**: Uses labeled training data with input-output pairs to train predictive models (Source 1).
2. **Unsupervised Learning**: Discovers hidden patterns and structures in unlabeled data without target variables (Source 1).
3. **Reinforcement Learning**: Learns optimal actions through trial and error using reward signals, but does not explicitly program the algorithm (Source 1).

Additionally, other types of machine learning algorithms mentioned include:

* **Deep Learning**: Uses multiple hidden layers for learning complex hierarchical representations (Source 2).
* **Convolutional Neural Networks (CNNs)**: Excel at image recognition tasks using convolution and pooling operations (Source 2).
* **Recurrent Neural Networks (RNNs)**: Process sequential data like text and time series using memory cells (Source 2).
* **Nat

In [49]:
def bm25_rag(question):
    """Complete BM25 RAG pipeline"""
    print(f"📝 BM25 RAG Pipeline")
    print(f"Question: {question}")
    print("=" * 60)
    
    # Step 1: BM25 keyword search
    print("🔍 Step 1: BM25 keyword search...")
    results = bm25_retriever.get_relevant_documents(question)
    
    # Step 2: Show retrieved documents
    print("📄 Step 2: Retrieved documents:")
    for i, doc in enumerate(results):
        print(f"  {i+1}. {doc.page_content[:80]}...")
    
    # Step 3: Generate answer
    print("🤖 Step 3: Generating answer...")
    answer = generate_answer_from_bm25(question, results)
    
    print(f"\n✅ BM25 Answer: {answer}")
    return answer

# Test complete BM25 RAG
bm25_rag("What is supervised learning?")

📝 BM25 RAG Pipeline
Question: What is supervised learning?
🔍 Step 1: BM25 keyword search...
📄 Step 2: Retrieved documents:
  1. for text understanding and generation. Computer vision algorithms analyze digita...
  2. Machine learning algorithms learn patterns from data to make predictions without...
🤖 Step 3: Generating answer...

✅ BM25 Answer: Supervised learning is a type of machine learning algorithm that uses labeled training data with input-output pairs to train predictive models. In other words, it involves teaching a model what data to expect as output based on the given inputs. The model learns from the patterns and relationships in the data, allowing it to make accurate predictions or classifications without needing explicit programming.


'Supervised learning is a type of machine learning algorithm that uses labeled training data with input-output pairs to train predictive models. In other words, it involves teaching a model what data to expect as output based on the given inputs. The model learns from the patterns and relationships in the data, allowing it to make accurate predictions or classifications without needing explicit programming.'

### Task : RRF - Contextual Rag

#### read and implement 

# Multilingual embeddings


https://huggingface.co/intfloat/multilingual-e5-large