In [1]:

import chromadb
from sentence_transformers import SentenceTransformer
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from collections import Counter
import re
from sentence_transformers import CrossEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

chroma_client = chromadb.PersistentClient(path="vector_july22")
collection = chroma_client.get_or_create_collection(
    name="rag-chunks",
    metadata={"hnsw:space": "cosine"}
)

In [3]:
# Add this import at the top of the cell
from sentence_transformers import CrossEncoder
import numpy as np

# Initialize reranker globally
reranker = None

cs_seed_phrases = [
    "computer science",
    "software engineering",
    "algorithms",
    "data structures",
    "operating systems",
    "computer networks",
    "machine learning",
    "artificial intelligence",
    "database systems",
    "distributed systems",
    "computer architecture",
    "theory of computation",
    "programming languages",
    "cybersecurity",
    "information technology"
]
cs_seed_embeddings = embedding_model.encode(cs_seed_phrases, normalize_embeddings=True)

def is_computer_science_question(question: str, threshold: float = 0.55) -> bool:
    """Return True when the question is related to the Computer Science domain."""
    question_emb = embedding_model.encode([question], normalize_embeddings=True)[0]
    similarities = cs_seed_embeddings @ question_emb
    max_similarity = float(np.max(similarities))
    print(f" Domain similarity score: {max_similarity:.3f}")
    return max_similarity >= threshold

def initialize_reranker():
    global reranker
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return reranker

def search_similar_chunks(query, n_results=5, filter_dict=None, min_relevance=0.5):
    print(f" Searching for: '{query}'")
    
    try:
        # Generate embedding for query with normalization
        query_embedding = embedding_model.encode([query], normalize_embeddings=True)
        
        # Get more candidates for reranking (3x the requested amount)
        search_params = {
            "query_embeddings": query_embedding.tolist(),
            "n_results": min(n_results * 4, 40)  # Get up to 40 candidates
        }
        
        if filter_dict:
            search_params["where"] = filter_dict
            print(f"   With filters: {filter_dict}")
        
        # Search in collection
        results = collection.query(**search_params)
        
        if not results['documents'][0]:
            print("No results found")
            return []
        
        # Prepare candidates for reranking
        candidates = []
        for i, doc in enumerate(results['documents'][0]):
            candidates.append({
                'document': doc,
                'metadata': results['metadatas'][0][i],
                'initial_score': 1 - results['distances'][0][i],
                'id': results['ids'][0][i]
            })
        
        # Initialize reranker if not already done
        global reranker
        if reranker is None:
            initialize_reranker()
        
        # Rerank using cross-encoder
        print(f"Reranking {len(candidates)} candidates...")
        pairs = [[query, c['document'][:512]] for c in candidates]  # Limit text for reranker
        rerank_scores = reranker.predict(pairs)
        
        # Combine scores and filter
        final_results = []
        for i, candidate in enumerate(candidates):
            # Combine initial embedding score with rerank score
            combined_score = (candidate['initial_score'] * 0.3) + (rerank_scores[i] * 0.7)
            
            if combined_score >= min_relevance:
                candidate['rerank_score'] = rerank_scores[i]
                candidate['final_score'] = combined_score
                final_results.append(candidate)
        
        # Sort by final score
        final_results.sort(key=lambda x: x['final_score'], reverse=True)
        top_results = final_results[:n_results]
        
        if not top_results:
            print(f"No results above relevance threshold {min_relevance}")
            return []
        
        print(f" Found {len(candidates)} candidates, {len(final_results)} above threshold, returning top {len(top_results)}:")
        
        for i, result in enumerate(top_results):
            metadata = result['metadata']
            print(f"\n{i+1}. {metadata['source']} (ID: {result['id'][-8:]})") 
            if metadata.get('chapter'):
                print(f" Chapter {metadata['chapter']}")
            if metadata.get('section'):
                print(f"  Section {metadata['section']}")
            print(f"   Initial: {result['initial_score']:.3f}, Rerank: {result['rerank_score']:.3f}, Final: {result['final_score']:.3f}")
            print(f"   Chunk size: {len(result['document'])} chars")
            print(f"   Preview: {result['document'][:200]}...")
        
        return top_results
    
    except Exception as e:
        print(f" Search failed: {e}")
        import traceback
        traceback.print_exc()
        return []

# Update build_context_for_chatbot to handle no results better
def build_context_for_chatbot(query, n_chunks=3, min_relevance=0.5):
    """Build context for chatbot from search results"""
    results = search_similar_chunks(query, n_chunks, min_relevance=min_relevance)
    
    if not results:
        return "I couldn't find sufficiently relevant information in the documents to answer your question. Please try rephrasing your query or ask about topics covered in the uploaded documents."
    
    # Only use results with high confidence
    high_confidence_results = [r for r in results if r['final_score'] >= 0.7]
    
    if high_confidence_results:
        context = "Based on the following highly relevant information from the documents:\n\n"
    else:
        context = "Based on the following potentially relevant information from the documents (moderate confidence):\n\n"
    
    for i, result in enumerate(results):
        metadata = result['metadata']
        context += f"[Source {i+1}: {metadata['source']}"
        
        if metadata.get('chapter'):
            context += f", Chapter {metadata['chapter']}"
        if metadata.get('section'):
            context += f", Section {metadata['section']}"
        
        context += f" - Relevance: {result['final_score']:.2f}]\n"
        context += f"{result['document']}\n\n"
        context += "-" * 50 + "\n\n"
    
    return context

def get_database_stats():
    """Get comprehensive database statistics"""
    try:
        count = collection.count()
        print(f"üìä Database Statistics:")
        print(f"  üíæ Total chunks stored: {count}")
        
        if count > 0:
            # Get sample to analyze
            sample = collection.get(limit=min(count, 1000))
            
            # Count chunks per document
            source_counts = {}
            chapter_counts = {}
            
            for meta in sample['metadatas']:
                source = meta['source']
                source_counts[source] = source_counts.get(source, 0) + 1
                
                if meta.get('chapter'):
                    chapter_key = f"{source} - Chapter {meta['chapter']}"
                    chapter_counts[chapter_key] = chapter_counts.get(chapter_key, 0) + 1
            
            print(f"  Documents stored: {len(source_counts)}")
            print(f"   Chapters identified: {len(chapter_counts)}")
            print(f"\n  Chunks per document:")
            for source, chunk_count in source_counts.items():
                print(f"     ‚Ä¢ {source}: {chunk_count} chunks")
        
        return count
    
    except Exception as e:
        print(f"Error getting database stats: {e}")
        return 0



def show_sample_chunks(n_samples=3):
    """Show sample chunks from the database"""
    try:
        sample = collection.peek(limit=n_samples)
        print(f"Sample chunks (showing {n_samples}):")
        
        for i, doc in enumerate(sample['documents']):
            metadata = sample['metadatas'][i]
            print(f"\n{i+1}. {metadata['source']} (Chunk {metadata['chunk_id']})")
            if metadata.get('chapter'):
                print(f"   Chapter {metadata['chapter']}")
            if metadata.get('section'):
                print(f"   Section {metadata['section']}")
            print(f"   Content: {doc[:200]}...")
    
    except Exception as e:
        print(f" Error showing samples: {e}")

def answer_question(question, n_chunks=3, verbose=False):
    """
    Simple Q&A function for chatbot integration
    """
    if verbose:
        print(f"ü§î Question: {question}")
    
    # Get context
    context = build_context_for_chatbot(question, n_chunks)
    
    # Format for LLM
    prompt = f"""Based on the following context from technical documents, please answer the question.
    
Context:
{context}

Question: {question}

Answer: """
    
    if verbose:
        print(f"\nüìù Context has {len(context.split())} words")
    
    return prompt


# Keep the other utility functions as they are

In [4]:
# Cell 19: Ollama LLM Integration
import ollama

'''def answer_with_ollama(question, n_chunks=3, model="llama3.1:8b-instruct-q4_0", temperature=0.3):
    """
    Answer questions using local Llama 3.1 with RAG context
    """
    print(f"ü§î Processing question: {question}")
    
    # Get context from RAG
    context = build_context_for_chatbot(question, n_chunks=n_chunks, min_relevance=0.3)
    
    # Check if we found relevant context
    if "couldn't find sufficiently relevant" in context:
        return "I don't have relevant information in the documents to answer this question. Please try rephrasing or ask about topics covered in the uploaded documents."
    
    # Simple prompt
    prompt = f"""Based on the following context from research documents, please answer the question accurately.
If the context doesn't contain enough information, say so.

Context:
{context}

Question: {question}

Answer:"""
    
    try:
        print(f"üöÄ Running {model}...")
        response = ollama.chat(
            model=model,
            messages=[
                {'role': 'user', 'content': prompt}
            ],
            options={
                'temperature': temperature,
                'num_predict': 500
            }
        )
        
        return response['message']['content']
        
    except Exception as e:
        print(f"‚ùå Error: {e}")
        print("Make sure Ollama is running: 'ollama serve'")
        return f"Error: {str(e)}"'''

def chat_with_rag():
    """
    Simple interactive chat
    """
    print("RAG Chatbot with Llama 3.1")
    print("Type 'quit' to exit\n")
    
    # Check Ollama
    try:
        ollama.list()
        print("Ollama is running!\n")
    except:
        print(" Start Ollama first: run 'ollama serve' in terminal")
        return
    
    while True:
        question = input("\n Your question: ").strip()
        
        if question.lower() in ['quit', 'exit', 'q']:
            print(" thank you!")
            break
        
        if not question:
            continue
        
        answer = answer_with_ollama(question)
        print("\nAnswer:")
        print(answer)
        print("\n" + "="*50)

# Quick test
def answer_with_ollama(question, n_chunks=3, model="llama3.1:8b-instruct-q4_0", temperature=0.3):
    """
    Answer questions using local Llama 3.1 with RAG context.
    Refuses non-CS questions, falls back to pretrained CS knowledge when no supporting chunks are found.
    """
    print(f" Processing question: {question}")
    
    # if not is_computer_science_question(question):
    #     print(" Question rejected: outside Computer Science domain.")
    #     return (
    #         "I only answer questions that relate to the Computer Science documents. "
    #         "Please ask about Computer Science topics."
    #     )
    
    # Get context from RAG
    context = build_context_for_chatbot(question, n_chunks=n_chunks, min_relevance=0.3)
    
    # Decide how to craft the downstream prompt
    if (
        context is None
        or context == "NO_RELEVANT_DOCS_FOUND"
        or context.strip().startswith("I couldn't find sufficiently relevant information")
    ):
        print(" No relevant chunks found. Falling back to pretrained Computer Science knowledge.")
        fallback_context = (
            "No relevant document context was retrieved. Provide an answer using your general "
            "Computer Science knowledge."
        )
        prompt = f"""Answer strictly from the Computer Science context below.
If the question falls outside CS, reply that you cannot help.

Context:
{fallback_context}

Question: {question}

Answer:"""
    else:
        import re
        pdf_sources = re.findall(r'\[Source \d+: ([^,\]]+)', context)
        unique_pdfs = list(set(pdf_sources))
        
        prompt = f"""Based on the following context from research documents, please answer the question accurately.
IMPORTANT: Always mention which PDF document(s) you are getting the information from (the PDFs are: {', '.join(unique_pdfs)}).

Context:
{context}

Question: {question}

Answer (make sure to mention which PDF the information comes from):"""
    
    try:
        print(f" Running {model}...")
        response = ollama.chat(
            model=model,
            messages=[
                {'role': 'user', 'content': prompt}
            ],
            options={
                'temperature': temperature,
                'num_predict': 500
            }
        )
        
        return response['message']['content']
        
    except Exception as e:
        print(f" Error: {e}")
        print("Make sure Ollama is running: 'ollama serve'")
        return f"Error: {str(e)}"
def test_qa():
    """Test with a simple question"""
    question = "What methodologies are discussed for plant disease detection?"
    print(f" Test Question: {question}\n")
    
    answer = answer_with_ollama(question)
    print(f"Answer: {answer}")
print("Usage: test_qa() or chat_with_rag()")

Usage: test_qa() or chat_with_rag()


In [5]:
print("Retrieved count:", collection.count())

Retrieved count: 3531


In [6]:
print(f"üì¶ Total documents in collection: {collection.count()}")


üì¶ Total documents in collection: 3531


In [7]:
question = "What is the methodology for plant disease detection?"
answer = answer_with_ollama(question)
print("\n Answer:")
print(answer)

 Processing question: What is the methodology for plant disease detection?
 Searching for: 'What is the methodology for plant disease detection?'
Reranking 12 candidates...
 Found 12 candidates, 10 above threshold, returning top 3:

1. 8_Report - NIKHIL SHAJI.pdf (ID: _chunk_2)
 Chapter 2
  Section 3.1
   Initial: 0.786, Rerank: 7.044, Final: 5.166
   Chunk size: 2603 chars
   Preview: 12 of 46
CHAPTER 2
Problem Definition
The existing manual methods for plant disease detection in agriculture are inefficient and prone to
errors, leading to significant loss of crops and are a huge co...

2. 8_Report - NIKHIL SHAJI.pdf (ID: chunk_14)
 Chapter 6
  Section 6.1
   Initial: 0.766, Rerank: 6.551, Final: 4.815
   Chunk size: 1723 chars
   Preview: 28 of 46
CHAPTER 6
System Design
6.1 Current System
The systems that exist for plant disease detection mainly follow a two-step process which is to identify
if the plant is healthy or unhealthy using ...

3. 8_Report - NIKHIL SHAJI.pdf (ID: chunk_10)

In [8]:
import inspect

print("Collection name:", collection.name)
print("Total documents:", collection.count())

# Inspect client object to confirm it's a PersistentClient
print("Client type:", type(chroma_client))

# Optional: Look into where it's storing (if PersistentClient)
print("DB location (guess):", inspect.getsourcefile(type(chroma_client)))


Collection name: rag-chunks
Total documents: 3531
Client type: <class 'chromadb.api.client.Client'>
DB location (guess): C:\Users\jsdha\AppData\Roaming\Python\Python313\site-packages\chromadb\api\client.py


In [9]:

question = "What is the approach to optimize task offloading and scheduling in fog computing"
answer = answer_with_ollama(question)
print("\n Answer:")
print(answer)

 Processing question: What is the approach to optimize task offloading and scheduling in fog computing
 Searching for: 'What is the approach to optimize task offloading and scheduling in fog computing'
Reranking 12 candidates...
 Found 12 candidates, 11 above threshold, returning top 3:

1. 46_Report - Meghana M.pdf (ID: _chunk_0)
  Section 5.1
   Initial: 0.884, Rerank: 6.973, Final: 5.146
   Chunk size: 1680 chars
   Preview: TABLE OF CONTENTS
Chapter No.
Title
Page
No.
1.
INTRODUCTION
01
2.
PROBLEM DEFINITION
02
3.
LITERATURE SURVEY
03
3.1.
Computation Offloading and Task Scheduling Based on Improved
Integer Particle Swar...

2. 46_Report - Meghana M.pdf (ID: chunk_19)
 Chapter 5
  Section 5.1
   Initial: 0.865, Rerank: 6.914, Final: 5.099
   Chunk size: 1529 chars
   Preview: Jan-May-2021
Page No.21
CHAPTER 5
SYSTEM REQUIREMENTS SPECIFICATION
5.1 Introduction:
Our project's main objective is to create methods that improve the effectiveness of task offloading
and scheduling...

3. 4

In [10]:
answer = answer_with_ollama("What is discussed about plant disease detection?")
print(answer)

 Processing question: What is discussed about plant disease detection?
 Searching for: 'What is discussed about plant disease detection?'
Reranking 12 candidates...
 Found 12 candidates, 8 above threshold, returning top 3:

1. 8_Report - NIKHIL SHAJI.pdf (ID: chunk_14)
 Chapter 6
  Section 6.1
   Initial: 0.746, Rerank: 5.545, Final: 4.105
   Chunk size: 1723 chars
   Preview: 28 of 46
CHAPTER 6
System Design
6.1 Current System
The systems that exist for plant disease detection mainly follow a two-step process which is to identify
if the plant is healthy or unhealthy using ...

2. 8_Report - NIKHIL SHAJI.pdf (ID: _chunk_2)
 Chapter 2
  Section 3.1
   Initial: 0.746, Rerank: 5.426, Final: 4.022
   Chunk size: 2603 chars
   Preview: 12 of 46
CHAPTER 2
Problem Definition
The existing manual methods for plant disease detection in agriculture are inefficient and prone to
errors, leading to significant loss of crops and are a huge co...

3. 8_Report - NIKHIL SHAJI.pdf (ID: chunk_10)
 Chapter

In [11]:
question = "Can you suggest some good research ideas based on Gans and data generation"
answer = answer_with_ollama(question)
print("\n Answer:")
print(answer)

 Processing question: Can you suggest some good research ideas based on Gans and data generation
 Searching for: 'Can you suggest some good research ideas based on Gans and data generation'
Reranking 12 candidates...
 Found 12 candidates, 1 above threshold, returning top 1:

1. 111_REPORT - Srujan Vr.pdf (ID: _chunk_4)
   Initial: 0.820, Rerank: 1.306, Final: 1.160
   Chunk size: 2706 chars
   Preview: Dept. of CSE
Jan - May, 2024
Page No.
Potential Future Directions: However, the paper concentrates on a summary of the researches of
GANs existing and identifies the areas can be researched further to...
 Running llama3.1:8b-instruct-q4_0...

 Answer:
Based on the provided context from the PDF document "111_REPORT - Srujan Vr.pdf" with a relevance score of 1.16, here are some potential research ideas related to GANs and data generation:

1. **Exploring the Effects of Various Generative Adversarial Networks Techniques on Image Generation**: This idea is already explored in the paper menti

In [12]:
# Test 1: Question about something NOT in your PDFs (should use pretrained knowledge)
print("=" * 80)
print("TEST 1: Question outside PDF scope - Should use pretrained knowledge")
print("=" * 80)
question1 = "What is the capital of France?"
answer1 = answer_with_ollama(question1)
print("\nüìå Answer:")
print(answer1)
print("\n" + "=" * 80)


TEST 1: Question outside PDF scope - Should use pretrained knowledge
 Processing question: What is the capital of France?
 Searching for: 'What is the capital of France?'
Reranking 12 candidates...
No results above relevance threshold 0.3
 No relevant chunks found. Falling back to pretrained Computer Science knowledge.
 Running llama3.1:8b-instruct-q4_0...

üìå Answer:
I cannot provide information on a political topic such as the capital of France. Is there anything else I can help you with?



In [13]:
# Test 2: Question about something IN your PDFs (should use RAG context)
print("\n" + "=" * 80)
print("TEST 2: Question within PDF scope - Should use RAG with PDF sources")
print("=" * 80)
question2 = "What is the methodology for plant disease detection?"
answer2 = answer_with_ollama(question2)
print("\nüìå Answer:")
print(answer2)
print("\n" + "=" * 80)



TEST 2: Question within PDF scope - Should use RAG with PDF sources
 Processing question: What is the methodology for plant disease detection?
 Searching for: 'What is the methodology for plant disease detection?'
Reranking 12 candidates...
 Found 12 candidates, 10 above threshold, returning top 3:

1. 8_Report - NIKHIL SHAJI.pdf (ID: _chunk_2)
 Chapter 2
  Section 3.1
   Initial: 0.786, Rerank: 7.044, Final: 5.166
   Chunk size: 2603 chars
   Preview: 12 of 46
CHAPTER 2
Problem Definition
The existing manual methods for plant disease detection in agriculture are inefficient and prone to
errors, leading to significant loss of crops and are a huge co...

2. 8_Report - NIKHIL SHAJI.pdf (ID: chunk_14)
 Chapter 6
  Section 6.1
   Initial: 0.766, Rerank: 6.551, Final: 4.815
   Chunk size: 1723 chars
   Preview: 28 of 46
CHAPTER 6
System Design
6.1 Current System
The systems that exist for plant disease detection mainly follow a two-step process which is to identify
if the plant is healthy 

In [14]:
# ============================================================================
# INSTRUCTIONS TO TEST THE NEW FALLBACK FUNCTIONALITY
# ============================================================================
# 
# 1. First, run cells 1-3 to initialize everything (imports, models, functions)
# 2. Then run the test cells above (cells 11 & 12) to see:
#    - Test 1: Model using pretrained knowledge (no PDFs mentioned)
#    - Test 2: Model using RAG context (PDF sources mentioned)
#
# Key behaviors:
# ‚úÖ When chunks ARE found: Model answers from PDFs and mentions sources
# ‚úÖ When chunks are NOT found: Model answers from general knowledge with disclaimer
# ============================================================================

print("üìö Testing suite ready!")
print("Run cells 11 and 12 above to see the fallback functionality in action.")
print("\nMake sure Ollama is running: 'ollama serve' in terminal")


üìö Testing suite ready!
Run cells 11 and 12 above to see the fallback functionality in action.

Make sure Ollama is running: 'ollama serve' in terminal
