In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import NLTKTextSplitter
import os

# Path to folder with PDF files
docs_dir = "/home/ubuntu/Documents/IITB_RAG/iitb_docs"

# Load all PDF files using PyPDFLoader
documents = []
for filename in os.listdir(docs_dir):
    if filename.endswith(".pdf"):
        filepath = os.path.join(docs_dir, filename)
        loader = PyPDFLoader(filepath)
        documents.extend(loader.load())  # Extend, not append!

# Now use NLTKTextSplitter for sentence-based splitting
text_splitter = NLTKTextSplitter(chunk_size=3, chunk_overlap=1)
docs = text_splitter.split_documents(documents)

print(f"Loaded and split {len(docs)} chunks.")

Created a chunk of size 75, which is longer than the specified 3
Created a chunk of size 40, which is longer than the specified 3
Created a chunk of size 90, which is longer than the specified 3
Created a chunk of size 28, which is longer than the specified 3
Created a chunk of size 319, which is longer than the specified 3
Created a chunk of size 41, which is longer than the specified 3
Created a chunk of size 77, which is longer than the specified 3
Created a chunk of size 244, which is longer than the specified 3
Created a chunk of size 12, which is longer than the specified 3
Created a chunk of size 14, which is longer than the specified 3
Created a chunk of size 15, which is longer than the specified 3
Created a chunk of size 258, which is longer than the specified 3
Created a chunk of size 165, which is longer than the specified 3
Created a chunk of size 131, which is longer than the specified 3
Created a chunk of size 146, which is longer than the specified 3
Created a chunk of 

Loaded and split 875 chunks.


In [2]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings  # updated import

# Initialize the embedding model (MiniLM is good for speed + quality)
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Create FAISS vector store from your docs
vectorstore = FAISS.from_documents(docs, embedding_model)

# Save the vector store locally
vectorstore.save_local("faiss_index")


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
!wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf -P models/


--2025-05-08 11:27:25--  https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2600:9000:2634:f600:17:b174:6d00:93a1, 2600:9000:2634:bc00:17:b174:6d00:93a1, 2600:9000:2634:3400:17:b174:6d00:93a1, ...
Connecting to huggingface.co (huggingface.co)|2600:9000:2634:f600:17:b174:6d00:93a1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/repos/b0/ca/b0cae82fd4b3a362cab01d17953c45edac67d1c2dfb9fbb9e69c80c32dc2012e/08a5566d61d7cb6b420c3e4387a39e0078e1f2fe5f055f3a03887385304d4bfa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27llama-2-7b-chat.Q4_K_M.gguf%3B+filename%3D%22llama-2-7b-chat.Q4_K_M.gguf%22%3B&Expires=1746686660&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NjY4NjY2MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9iMC9jYS9iMGNhZTgyZmQ0YjNhMzYyY2FiMDFkMTc5NTNjNDVlZGFjNjdkMWMyZGZiOWZiYjllNjljODBjMzJkYzIwMTJlLzA4YTU1NjZkNjFkN2NiNmI0MjBjM2U0Mzg3YTM5ZTAwNzhlMWYyZmU1ZjA1NWYzYTAzODg3Mzg1MzA0ZDRiZmE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Si

In [3]:
from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",
    n_ctx=4096,
    max_tokens=512,
    n_threads=6,
    temperature=0.7
)


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32         

In [4]:
from langchain.tools import DuckDuckGoSearchRun

# Initialize retriever from FAISS vector store
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Initialize web search tool
web_search = DuckDuckGoSearchRun()


In [5]:
from sentence_transformers import CrossEncoder

# Initialize reranker
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_documents(query: str, documents: list) -> list:
    # Prepare pairs for reranking
    pairs = [[query, doc.page_content] for doc in documents]
    scores = reranker.predict(pairs)
    # Sort documents by score
    ranked_docs = [doc for _, doc in sorted(zip(scores, documents), key=lambda x: x[0], reverse=True)]
    return ranked_docs[:3]  # Return top 3 documents


In [6]:
from langchain.schema import Document

class FakeRetriever:
    def __init__(self, docs: list[Document]):
        self.docs = docs

    def get_relevant_documents(self, query: str):
        return self.docs

    def invoke(self, input, config=None):
        # input is expected to be the query string
        return self.get_relevant_documents(input)


In [7]:
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory

# Initialize RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Memory to store past conversation
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# Function to handle query generation and response
def generate_answer(query: str) -> str:
    docs = retriever.get_relevant_documents(query)
    top_docs = rerank_documents(query, docs)
    
    # Inject reranked docs into QA chain
    original_retriever = qa_chain.retriever
    qa_chain.retriever = FakeRetriever(top_docs)
    answer = qa_chain.run(query)
    qa_chain.retriever = original_retriever

    return answer


  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [8]:
from langchain.schema import Document

def convert_results_to_documents(results):
    # Assuming the results is a string that contains important details.
    # For simplicity, here we split the text and assign title and content
    return [
        Document(
            page_content=result.strip(),  # Use the entire text or extract a part of it
            metadata={"title": "Unknown Title", "source": "Unknown Source"}  # You can improve this extraction later
        )
        for result in results.split("\n")  # Split text into different lines for processing each part
    ]

In [9]:
def generate_web_answer(query: str) -> str:
    results = web_search.run(query, num_results=10)
    docs = convert_results_to_documents(results)
    top_docs = rerank_documents(query, docs)
    
    original_retriever = qa_chain.retriever
    qa_chain.retriever = FakeRetriever(top_docs)
    answer = qa_chain.run(query)
    qa_chain.retriever = original_retriever

    return answer


In [10]:
from langchain.agents import initialize_agent, Tool
from langchain.agents.agent_types import AgentType
# Define tools
tools = [
    Tool(name="WebSearch", func=web_search.run, description="Search the web for information.")
]
# Initialize the agent with memory
agent = initialize_agent(
    tools, 
    llm, 
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, 
    verbose=True,
    memory=memory  # Keep memory of past conversations
)



  agent = initialize_agent(


In [14]:
def smart_agent_run(query):
    local_answer = generate_answer(query)
    
    # If local answer is good, use it
    if local_answer and "i don't know" not in local_answer.lower() and len(local_answer.strip()) > 20:
        return local_answer
    print("Fall back to Web Search")
    web_answer = generate_web_answer(query)
    return web_answer
    
    # Else delegate to agent (which can pick tools itself)
    #return agent.run(query)

# Usage
response = smart_agent_run("How is academic standing determined based on CPI and failed courses?")
print(response)
#If a student has a CPI of 8.2 and no backlog, what will be their academic standing?

Llama.generate: 350 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   27291.56 ms
llama_perf_context_print: prompt eval time =   12039.58 ms /     2 tokens ( 6019.79 ms per token,     0.17 tokens per second)
llama_perf_context_print:        eval time =  176930.47 ms /   132 runs   ( 1340.38 ms per token,     0.75 tokens per second)
llama_perf_context_print:       total time =  179524.19 ms /   134 tokens


 The academic standing of a student is determined based on their CPI, which takes into account all courses they have taken and earned grades for. If a student has no backlog courses (failed courses that have not been cleared) and their CPI is equal to or greater than 8.0, they are in Category I, which is the highest academic standing category. However, if a student has failed courses, their CPI will reflect the failed status until those courses are cleared. Once a course is cleared by earning a pass grade on a subsequent registration, the CPI will only reflect the new grade earned and not the previous fail grades.


In [18]:
query = "What all technical clubs are there in IITB?"
results = web_search.run(query,num_results = 10)
print(results)

Campus of IIT Bombay is beautiful. Nostalgia hits every time you enter through the main entrance. ... and the lessons I have learnt here will go a long way. Maximum students here are quite nice. There are a lot of cultural clubs, technical clubs, sport clubs, etc. We of course cannot forget Asia's largest institute cultural fest 'Mood Indigo ... The oldest and best established clubs in IIT Bombay, the Quiz Club meets every Wednesday at 9:00 p. m. in the Hostel-8 lounge. ... From cultural festivals like Mood Indigo to technical competitions like Techfest, there's never a dull moment on campus. The plethora of clubs and societies cater to a wide range of interests, ensuring that every ... We need all clubs to step up and disseminate information about your club for easy understanding of everyone. You may have excellent presence in social media but those are all shortlived. Also, there is not much tech activity in the hostels and the tech culture is declining over the years, and these acti