In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Read the prospectus PDF; the loader yields one Document per page.
file_path = "1719227188415-Prospectus 2024-2025.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

# Report how many Documents were loaded as a quick sanity check.
page_count = len(docs)
print(page_count)

In [None]:
# Peek at the first loaded Document: a 200-character text preview
# (followed by a blank line) and then its metadata.
first_page = docs[0]
print(first_page.page_content[:200] + "\n")
print(first_page.metadata)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded Documents into small overlapping chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,        # maximum characters per chunk
    chunk_overlap=50,      # characters shared between neighbouring chunks
    add_start_index=True,  # ask the splitter to record each chunk's start offset
)
all_splits = text_splitter.split_documents(docs)

# Cell output: number of chunks produced.
len(all_splits)

In [None]:

from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorise both the document text and the user
# queries. The model name is kept in one place so it is easy to swap.
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
from langchain.vectorstores import FAISS

# Create FAISS vector store
db = FAISS.from_documents(docs, embeddings)

# Save the vectorstore locally
db.save_local("vector_db/college_docs_index")


In [None]:
ids = db.add_documents(documents=all_splits)

In [None]:
from langchain.llms import LlamaCpp

# Local quantised TinyLlama chat model served through llama.cpp.
llm = LlamaCpp(
    model_path="model/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    n_ctx=2048,        # context window passed to llama.cpp
    max_tokens=256,    # cap on generated tokens per call
    temperature=0.2,   # low temperature for more deterministic answers
    verbose=False,
)


In [None]:
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate

# Single human-message template holding the instructions, the user question
# and the retrieved context. Adjacent string literals are concatenated by
# Python, so each piece must carry its own separator: the instruction block
# now ends with a newline so that it no longer runs straight into
# "Question:" in the rendered prompt.
full_prompt = HumanMessagePromptTemplate.from_template(
    "You are a strict college chatbot. Only answer questions related to college academics, hostel, and admission. Do not answer unrelated questions (e.g., food, movies, games). If the context is not helpful, say: I don't know.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)

# Chat prompt consisting of just that one message; expects the variables
# `question` and `context`.
prompt = ChatPromptTemplate.from_messages([full_prompt])


# Render the prompt once with hand-written sample values to eyeball the
# final text that would be sent to the model.
sample_inputs = {
    "question": "What are the eligibility criteria for the M.Tech program?",
    "context": "The M.Tech admissions require a valid GATE score and a bachelor's degree in a relevant engineering stream with at least 60% marks.",
}
output_messages = prompt.invoke(sample_inputs).to_messages()

first_message = output_messages[0]
print(first_message.content)

In [None]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    """Shared state passed between the graph nodes (`retrieve`, `generate`)."""

    # The user's question; supplied by the caller when the graph is invoked
    # and read by both `retrieve` and `generate`.
    question: str
    # Documents selected by `retrieve`; `generate` joins their page_content
    # to build the prompt context.
    context: List[Document]
    # Final reply; written by `generate` (and by `retrieve` for its
    # "not related" fallback message).
    answer: str

In [None]:
def retrieve(state: State):
    """Fetch the chunks most relevant to the question from the vector store.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        A partial state update:
        * ``{"context": [...]}`` with the documents that passed the
          relevance filter, or
        * ``{"context": [], "answer": <fallback message>}`` when nothing
          relevant was found.
    """
    # FAISS' similarity_search_with_score returns (document, distance) pairs:
    # a LOWER score means a closer match. The filter therefore keeps scores
    # at or below the threshold (the previous `>=` kept the worst matches).
    # NOTE(review): 0.75 was chosen while the comparison was inverted;
    # re-tune it against real queries.
    THRESHOLD = 0.75
    docs_with_scores = db.similarity_search_with_score(state["question"], k=3)

    filtered_docs = [doc for doc, score in docs_with_scores if score <= THRESHOLD]

    if not filtered_docs:
        # Keep `context` a list so it matches the State schema.
        return {
            "context": [],
            "answer": "Sorry, that question is not related to college information. Please ask about admissions, hostels, or academics.",
        }

    # Previously this path fell off the end and returned None, so `context`
    # was never written and the next node failed on state["context"].
    return {"context": filtered_docs}


def generate(state: State):
    """Answer the question with the LLM, grounded in the retrieved context.

    Args:
        state: Graph state; reads ``question``, ``context`` and (when no
            context is available) any ``answer`` already set upstream.

    Returns:
        A partial state update ``{"answer": ...}``.
    """
    context_docs = state.get("context")
    if not context_docs:
        # Nothing relevant was retrieved (or `context` was never set).
        # Do not call the model with an empty context, and do not overwrite
        # a fallback answer that `retrieve` may already have written.
        return {"answer": state.get("answer") or "I don't know."}

    docs_content = "\n\n".join(doc.page_content for doc in context_docs)
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response}

In [None]:
from langgraph.graph import START, StateGraph

# Wire the two-step pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

# Compile the builder into a runnable graph.
graph = graph_builder.compile()

In [None]:
# Run the full pipeline on a sample question and show the final answer.
question = "Is kettle allowed in the hostel?"
result = graph.invoke({"question": question})
print(result["answer"])