In [None]:
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import RetrievalQA

# --- Ingestion ---------------------------------------------------------
# Relative path: the PDF has to be present in the working directory
# before this cell is executed.
pdf_path = "sm.pdf"
loader = PDFPlumberLoader(file_path=pdf_path)
docs = loader.load()

# --- Embeddings --------------------------------------------------------
# The same embedding model is handed to both the chunker and the vector
# store below.
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# --- Chunking ----------------------------------------------------------
# Break the loaded documents into chunks using the embedding-driven splitter.
text_splitter = SemanticChunker(embeddings=embedder)
documents = text_splitter.split_documents(docs)

# --- Vector index and retriever ----------------------------------------
# Embed every chunk into a FAISS index, then expose it as a retriever that
# returns the 3 most similar chunks for a query.
vector = FAISS.from_documents(documents=documents, embedding=embedder)
retriever = vector.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# --- Language model ----------------------------------------------------
llm = Ollama(model="deepseek-r1:1.5b")

# Define the prompt
# The three rules form one uninterrupted numbered list. {context} is filled
# with the retrieved chunks, each already rendered (and labelled "Context:")
# by `document_prompt` below, so no extra "Context:" label is added here;
# {question} is filled with the user's query.
prompt = """
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.
3. Keep the answer crisp and limited to 3,4 sentences.
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt)

# Chain that formats QA_CHAIN_PROMPT and sends it to the LLM; verbose=True
# prints the formatted prompt on each run.
llm_chain = LLMChain(
    llm=llm,
    prompt=QA_CHAIN_PROMPT,
    callbacks=None,
    verbose=True,
)

# How a single retrieved chunk is rendered before being placed in the prompt:
# its text plus the `source` entry from its metadata.
document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)

# Renders every retrieved chunk with `document_prompt` and passes the result
# to `llm_chain` as the "context" prompt variable.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context",
    document_prompt=document_prompt,
    callbacks=None,
)

# End-to-end question answering: retrieve chunks, then answer from them.
# return_source_documents=True asks the chain to include the retrieved
# documents in its output alongside the answer.
qa = RetrievalQA(
    combine_documents_chain=combine_documents_chain,
    verbose=True,
    retriever=retriever,
    return_source_documents=True,
)

# User input
# Surrounding whitespace is stripped so that a blank or whitespace-only entry
# is treated as "no question" instead of triggering a retrieval + LLM call.
user_input = input("Ask a question related to the PDF: ").strip()

# Process user input
if user_input:
    # Use `invoke` rather than calling the chain object directly (the
    # direct-call form is deprecated). The question is passed under the
    # "query" key and the answer is read from the "result" key.
    response = qa.invoke({"query": user_input})["result"]
    print("Response:")
    print(response)



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.

3. Keep the answer crisp and limited to 3,4 sentences.
Context: Context:
content:Birendra Sharma birendrasharma0226@gmail.com
ML Intern 9817788903
A dedicated and driven student eager to join the workforce and gain lahan-24, lahan, Nepal
practical experience in Machine Learning. Proven ability to complete
tasks efficiently, both independently and collaboratively.
source:Resume.pdf

Context:
content:Dependable, @https://x.com/ardnerib4
reliable, and committed to learning and growing while contributing to
the success of your organization. github.com/birendra-dev
medium.com/@birendrasharma0226
EDUCATION SKILLS
Bachelor of Computer Engineering
Python Flask Django Pandas
Tribhuvan Universit

In [None]:
# # python
# import os

# def load_documents(directory):
#     documents = []
#     for filename in os.listdir(directory):
#         if filename.endswith('.txt'):
#             with open(os.path.join(directory, filename), 'r') as file:
#                 documents.append(file.read())
#     return documents

# documents = load_documents('files')
# # # python
# # %pip install faiss-cpu

# from transformers import AutoTokenizer, AutoModel
# import torch
# import faiss
# import numpy as np

# # Initialize the embeddings model
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# # Function to generate embeddings
# def embed(text):
# 	inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
# 	with torch.no_grad():
# 		outputs = model(**inputs)
# 	return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# # Generate embeddings for all documents
# document_embeddings = [embed(doc) for doc in documents]
# document_embeddings = np.array(document_embeddings).astype('float32')

# # Create FAISS index
# index = faiss.IndexFlatL2(document_embeddings.shape[1])  # L2 distance metric
# index.add(document_embeddings)  # Add document embeddings to the index
# # python
# class SimpleRetriever:
#     def __init__(self, index, embed_function):
#         self.index = index
#         self.embed_function = embed_function
    
#     def retrieve(self, query, k=3):
#         query_embedding = self.embed_function(query)
#         distances, indices = self.index.search(np.array([query_embedding]).astype('float32'), k)
#         return [documents[i] for i in indices[0]]

# retriever = SimpleRetriever(index, embed)
# # python
# from langchain_ollama import OllamaLLM
# from string import Template

# # Instantiate the model
# llm = OllamaLLM(model="deepseek-r1:1.5b")

# # Craft the prompt template using string.Template for better readability
# prompt_template = Template("""
# Use ONLY the context below.
# If unsure, say "I don't know".
# Keep answers under 4 sentences.

# Context: $context
# Question: $question
# Answer:
# """)

# # python
# def answer_query(question):
#     # Retrieve relevant context from the knowledge base
#     context = retriever.retrieve(question)
    
#     # Combine retrieved contexts into a single string (if multiple)
#     combined_context = "\n".join(context)
    
#     # Generate an answer using DeepSeek R1 with the combined context
#     response = llm.generate(prompt_template.substitute(context=combined_context, question=question))
    
#     return response.strip()
# # python
# if __name__ == "__main__":
#     user_question = "What are the key features of DeepSeek R1?"
#     answer = answer_query(user_question)
#     print("Answer:", answer)