In [1]:
import re
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange





In [2]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text

In [3]:
# Function to clean extracted text
def clean_text(text):
    # Remove newlines and unnecessary whitespace
    text = re.sub(r'\n+', ' ', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters
    text = re.sub(r'[^\w\s,.]', '', text)
    return text.strip()

In [4]:
# Function to split text into smaller chunks
def split_text(text, max_length=512):
    sentences = text.split('. ')
    chunks, chunk = [], []
    for sentence in sentences:
        if len(' '.join(chunk)) + len(sentence) <= max_length:
            chunk.append(sentence)
        else:
            chunks.append(' '.join(chunk))
            chunk = [sentence]
    if chunk:
        chunks.append(' '.join(chunk))
    return chunks

In [6]:
# Load and process the PDF
pdf_path = "./History_Ancient_Medieval_Nepal.pdf"
pdf_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_text(pdf_text)
text_chunks = split_text(cleaned_text)

In [7]:
# Using SentenceTransformer for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for each text chunk
embeddings = embedding_model.encode(text_chunks)


In [16]:
index = faiss.IndexFlatL2(embeddings.shape[1])

In [15]:
# Function to search for the nearest chunks in FAISS
def search_in_faiss(query, top_k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    return indices


In [20]:
# Define the prompt template
prompt_template = PromptTemplate(
   template = """
You are an assistant having the knowledge of the given PDF context, 
and you will try to answer as possible to the given context of the PDF. 
If you don't know the answer, don't try to make it up yourself.
Context: {context}
Question: {question}
Answer: 
"""
,input_variables=["context", "question"]
)

In [32]:
from langchain.llms.base import LLM
class SimpleHuggingFaceLLM:
    def __init__(self, model_name: str):
        self.pipeline = pipeline("text-generation", model=model_name)
    
    def generate(self, prompt: str) -> str:
        result = self.pipeline(prompt, max_length=100, num_return_sequences=1)
        return result[0]['generated_text']

In [31]:
# Initialize the custom Hugging Face model wrapper
llm = SimpleHuggingFaceLLM("distilbert-base-cased")

# Define a function to use the custom LLM for generating answers
def llm_answer(context: str, question: str) -> str:
    prompt = prompt_template.render(context=context, question=question)
    return llm.generate(prompt)

# Create the LLMChain with the prompt template and the LLM
llm_chain = LLMChain(prompt_template=prompt_template, llm=llm)

ValueError: "HuggingFaceLLM" object has no field "model_name"

In [25]:
# # Initialize the Hugging Face LLM
# llm_chain = LLMChain(prompt_template=prompt_template, llm=pipeline("text-generation", model="distilbert-base-cased"))

In [10]:
# Initialize Hugging Face QA pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Function to answer a question using QA model
def answer_question(question, context):
    return qa_pipeline(question=question, context=context)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [11]:
# Chatbot loop to engage in conversation
conversation_history = []

In [17]:
def chatbot_loop():
    while True:
        user_input = input("You: ")
        if user_input.lower() in ["exit", "quit"]:
            break

        # Retrieve relevant chunks based on the user query
        search_indices = search_in_faiss(user_input)
        relevant_chunks = [text_chunks[i] for i in search_indices[0]]

        # Generate an answer using the QA model
        answer = answer_question(user_input, ' '.join(relevant_chunks))

        # Print the response
        print("Chatbot:", answer['answer'])

        # Update conversation history
        conversation_history.append({"question": user_input, "answer": answer['answer']})


In [18]:
# Start chatbot loop
chatbot_loop()

You: What is Nepal
Chatbot: Tibet
You: Who is the king of nepal
Chatbot: Gupta
You: Who is the King of Nepal
Chatbot: Rajyabati
You: stop
Chatbot: 6g69 Ranipokhari 038rft
You: quit
