In [1]:
import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the NLTK "punkt" tokenizer data (reported as up-to-date when already installed)
nltk.download("punkt")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:

def load_glove_embeddings(glove_file):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every usable line holds a token followed by its whitespace-separated
    float components. Lines that are blank, or that contain a token but no
    components, are skipped instead of raising or storing an empty vector.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be converted to ``float32``.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line would make values[0] raise IndexError, and a
            # token-only line would register a zero-length vector.
            if len(values) < 2:
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f"Loaded {len(embeddings_index)} word vectors.")
    return embeddings_index

# Location of the extracted GloVe vectors; point this at your local copy
glove_file = "./glove.6B/glove.6B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file=glove_file)


Loaded 400000 word vectors.


In [3]:

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# FLAN-T5 tokenizer and weights are read from a local directory only
custom_model_dir = "./flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(custom_model_dir, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(custom_model_dir, local_files_only=True)
model = model.to(device)


In [14]:

def generate_answer_with_flan(query, context):
    """Generate an answer to ``query`` from ``context`` with the loaded FLAN-T5 model.

    The prompt has the form ``"Context: <context> Question: <query>"`` and is
    limited to 512 input tokens. Because the question sits at the END of the
    prompt, plain right-truncation of an over-long prompt would cut the
    question off; the context is therefore shortened first so the question
    stays in the prompt.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        query: The user's question (str).
        context: Supporting text the answer should be drawn from (str).

    Returns:
        The first decoded sequence produced by beam search (str).
    """
    max_input_tokens = 512
    prompt = f"Context: {context} Question: {query}"

    prompt_length = len(tokenizer(prompt)["input_ids"])
    if prompt_length > max_input_tokens:
        # Drop (approximately) the overflowing number of tokens from the tail
        # of the context rather than from the tail of the whole prompt.
        overflow = prompt_length - max_input_tokens
        context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
        keep = max(len(context_ids) - overflow, 0)
        context = tokenizer.decode(context_ids[:keep], skip_special_tokens=True)
        prompt = f"Context: {context} Question: {query}"

    # truncation=True stays on as a safety net (e.g. when the question alone
    # exceeds the limit or the token counts do not add up exactly).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens).to(device)
    outputs = model.generate(**inputs, max_length=150, num_beams=5, early_stopping=True)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

def get_glove_vector(word, glove_embeddings, embedding_dim=300):
    """Look up the embedding of ``word``, falling back to a zero vector.

    Args:
        word: Token to look up.
        glove_embeddings: Mapping from token to its NumPy vector.
        embedding_dim: Length of the zero vector returned for unknown tokens.

    Returns:
        The stored vector when ``word`` is present; otherwise a fresh
        ``float32`` zero vector of length ``embedding_dim``.
    """
    vector = glove_embeddings.get(word)
    if vector is None:
        # float32 matches the dtype the embeddings are loaded with, so
        # fallback and real vectors can be stacked without upcasting.
        return np.zeros(embedding_dim, dtype='float32')
    return vector

def sentence_to_glove_vector(sentence, glove_embeddings, embedding_dim=300):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is lower-cased and tokenized with ``nltk.word_tokenize``;
    tokens missing from ``glove_embeddings`` are ignored. Tensors are created
    on the module-level ``device``.

    Args:
        sentence: Text to embed.
        glove_embeddings: Mapping from token to its NumPy vector.
        embedding_dim: Size of the zero tensor returned when no token is known.

    Returns:
        A 1-D tensor: the element-wise mean of the known tokens' vectors, or
        zeros of length ``embedding_dim`` when none of the tokens is known.
    """
    known_tokens = [
        token
        for token in nltk.word_tokenize(sentence.lower())
        if token in glove_embeddings
    ]

    if len(known_tokens) == 0:
        return torch.zeros(embedding_dim, device=device)

    token_tensors = []
    for token in known_tokens:
        raw_vector = get_glove_vector(token, glove_embeddings, embedding_dim)
        token_tensors.append(torch.tensor(raw_vector, device=device))

    return torch.stack(token_tensors).mean(dim=0)

def document_to_glove_vectors(document_sentences, glove_embeddings, embedding_dim=300):
    """Embed every sentence and keep only the non-zero sentence vectors.

    Args:
        document_sentences: Iterable of sentence strings.
        glove_embeddings: Mapping from token to its NumPy vector.
        embedding_dim: Embedding size forwarded to ``sentence_to_glove_vector``.

    Returns:
        List of sentence vectors in input order. Sentences whose vector has
        zero norm are omitted, so the list may be shorter than the input.
    """
    embedded = (
        sentence_to_glove_vector(text, glove_embeddings, embedding_dim)
        for text in document_sentences
    )
    return [candidate for candidate in embedded if candidate.norm() > 0]

def find_relevant_context(query, document_vectors, document_sentences, glove_embeddings, top_k=5):
    """Return the sentences most similar to ``query``, joined by spaces.

    Similarity is the cosine between the query's averaged GloVe vector and
    each entry of ``document_vectors``. The resulting ranks are used to index
    ``document_sentences``, so the caller must pass the two sequences
    position-aligned (``document_vectors[i]`` embeds ``document_sentences[i]``).

    Args:
        query: The user's question.
        document_vectors: List of 1-D sentence tensors.
        document_sentences: Sentences corresponding to ``document_vectors``.
        glove_embeddings: Mapping from token to its NumPy vector.
        top_k: Maximum number of sentences to return (default 5).

    Returns:
        The selected sentences joined with single spaces, ordered from most
        to least similar, or ``"No relevant context found."`` when there is
        nothing to rank.
    """
    no_context = "No relevant context found."

    if len(document_vectors) == 0:
        return no_context

    query_vector = sentence_to_glove_vector(query, glove_embeddings)
    # A query without any in-vocabulary word is the zero vector; all cosine
    # scores would then be 0 and the ranking meaningless.
    if query_vector.norm() == 0:
        return no_context

    document_vectors_tensor = torch.stack(document_vectors).to(device)
    similarities = torch.nn.functional.cosine_similarity(query_vector.unsqueeze(0), document_vectors_tensor)

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(top_k, similarities.numel())
    if k <= 0:
        return no_context

    top_indices = torch.topk(similarities, k=k).indices.tolist()
    return " ".join(document_sentences[i] for i in top_indices)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string with the text of all pages (empty if no page yields
        any text).
    """
    # The context manager closes the document even if a page fails to load.
    with fitz.open(pdf_path) as doc:
        page_texts = [doc.load_page(page_num).get_text() for page_num in range(len(doc))]
    return "".join(page_texts)

def chatbot_with_glove_and_flan(pdf_path, glove_embeddings):
    """Run an interactive question-answering loop over the text of a PDF.

    The PDF text is split into sentences, each sentence is embedded with
    GloVe, and for every question the most similar sentences are passed to
    FLAN-T5 as context. Typing ``exit``, ``quit`` or ``end`` stops the loop.

    Args:
        pdf_path: Path of the PDF to chat about.
        glove_embeddings: Mapping from token to its NumPy vector.

    Returns:
        None. Progress, errors and answers are printed.
    """
    document_text = extract_text_from_pdf(pdf_path)
    if not document_text.strip():
        print("Extracted text is empty. Please check the PDF file.")
        return

    document_sentences = sent_tokenize(document_text.replace("\n", " "))
    if not document_sentences:
        print("No sentences found in the document.")
        return

    # Embed sentence by sentence and drop the sentence together with its
    # zero vector. The ranking step indexes sentences by vector position, so
    # both lists must stay the same length and order.
    kept_sentences = []
    document_vectors = []
    for sentence in document_sentences:
        vector = sentence_to_glove_vector(sentence, glove_embeddings)
        if vector.norm() > 0:
            kept_sentences.append(sentence)
            document_vectors.append(vector)

    if len(document_vectors) == 0:
        print("No valid document vectors found. Exiting.")
        return

    print("Chatbot is ready! Ask your questions.\n")

    while True:
        query = input("You: ").strip()
        if query.lower() in ['exit', 'quit', 'end']:
            break
        if not query:
            # Nothing to answer; ask again instead of querying the model.
            continue

        relevant_context = find_relevant_context(query, document_vectors, kept_sentences, glove_embeddings)
        answer = generate_answer_with_flan(query, relevant_context)
        print(f"Chatbot: {answer}\n")


In [15]:

# Example usage
# Raw string: backslashes in a Windows path must not be read as escape
# sequences (the plain literal triggered an invalid-escape warning).
pdf_path = r"D:\sem-7\Blockchain\Theory\Introduction to Blockchain.pdf"
chatbot_with_glove_and_flan(pdf_path, glove_embeddings)


  pdf_path = "D:\sem-7\Blockchain\Theory\Introduction to Blockchain.pdf"


Chatbot is ready! Ask your questions.



Chatbot: What is the name of the project?

