In [2]:
import os
import PyPDF2
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Check if GPU is available, and use it if possible
device = 0 if torch.cuda.is_available() else -1  # -1 means CPU, 0 means GPU

# Initialize the model and tokenizer from Hugging Face
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Initialize the QA pipeline with the correct device
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

# Directory containing PDF files
pdf_directory = "pdf"

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

# Function to load all PDFs in a directory
def load_pdfs_from_directory(directory):
    pdf_texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory, filename)
            pdf_text = extract_text_from_pdf(pdf_path)
            if pdf_text.strip():
                pdf_texts.append(pdf_text)
    return pdf_texts

# Function to answer a question using the PDF contents
def answer_question(question, pdf_texts):
    # Combine all PDF text into one context
    context = " ".join(pdf_texts)
    
    # Use the QA pipeline to answer the question
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Main interaction loop
def chat_with_pdf_bot():
    pdf_texts = load_pdfs_from_directory(pdf_directory)
    if not pdf_texts:
        print("No PDFs found or extracted text is empty.")
        return
    
    print("PDF-based Legal Chatbot is ready! You can ask questions.")
    print("Type 'exit' to quit the chat.")
    
    while True:
        # User asks a question
        question = input("You: ")
        if question.lower() == 'exit':
            print("Goodbye!")
            break
        
        # Answer the question based on PDF content
        answer = answer_question(question, pdf_texts)
        print(f"Bot: {answer}")

# Run the chatbot
if __name__ == "__main__":
    chat_with_pdf_bot()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PDF-based Legal Chatbot is ready! You can ask questions.
Type 'exit' to quit the chat.
Bot: elevations
Bot: elevations
Goodbye!
