In [8]:
from docx import Document
from pptx import Presentation
from bs4 import BeautifulSoup
from transformers import pipeline
import pdfplumber
import pytesseract
from PIL import Image
import os
import json

# 1. Extract text from different file types
def scrape_file(filepath):
    """Extract plain text from a file, choosing the parser by file extension.

    Supported extensions are .docx, .pptx, .html, .pdf, .png/.jpg/.jpeg
    (via OCR) and .txt. The extension is compared case-insensitively, so
    ``REPORT.PDF`` is handled the same way as ``report.pdf``.

    Args:
        filepath: Path to the input file, as a string.

    Returns:
        The extracted text. For HTML, PDF, image and TXT inputs a failure is
        reported as a string beginning with ``"Error processing"`` instead of
        an exception. An unrecognised extension yields
        ``"Unsupported file type."``.

    Raises:
        Whatever the .docx / .pptx readers raise: those two branches are not
        wrapped in a try/except, so their exceptions reach the caller.
    """
    # Match against a lowercased copy so upper/mixed-case extensions work;
    # the original path is still what gets opened.
    lowered = filepath.lower()

    if lowered.endswith(".docx"):
        doc = Document(filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    elif lowered.endswith(".pptx"):
        prs = Presentation(filepath)
        full_text = []
        for slide in prs.slides:
            for shape in slide.shapes:
                # Only shapes that expose a non-empty text frame carry text.
                if hasattr(shape, "text_frame") and shape.text_frame:
                    full_text.append(shape.text_frame.text)
        return "\n".join(full_text)

    elif lowered.endswith(".html"):
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator=" ", strip=True)
        except Exception as e:
            return f"Error processing HTML file: {e}"

    elif lowered.endswith(".pdf"):
        try:
            with pdfplumber.open(filepath) as pdf:
                page_texts = []
                for page in pdf.pages:
                    # Extract once per page and reuse the result; pages
                    # that produce no text (falsy result) are skipped.
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)
                return "\n".join(page_texts)
        except Exception as e:
            return f"Error processing PDF file: {e}"

    elif lowered.endswith((".png", ".jpg", ".jpeg")):
        try:
            # The context manager closes the image file handle after OCR.
            with Image.open(filepath) as image:
                text = pytesseract.image_to_string(image)
            return text.strip()
        except Exception as e:
            return f"Error processing image file: {e}"

    elif lowered.endswith(".txt"):
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            return f"Error processing TXT file: {e}"

    else:
        return "Unsupported file type."


# 2. Saves extracted text as a normalized JSON file in the output folder
def save_extracted_text(filepath, extracted_text, output_dir=None):
    """Write the extracted text of one file to ``<name>.json``.

    The JSON file is named after the input file with only its final
    extension replaced, so ``report.v2.pdf`` becomes ``report.v2.json``
    rather than being truncated at the first dot.

    Args:
        filepath: Path of the source file; stored under the ``"file"`` key
            and used to derive the output filename.
        extracted_text: Text to store under the ``"content"`` key.
        output_dir: Directory to write into. When omitted, the module-level
            ``output_folder`` variable is used.

    Returns:
        None. The saved path is printed.
    """
    if output_dir is None:
        # Fall back to the global destination used by existing callers.
        output_dir = output_folder

    # splitext strips only the last extension, keeping dotted base names intact.
    stem = os.path.splitext(os.path.basename(filepath))[0]
    filename = stem + ".json"  # Convert filename

    # Create the destination on demand so the first save does not fail.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)

    data = {"file": filepath, "content": extracted_text}  # JSON structure
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Saved: {output_path}")

# 3. Load the QA model
# Built once at module level: answer_question() reads this global on every
# call, so the same pipeline object is reused rather than being constructed
# per question. The checkpoint is selected by the `model` argument.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# 4. Function to answer questions based on extracted text
def answer_question(filepath, question, max_words=512):
    """Answer a question using the text extracted from a file.

    Args:
        filepath: Path of the file to read; passed to ``scrape_file``.
        question: Natural-language question to ask about the file's text.
        max_words: Maximum number of whitespace-separated words of the
            extracted text to hand to the QA pipeline as context. Pass
            ``None`` to use the whole text.

    Returns:
        The pipeline's ``"answer"`` string, or, when no usable text is
        available, a message string: the error / unsupported-type message
        produced by ``scrape_file``, or an error message when the file
        yielded no text at all.
    """
    extracted_text = scrape_file(filepath)

    # scrape_file reports failures as strings starting with
    # "Error processing ". Matching that full prefix (not just "Error")
    # avoids mistaking a document that merely begins with the word
    # "Error" for a failure.
    if extracted_text.startswith("Error processing ") or extracted_text == "Unsupported file type.":
        return extracted_text

    words = extracted_text.split()
    if not words:
        # Nothing to search (e.g. a file with no extractable text); return a
        # message instead of passing an empty context to the pipeline.
        return "Error: no text could be extracted from the file."

    # Cap the context by word count; this is a word limit, not a token limit.
    if max_words is not None:
        words = words[:max_words]
    context = " ".join(words)

    response = qa_pipeline(question=question, context=context)
    return response["answer"]

# Example run: extract the text of one PDF, save it as JSON, then ask a
# single question about it.
filepath = "WorldWar.pdf"
extracted_text = scrape_file(filepath)
# NOTE(review): save_extracted_text() reads a global `output_folder` that is
# not defined in this cell — presumably set in an earlier cell; confirm.
save_extracted_text(filepath, extracted_text)
question = "What were the main causes of World War I?"
# answer_question() calls scrape_file() itself, so the file is parsed a
# second time here rather than reusing `extracted_text`.
answer = answer_question(filepath, question)
print(answer)

Device set to use cpu


France, Britain, Russia, and later the United States
