<a href="https://colab.research.google.com/github/AmRo1011/AIE314-Tutorial1/blob/main/Ai_based_Lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install python-docx python-pptx PyMuPDF pandas beautifulsoup4 transformers

In [None]:
from docx import Document
from pptx import Presentation
import pandas as pd
import fitz  # PyMuPDF
from bs4 import BeautifulSoup
import os
import re
import json
from datetime import datetime
from transformers import pipeline

In [None]:
def extract_text_from_docx(file_path):
    """Return the text of a Word document, one line per paragraph.

    Args:
        file_path: Location of the .docx file to open.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    paragraphs = Document(file_path).paragraphs

    lines = []
    for para in paragraphs:
        lines.append(para.text)

    return '\n'.join(lines)

In [None]:
def extract_text_from_pptx(file_path):
    """Return the text of a PowerPoint file, one line per shape.

    Args:
        file_path: Location of the .pptx file to open.

    Returns:
        The ``text`` of every shape that has such an attribute, in slide
        order, joined with newline characters.
    """
    deck = Presentation(file_path)

    # Shapes without a ``text`` attribute are skipped.
    shape_texts = [
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]

    return '\n'.join(shape_texts)

def find_pattern_in_text(text, pattern):
    """Return every non-overlapping match of ``pattern`` found in ``text``.

    Args:
        text: The string to search.
        pattern: A regular expression accepted by the ``re`` module.

    Returns:
        The list produced by ``findall``: matched strings, or groups/tuples
        of groups when the pattern contains capturing groups.
    """
    return re.findall(pattern, text)

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string made of each page's ``get_text()`` result, with no
        extra separator added between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        # join() avoids rebuilding the growing string once per page.
        return ''.join(page.get_text() for page in document)

In [None]:
def extract_text_from_excel(excel_path):
    """Return every sheet of an Excel workbook rendered as plain text.

    Args:
        excel_path: Location of the workbook to open with pandas.

    Returns:
        Each sheet's ``DataFrame.to_string(index=False)`` output, in the
        workbook's sheet order, joined with newline characters.
    """
    # Opening the workbook in a ``with`` block releases the underlying file
    # handle once all sheets are read, including when a sheet fails to parse.
    with pd.ExcelFile(excel_path) as excel_file:
        sheets_text = [
            pd.read_excel(excel_file, sheet_name=sheet_name).to_string(index=False)
            for sheet_name in excel_file.sheet_names
        ]

    return "\n".join(sheets_text)

def extract_text_from_html_file(file_path):
    """Return the text content of an HTML file with the markup removed.

    Args:
        file_path: Location of the HTML file; it is read as UTF-8.

    Returns:
        The result of ``BeautifulSoup(...).get_text()`` for the file's
        contents, parsed with the ``'html.parser'`` backend.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    return soup.get_text()

In [None]:
def get_file_type(file_path):
    """Map a file path to one of the supported document type names.

    The decision is based only on the path's extension, compared
    case-insensitively.

    Args:
        file_path: Path or file name to classify.

    Returns:
        ``'docx'``, ``'pptx'``, ``'pdf'``, ``'xlsx'`` or ``'html'`` for the
        matching extension, otherwise ``None``.
    """
    known_types = {
        '.docx': 'docx',
        '.pptx': 'pptx',
        '.pdf': 'pdf',
        '.xlsx': 'xlsx',
        '.html': 'html',
    }
    suffix = os.path.splitext(file_path)[1].lower()
    return known_types.get(suffix)

def extract_text_from_file(file_path):
    """Extract text from a file using the extractor that matches its type.

    The type is determined by ``get_file_type``; the path is then handed to
    the corresponding ``extract_text_from_*`` function.

    Args:
        file_path: Location of the document to read.

    Returns:
        The extractor's result, or the literal string
        ``"Unsupported file type"`` when the type is not recognised.
    """
    extractors = {
        'docx': extract_text_from_docx,
        'pptx': extract_text_from_pptx,
        'pdf': extract_text_from_pdf,
        'xlsx': extract_text_from_excel,
        'html': extract_text_from_html_file,
    }

    handler = extractors.get(get_file_type(file_path))
    if handler is None:
        # Callers receive a message string rather than an exception.
        return "Unsupported file type"
    return handler(file_path)

def convert_to_json(original_file_path, extracted_text, output_path=None):
    """Save extracted text and its metadata as a JSON file.

    Args:
        original_file_path: Path of the document the text came from.
        extracted_text: The text to store.
        output_path: Where to write the JSON. When ``None``, the original
            path is reused with its extension replaced by ``.json``.

    Returns:
        The path of the JSON file that was written.
    """
    detected_type = get_file_type(original_file_path)

    if output_path is None:
        stem = os.path.splitext(original_file_path)[0]
        output_path = f"{stem}.json"

    payload = {
        'file_type': detected_type,
        'file_path': original_file_path,
        'extracted_text': extracted_text,
        # Local wall-clock time of the export, ISO 8601 formatted.
        'extraction_date': datetime.now().isoformat(),
    }

    # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
    with open(output_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=4)

    return output_path

In [None]:
# Pin the checkpoint and revision explicitly. These are the values the
# pipeline reported falling back to when none were given, so the model used
# is unchanged while re-runs no longer depend on the library's default.
QA_MODEL_NAME = "distilbert/distilbert-base-cased-distilled-squad"
QA_MODEL_REVISION = "564e9b5"

qa_pipeline = pipeline(
    "question-answering",
    model=QA_MODEL_NAME,
    revision=QA_MODEL_REVISION,
)

def answer_question(question, context):
    """Answer a question from a passage using the module-level QA pipeline.

    Args:
        question: The question to ask.
        context: The text in which the answer is searched.

    Returns:
        The value stored under the ``'answer'`` key of the pipeline result.
    """
    result = qa_pipeline(question=question, context=context)
    return result['answer']

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [None]:
# End-to-end run: extract text from one document, save it as JSON, then ask a question about it.
file_path = "/content/The Reign of King John.pdf"
# The extractor is selected from the file extension.
# NOTE(review): for an unrecognised extension this returns the string
# "Unsupported file type", which would then be saved and used as QA context.
extracted_text = extract_text_from_file(file_path)
# With no output path given, the JSON is written beside the source file (same name, .json extension).
json_output_path = convert_to_json(file_path, extracted_text)
# Prints the complete extracted text, so the cell output grows with the document.
print(f"Extracted Text:\n{extracted_text}")
print(f"JSON file saved at: {json_output_path}")

# The whole extracted text is passed to the QA pipeline as context.
question = "What were the reasons for King John's excommunication?"
answer = answer_question(question, extracted_text)
print(f"Answer: {answer}")

Extracted Text:
The Reign of King John (1199–1216) 
Early Years and Ascent to the Throne 
●​
Born in 1166 to King Henry II and Eleanor of Aquitaine 
●​
Known as "Lackland" for his lack of inherited lands 
●​
Secured the throne after the death of his brother Richard I in 1199 
Key Titles and Positions 
1.​ King of England (1199–1216): Consolidated royal authority but faced baronial opposition 
2.​ Duke of Normandy (1199–1204): Lost normandy to Philip II of France 
3.​ Lord of Ireland (1177–1216): Oversaw English interests in Ireland 
4.​ Count of Anjou (1199–1204): Part of the Angevin Empire 
5.​ Duke of Aquitaine (1199–1204): Held through his mother Eleanor 
Notable Achievements and Events 
●​
Administration: Strengthened bureaucracy and taxation systems 
●​
Magna Carta: Forced to sign the charter in 1215, limiting royal power 
●​
Legal Reforms: Established courts and introduced trial by jury 
●​
Military Campaigns: Failed to retake Normandy but succeeded in Ireland 
●​
Church Relation