# In [2]:
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
import cv2
from pyzbar.pyzbar import decode
import numpy as np
from openpyxl import load_workbook
from docx import Document as DocxDocument
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOllama
from langchain.chains import RetrievalQA
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from uuid import uuid4
from email import policy
from email.parser import BytesParser
import difflib
import webbrowser

# Logo image displayed at the top of the UI.
logo_path = r"C:\Users\M\Desktop\RAKBANK.png"

# Module-level state: the vector store built by process_documents
# (stays None until documents have been processed).
vector_db = None

# Extensions accepted by the QR scanner upload widget.
QR_SUPPORTED_TYPES = ['.pdf', '.jpg', '.jpeg', '.png']

# Extensions accepted by the main upload widget: everything the QR scanner
# takes, plus the office/e-mail/text formats.
SUPPORTED_TYPES = QR_SUPPORTED_TYPES + ['.xlsx', '.xls', '.docx', '.eml', '.txt']

# Stylesheet for the CSS classes emitted by difflib.HtmlDiff tables.
HTML_DIFF_CSS = """
<style>
    table.diff {font-family:Courier; border:medium; width:100%;}
    .diff_header {background-color:#e0e0e0}
    td.diff_header {text-align:right}
    .diff_next {background-color:#c0c0c0}
    .diff_add {background-color:#aaffaa}
    .diff_chg {background-color:#ffff77}
    .diff_sub {background-color:#ffaaaa}
</style>
"""

class Document:
    """Plain text container exposing ``page_content`` and ``metadata``.

    Args:
        text: The extracted text, stored as ``page_content``.
        metadata: Optional mapping describing the text; a falsy value is
            replaced by a new empty dict.
    """

    def __init__(self, text, metadata=None):
        # A fresh dict per instance avoids sharing one mapping between objects.
        self.metadata = metadata if metadata else {}
        self.page_content = text

def _decode_qr_from_image(img):
    """Run the QR/barcode decoder over a PIL image and return its results."""
    # Normalise to 3-channel RGB first so grayscale, palette and RGBA inputs
    # all survive the channel-reversing slice below.
    rgb = np.array(img.convert("RGB"))
    return decode(rgb[:, :, ::-1].copy())


def _report_qr_payloads(decoded_objects, heading, results, opened_urls):
    """Append decoded payloads under *heading* and open any new web links."""
    if not decoded_objects:
        return
    results.append(heading)
    for obj in decoded_objects:
        # Payloads are arbitrary bytes; a stray non-UTF-8 byte must not abort
        # the whole file.
        data = obj.data.decode('utf-8', errors='replace')
        results.append(f"🔗 {data}")
        # NOTE(review): this opens links taken from uploaded (untrusted)
        # documents in a browser on the machine running the app. Kept because
        # it is the documented behaviour, but confirm it is really wanted.
        if data.startswith(('http://', 'https://')) and data not in opened_urls:
            webbrowser.open(data)
            opened_urls.append(data)


def extract_qr_code(files):
    """Process multiple files for QR codes and open websites.

    Args:
        files: Iterable of uploaded files (objects with a ``name`` path
            attribute, or plain path strings). ``None`` is treated as empty.

    Returns:
        A newline-joined report of every decoded payload, per-file errors and
        the URLs that were opened, or a fixed message when nothing was found.
    """
    results = []
    # A list (not a set) so the "Opened URLs" section keeps first-seen order.
    opened_urls = []

    for file in files or []:
        path = getattr(file, "name", file)
        try:
            if str(path).lower().endswith('.pdf'):
                pdf_document = fitz.open(path)
                try:
                    for page_num, page in enumerate(pdf_document, start=1):
                        pix = page.get_pixmap()
                        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                        _report_qr_payloads(
                            _decode_qr_from_image(img),
                            f"\n📄 {path} (Page {page_num}):",
                            results,
                            opened_urls,
                        )
                finally:
                    # Release the PDF even when a page fails to render.
                    pdf_document.close()
            else:
                with Image.open(path) as img:
                    decoded_objects = _decode_qr_from_image(img)
                _report_qr_payloads(decoded_objects, f"\n📷 {path}:", results, opened_urls)
        except Exception as e:
            # Best effort: one unreadable file must not stop the batch.
            results.append(f"\n❌ Error processing {path}: {str(e)}")

    if not results:
        return "No QR codes found in any documents"

    if opened_urls:
        results.append("\n\n🌐 Opened URLs:")
        results.extend(f"- {url}" for url in opened_urls)

    return "\n".join(results)

def extract_text_from_image(image_path):
    """OCR the image at *image_path* and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text produced by ``pytesseract.image_to_string``, or a string
        starting with ``"Image error: "`` if anything goes wrong.
    """
    try:
        # The context manager closes the underlying file handle once OCR is
        # done instead of leaving it to garbage collection.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        return f"Image error: {str(e)}"

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order, or a string starting with
        ``"PDF error: "`` if the file cannot be opened or read.
    """
    try:
        pdf_document = fitz.open(pdf_path)
        try:
            return "".join(page.get_text() for page in pdf_document)
        finally:
            # Always release the document, including when a page fails.
            pdf_document.close()
    except Exception as e:
        return f"PDF error: {str(e)}"

def extract_text_from_excel(excel_path):
    """Flatten every sheet of a workbook into plain text.

    Each row becomes one line with its non-empty cell values separated by a
    single space; rows from all sheets are emitted in workbook order.

    Args:
        excel_path: Path of the workbook to read.

    Returns:
        The flattened text, or a string starting with ``"Excel error: "`` if
        the workbook cannot be loaded or read.
    """
    try:
        # data_only=True reads cached formula results instead of formulas.
        wb = load_workbook(excel_path, data_only=True)
        try:
            return '\n'.join(
                # Skip empty cells so the literal word "None" never leaks
                # into the extracted text.
                ' '.join(str(cell) for cell in row if cell is not None)
                for sheet in wb
                for row in sheet.iter_rows(values_only=True)
            )
        finally:
            wb.close()
    except Exception as e:
        return f"Excel error: {str(e)}"

def extract_text_from_word(word_path):
    """Return the paragraph texts of a .docx file, one paragraph per line.

    Args:
        word_path: Path of the Word document to read.

    Returns:
        The paragraph texts joined by newlines, or a string starting with
        ``"Word error: "`` if the document cannot be opened or read.
    """
    try:
        paragraphs = DocxDocument(word_path).paragraphs
        lines = [paragraph.text for paragraph in paragraphs]
        return '\n'.join(lines)
    except Exception as e:
        return f"Word error: {str(e)}"

def extract_text_from_eml(eml_path):
    """Return the plain-text parts of an .eml message joined by newlines.

    Args:
        eml_path: Path of the e-mail file to parse.

    Returns:
        The text of every ``text/plain`` part in walk order, or a string
        starting with ``"EML error: "`` if the file cannot be read or parsed.
    """
    try:
        with open(eml_path, 'rb') as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
        text_parts = []
        for part in msg.walk():
            if part.get_content_type() != 'text/plain':
                continue
            try:
                # get_content() honours the part's declared charset rather
                # than assuming every body is UTF-8.
                text_parts.append(part.get_content())
            except (LookupError, UnicodeError):
                # Unknown or bogus charset label: fall back to a lossy UTF-8
                # decode instead of dropping the whole message.
                payload = part.get_payload(decode=True) or b''
                text_parts.append(payload.decode('utf-8', errors='replace'))
        return '\n'.join(text_parts)
    except Exception as e:
        return f"EML error: {str(e)}"

def extract_text_from_file(file):
    """Extract text from an uploaded file, dispatching on its extension.

    Args:
        file: An object with a ``name`` attribute holding the file path, or
            the path itself as a string.

    Returns:
        The extracted text; ``"Unsupported file type"`` for unknown
        extensions; or a string starting with ``"Error: "`` on failure.
    """
    try:
        path = getattr(file, "name", file)
        # Compare case-insensitively so "SCAN.PDF" or "notes.TXT" are accepted.
        lowered = str(path).lower()
        if lowered.endswith(".pdf"):
            return extract_text_from_pdf(path)
        elif lowered.endswith((".jpg", ".jpeg", ".png")):
            return extract_text_from_image(path)
        elif lowered.endswith((".xlsx", ".xls")):
            return extract_text_from_excel(path)
        elif lowered.endswith(".docx"):
            return extract_text_from_word(path)
        elif lowered.endswith(".eml"):
            return extract_text_from_eml(path)
        elif lowered.endswith(".txt"):
            try:
                # Prefer UTF-8 (dropping a BOM if present) so the result does
                # not depend on the platform's default encoding.
                with open(path, 'r', encoding='utf-8-sig') as f:
                    return f.read()
            except UnicodeDecodeError:
                # Not UTF-8: fall back to the platform default, lossily.
                with open(path, 'r', errors='replace') as f:
                    return f.read()
        else:
            return "Unsupported file type"
    except Exception as e:
        return f"Error: {str(e)}"

def compare_documents(doc1, doc2):
    """Build a side-by-side HTML diff of the text of two uploaded documents.

    Args:
        doc1: First uploaded file (falsy when nothing was uploaded).
        doc2: Second uploaded file (falsy when nothing was uploaded).

    Returns:
        An HTML fragment (stylesheet plus diff table), ``"Upload both
        documents"`` when either input is missing, or a string starting with
        ``"Comparison failed: "`` on error.
    """
    if not doc1 or not doc2:
        return "Upload both documents"

    try:
        text1 = extract_text_from_file(doc1)
        text2 = extract_text_from_file(doc2)

        differ = difflib.HtmlDiff()
        # make_table() yields only the <table> fragment. make_file() would
        # return a complete standalone HTML page (doctype/head/body), which
        # does not belong after a <style> block inside an HTML component.
        return HTML_DIFF_CSS + differ.make_table(
            text1.splitlines(),
            text2.splitlines(),
            fromdesc="Document 1",
            todesc="Document 2"
        )
    except Exception as e:
        return f"Comparison failed: {str(e)}"

def process_documents(files, urls, checklist_path):
    """Extract text from files and web pages and index it in a vector store.

    On success the module-level ``vector_db`` is replaced by a new Chroma
    collection built from the gathered documents.

    Args:
        files: Iterable of uploaded files; ``None`` is treated as empty.
        urls: Iterable of URL strings to fetch; ``None`` is treated as empty.
        checklist_path: Accepted for interface compatibility; not used here.

    Returns:
        A status string: ``"Processing successful"``, ``"No content found"``
        or ``"Processing error: ..."``. When some URLs could not be fetched,
        the first two are followed by a parenthesised list of what was skipped.
    """
    documents = []

    # No upload arrives as None; URL-only runs must still work.
    for file in files or []:
        text = extract_text_from_file(file)
        if text and text.strip():
            source = getattr(file, "name", file)
            documents.append(Document(text, metadata={"source": source}))

    skipped = []
    for url in urls or []:
        try:
            # A timeout keeps one unresponsive host from hanging the request.
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
            if response.status_code != 200:
                skipped.append(f"{url} (HTTP {response.status_code})")
                continue
            soup = BeautifulSoup(response.content, "html.parser")
            text = soup.get_text(separator=' ')
            documents.append(Document(text, metadata={"source": url}))
        except Exception as e:
            # Still best effort, but report the failure instead of hiding it.
            skipped.append(f"{url} ({e})")

    skipped_note = f" (skipped URLs: {'; '.join(skipped)})" if skipped else ""

    if not documents:
        return "No content found" + skipped_note

    try:
        global vector_db
        vector_db = Chroma.from_documents(
            documents=documents,
            embedding=OllamaEmbeddings(model="nomic-embed-text"),
            # Unique name so each run gets its own collection.
            collection_name=f"collection_{uuid4()}",
            persist_directory=None
        )
        return "Processing successful" + skipped_note
    except Exception as e:
        return f"Processing error: {str(e)}"

def ask_question(question):
    """Answer *question* from the documents indexed in ``vector_db``.

    Args:
        question: The question text to run through the retrieval QA chain.

    Returns:
        The chain's answer, ``"Process documents first"`` when no vector
        store has been built yet, or a string starting with ``"QA error: "``
        on failure.
    """
    if not vector_db:
        return "Process documents first"

    try:
        llm = ChatOllama(model="llama3.1")
        retriever = vector_db.as_retriever()
        chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
        )
        return chain.run(question)
    except Exception as e:
        return f"QA error: {str(e)}"

def populate_checklist(checklist_path):
    """Fill column B of an Excel checklist with answers to column A.

    Starting at row 2 of the active sheet, every non-empty cell in column A
    is run through the retrieval QA chain and the answer is written to
    column B of the same row. The workbook is then saved back to
    *checklist_path*.

    Args:
        checklist_path: Path of the workbook to read and overwrite.

    Returns:
        ``"Checklist populated"``, ``"Process documents first"`` when no
        vector store has been built yet, or a string starting with
        ``"Checklist error: "`` on failure.
    """
    if not vector_db:
        return "Process documents first"

    try:
        workbook = load_workbook(checklist_path)
        sheet = workbook.active
        chain = RetrievalQA.from_chain_type(
            llm=ChatOllama(model="llama3.1"),
            chain_type="stuff",
            retriever=vector_db.as_retriever(),
        )
        # Row 1 is skipped; only column A is read for questions.
        for (question_cell,) in sheet.iter_rows(min_row=2, max_col=1):
            question = question_cell.value
            if not question:
                continue
            sheet.cell(row=question_cell.row, column=2).value = chain.run(question)
        workbook.save(checklist_path)
        return "Checklist populated"
    except Exception as e:
        return f"Checklist error: {str(e)}"

# Build the Gradio UI. Components are declared in display order, so the
# statement order below defines the page layout; event handlers are wired at
# the end, once every component they reference exists.
with gr.Blocks(title="Document Processor", css=".gradio-container {max-width: 1200px !important}") as iface:
    # Header: logo image (loaded from logo_path) and page title.
    gr.Image(logo_path, label="", height=100)
    gr.Markdown("<h1 style='text-align: center'>AI Document Processor</h1>")
    
    # QR Code Section
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("## QR Code Scanner")
            # Upload is limited to the extensions in QR_SUPPORTED_TYPES.
            qr_upload = gr.Files(label="Upload Documents", 
                                file_types=QR_SUPPORTED_TYPES,
                                file_count="multiple")
            qr_button = gr.Button("Scan All QR Codes", variant="primary")
            qr_output = gr.Textbox(label="Scan Results", lines=8)
    
    # Document Comparison (collapsed by default)
    with gr.Accordion("Document Comparison Tools", open=False):
        with gr.Row():
            doc1 = gr.File(label="First Document")
            doc2 = gr.File(label="Second Document")
        compare_btn = gr.Button("Compare Documents")
        # Receives the HTML string returned by compare_documents.
        diff_output = gr.HTML()
    
    # Main Processing inputs: files, URLs and the checklist workbook path
    with gr.Row():
        files = gr.Files(label="Upload Documents", 
                       file_types=SUPPORTED_TYPES,
                       file_count="multiple")
        urls = gr.Textbox(label="Website URLs (comma-separated)",
                        placeholder="Enter URLs separated by commas")
        checklist = gr.Textbox(label="Checklist Path",
                             placeholder="Path to Excel checklist file")
    
    # Actions
    with gr.Row():
        process_btn = gr.Button("Process Documents", variant="primary")
        with gr.Column(scale=2):
            question_input = gr.Textbox(label="Ask a Question",
                                      placeholder="Type your question here...")
            ask_btn = gr.Button("Get Answer")
        checklist_btn = gr.Button("Populate Checklist", variant="secondary")
    
    # Outputs: one status box per action button
    with gr.Row():
        status = gr.Textbox(label="Processing Status")
        answer = gr.Textbox(label="Answer")
        checklist_status = gr.Textbox(label="Checklist Status")
    
    # Event Handling
    qr_button.click(extract_qr_code, qr_upload, qr_output)
    compare_btn.click(compare_documents, [doc1, doc2], diff_output)
    # The URL textbox holds one comma-separated string; the lambda splits it
    # into a list of trimmed, non-empty URLs before calling process_documents.
    process_btn.click(
        lambda f, u, c: process_documents(f, [x.strip() for x in u.split(",") if x.strip()], c),
        [files, urls, checklist],
        status
    )
    ask_btn.click(ask_question, question_input, answer)
    # The checklist textbox value is passed as the workbook path to populate.
    checklist_btn.click(populate_checklist, checklist, checklist_status)

# Start the app only when this module is run as a script.
if __name__ == "__main__":
    # NOTE(review): share=True requests a public share link in addition to
    # the local server, so the upload, browser-opening and checklist-writing
    # actions become reachable from outside this machine. Confirm intended.
    iface.launch(share=True)

# * Running on local URL:  http://127.0.0.1:7863
# * Running on public URL: https://bfe80514976527f666.gradio.live
#
# This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
