 # Setup

In [None]:
%%capture
%pip install "unstructured[all_docs]" unstructured_client watermark

In [None]:
# Warning control: install a global "ignore" filter so that no Python warnings
# are shown by the cells that follow.
import warnings
warnings.filterwarnings('ignore')

In [None]:
from IPython.display import JSON

import json

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements, elements_to_json

import pprint

In [None]:
%load_ext watermark

In [None]:
import unstructured

In [None]:
%watermark --iversions

In [None]:
import os

# Record the working directory of the notebook and show it.
path = os.getcwd()
print(f"Current directory: {path}")

In [None]:
# Locate the input PDF: it lives under the parent of the current working
# directory. The path is assembled with os.path.join so that separators are
# handled by the standard library instead of being hard-coded as "/".
parent_directory = os.path.abspath(os.path.join(path, os.pardir))
filepath = os.path.join(
    parent_directory,
    "Doc_Panthera",
    "Gestionale",
    "VEN_Contratti_Vendita_Ordini_Aperti.pdf",
)
print("Filepath:", filepath)

In [None]:
import unstructured_client

# Create a client for the Unstructured API at the given server URL.
# The API key is read from the UNSTRUCTURED_API_KEY environment variable;
# os.getenv returns None when the variable is not set, so export it before
# running this cell.
client = unstructured_client.UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url="https://api.unstructuredapp.io",
)

# Exploration of Unstructured elements

In [None]:
from unstructured.partition.pdf import partition_pdf

# Tip: run `partition_pdf??` in a cell to learn about the possible parameters.
# `elements` is the list of elements returned for the parsed PDF document.
# The "hi_res" strategy is requested and the document language is set to Italian.
elements = partition_pdf(filepath, strategy="hi_res", languages=['ita'])

In [None]:
# Convert every element to a plain dict and pretty-print the list as JSON.
# ensure_ascii=False keeps accented characters (the document is Italian)
# readable in the output instead of turning them into \uXXXX escapes.
element_dict = [el.to_dict() for el in elements]
output = json.dumps(element_dict, indent=2, ensure_ascii=False)
print(output)

In [None]:
# Collect the distinct values of the "type" key across all parsed elements.
element_dict = [el.to_dict() for el in elements]
unique_types = {item['type'] for item in element_dict}

print(unique_types)

In [None]:
# Keep only the elements categorised as images.
images = [el for el in elements if el.category == "Image"]

# Inspect one sample image element. The index is guarded so that a document
# with fewer images reports the problem instead of raising IndexError.
sample_index = 40
if sample_index < len(images):
    print(images[sample_index].text)
    print(images[sample_index].metadata.text_as_html)
else:
    print(f"Only {len(images)} image element(s) found; no element at index {sample_index}.")

In [None]:
# Number of elements in the `images` list (shown as the cell output).
len(images)

# Extract the images themselves, not only the information about them

In [None]:
# Folder that receives the image files extracted from the PDF.
output_dir = parent_directory + "/Experiments/Extracted_Images"

# Parse the PDF again, this time also saving the detected image blocks to disk.
# `languages=['ita']` is passed for consistency with the other partition_pdf
# calls in this notebook, which all parse the same Italian document.
# NOTE(review): recent unstructured releases document `extract_image_block_types`
# as the replacement for `extract_images_in_pdf` - confirm against the installed
# version before switching.
raw_pdf_elements = partition_pdf(filename=filepath,
                                 # Ask for image blocks to be extracted
                                 # (only applicable if `strategy=hi_res`)
                                 extract_images_in_pdf=True,
                                 strategy="hi_res",
                                 languages=['ita'],
                                 infer_table_structure=True,
                                 # Where the extracted image files are written
                                 # (only applicable if `strategy=hi_res`)
                                 extract_image_block_output_dir=output_dir)

# Show extracted images

In [None]:
import glob
from IPython.display import Image, display

# Glob pattern matching the extracted images (change the extension if needed).
folder_path = f"{output_dir}/*.jpg"

# All JPG files currently present in the extraction folder.
image_files = glob.glob(folder_path)

# To render every extracted image inline, uncomment the loop below:
#for image_file in image_files:
#    display(Image(filename=image_file))

# Image extraction and placeholder

In [None]:
from unstructured.partition.pdf import partition_pdf
import json

# Parse the PDF (same call as in the exploration section, repeated here so the
# cell does not depend on `elements` still being in memory).
elements = partition_pdf(filepath, strategy="hi_res", languages=['ita'])

# `processed_elements` collects one text line per element, with a placeholder
# standing in for every image; `image_metadata` maps each image's element id
# to its full dictionary so the image can be processed later.
processed_elements = []
image_metadata = {}

for el in elements:
    el_dict = el.to_dict()  # Convert element to a dictionary
    el_type = el_dict.get("type")  # None when the key is missing

    if el_type == "Image":
        # Replace the image with a placeholder that carries its element id
        image_id = el_dict['element_id']
        placeholder = f"[IMAGE: {image_id}]"
        processed_elements.append(placeholder)

        # Keep the image's dictionary for future processing
        image_metadata[image_id] = el_dict
    else:
        # For other types, keep the text as is
        processed_elements.append(el_dict.get("text", ""))

# Combine the text and placeholders into a single output
output_text = "\n".join(processed_elements)

# Save the image metadata for later use. The file is opened as UTF-8, so
# ensure_ascii=False stores accented characters verbatim instead of as
# \uXXXX escapes (the JSON still loads to the same data).
with open("image_metadata.json", "w", encoding="utf-8") as f:
    json.dump(image_metadata, f, indent=2, ensure_ascii=False)

# Print the text output with placeholders
print(output_text)

# Also save the text output next to the metadata file
with open("processed_output.txt", "w", encoding="utf-8") as f:
    f.write(output_text)

# Multimodal LLaVA image explanation

In [None]:
%%capture
%pip install langchain langchain_core langchain_community

In [None]:
from langchain_community.llms import Ollama

# Handle to the "llava:13b" model through the Ollama wrapper.
llm = Ollama(model="llava:13b")

In [None]:
import base64
from io import BytesIO

from IPython.display import HTML, display
from PIL import Image


def convert_to_base64(pil_image):
    """
    Convert a PIL image to a Base64 encoded JPEG string.

    The image is encoded at its current size (no resizing is performed).
    Images whose mode the JPEG writer does not accept (for example "RGBA",
    "LA" or "P", which carry an alpha channel or a palette) are converted to
    "RGB" first, so that saving does not fail with ``OSError``.

    :param pil_image: PIL image
    :return: Base64 string of the JPEG-encoded image
    """
    # Modes that can be written as JPEG directly; everything else is converted.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_modes:
        pil_image = pil_image.convert("RGB")

    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")  # You can change the format if needed
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


def plt_img_base64(img_base64):
    """
    Render a Base64 encoded JPEG inline in the notebook output.

    :param img_base64:  Base64 string
    """
    # Wrap the data in a data-URI <img> tag and hand it to IPython for display.
    display(HTML(f'<img src="data:image/jpeg;base64,{img_base64}" />'))


# Pick one of the extracted images, encode it and show it inline.
file_path = output_dir + "/figure-31-42.jpg"

# Open the file in a context manager so the underlying file handle is closed
# as soon as the image has been encoded.
with Image.open(file_path) as pil_image:
    image_b64 = convert_to_base64(pil_image)

plt_img_base64(image_b64)

In [None]:
# Instructions sent to the model together with the image: list the field names
# as shown, describe the visible functionality and interactions, and keep the
# Italian field names and options untranslated.
prompt = """Analyze the provided image, which shows a section of the user interface of a management software. 
Your task is to:

1. List each field name exactly as it appears in the image, without translating or altering the names.
2. Provide a neutral description of the functionality and available options for each field or element, based solely on what can be observed from the interface. 
Avoid inventing or speculating about additional functionalities.
3. Clearly describe the user interactions possible with the options, buttons, or fields. For example, whether dropdown menus are present, checkboxes can be selected, or buttons move elements between lists.
4. Maintain the original language (Italian) for field names and options where applicable.

Ensure that the explanation is accurate, clear, and concise, focusing on usability and interaction details visible in the image"""

# Bind the Base64 encoded image to the model, then invoke it with the prompt;
# the model's reply is stored in `text`.
llm_with_image_context = llm.bind(images=[image_b64])
text = llm_with_image_context.invoke(prompt)

In [None]:
# Pretty-print the model's reply.
pprint.pprint(text)

In [None]:
from transformers import pipeline

# Initialize the translation pipeline
translator = pipeline("translation", model="facebook/mbart-large-50-many-to-many-mmt")

# Translate the model's reply from English (en_XX) to Italian (it_IT).
# NOTE(review): max_length=512 presumably caps the length of the generated
# translation, so a long reply may come back truncated - confirm.
translated_text = translator(text, src_lang="en_XX", tgt_lang="it_IT", max_length = 512)

# Print the translated output
print("Translated Text:", translated_text[0]['translation_text'])

In [None]:
# Pretty-print the translated text of the first (only accessed) result.
pprint.pprint(translated_text[0]['translation_text'])