In [None]:
from typing import Any
from unstructured.staging.base import elements_to_json
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
import pytesseract
import pprint
import csv

In [None]:
import os
# Working directory of the kernel; the cells below derive document paths from it.
path = os.getcwd()
print(f"Current directory: {path}")

In [None]:
# Resolve (and show) the directory one level above the working directory.
parent_directory = os.path.abspath(os.path.join(path, os.pardir))
print(f"Parent directory: {parent_directory}")

In [None]:
# Location of the PDF to process, expressed relative to the parent directory.
filepath = f"{parent_directory}/Doc_Panthera/Gestionale/VEN_Contratti_Vendita_Ordini_Aperti.pdf"
print(f"Filepath: {filepath}")

In [None]:
# Partition the PDF into elements and chunk them by title.
partition_options = dict(
    filename=filepath,
    # Ask for the image blocks embedded in the PDF to be extracted.
    extract_images_in_pdf=True,
    # Ask for table structure to be inferred.
    infer_table_structure=True,
    # Aggregate text under the title it belongs to.
    chunking_strategy="by_title",
    # Chunk sizing: hard maximum, soft threshold for starting a new chunk,
    # and the size under which small text blocks are combined.
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    # Directory requested for the extracted images.
    image_output_dir_path="Images/pdfImages/",
)
raw_pdf_elements = partition_pdf(**partition_options)

In [None]:
import os
from langchain_openai import ChatOpenAI
# Turn on LangSmith tracing and point it at the hosted endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The API keys are expected to be present in the environment already.
# Re-assigning os.getenv(KEY) to os.environ[KEY] changes nothing when the key
# is set and raises TypeError (a None value) when it is missing, so the keys
# are only checked here and a readable warning is printed instead.
missing_keys = [
    key for key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY') if not os.getenv(key)
]
if missing_keys:
    print(
        "WARNING: missing environment variable(s): "
        + ", ".join(missing_keys)
        + ". Calls that need them will fail."
    )

from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage
import os
from PIL import Image
import base64
import io

In [None]:
# Sample figure, located under the working directory, used to try the vision model once.
image_path = f"{os.getcwd()}/figures/figure-31-42.jpg"
print(f"Image path: {image_path}")

In [None]:
def image_to_base64(image_path):
    """Return the image stored at ``image_path`` as a base64 string.

    The file is opened with PIL and saved again into an in-memory buffer using
    the format PIL reports for it; the buffer's bytes are base64-encoded and
    decoded to a UTF-8 ``str``.
    """
    buffer = io.BytesIO()
    with Image.open(image_path) as img:
        img.save(buffer, format=img.format)
    encoded = base64.b64encode(buffer.getvalue())
    return encoded.decode('utf-8')


# Encode the sample image; the cell's output is the length of the encoded string.
image_str = image_to_base64(image_path)
len(image_str)

In [None]:
# Chat model used to describe the screenshot.
chat = ChatOpenAI(model="gpt-4-vision-preview")

# Instruction (in Italian) sent together with the image.
prompt1 = "Fornisci una descrizione dell'immagine fornita. Sii chiaro nella spiegazione dei vari campi della sezione principale della videata e i possibili utilizzi."

# One human message with two parts: the instruction text and the image as a data URL.
image_message = HumanMessage(
    content=[
        {"type": "text", "text": prompt1},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_str}"},
        },
    ]
)
msg = chat.invoke([image_message])

# Extraction with placeholder

In [None]:
# PDF whose text and images are extracted by the next cell.
pdf_path = f"{parent_directory}/Doc_Panthera/Gestionale/VEN_Contratti_Vendita_Ordini_Aperti.pdf"
print(f"Filepath: {pdf_path}")

In [None]:
import os
import pdfplumber
from PIL import Image

file_name = os.path.basename(pdf_path)
print(file_name)
# NOTE(review): file_name keeps the PDF's extension, so the extracted text is
# written to a file whose name ends in ".pdf" even though it holds plain text.
txt_output_path = "Augmented_" + file_name
images_folder = "Images"  # Folder to save images

# Create the Images folder if it doesn't exist
os.makedirs(images_folder, exist_ok=True)

text_parts = []    # page texts and image placeholders, in document order
image_counter = 1  # running index used to name the saved images

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Page text first (extract_text may return nothing for image-only pages).
        page_text = page.extract_text()
        if page_text:
            text_parts.append(page_text + "\n")

        page_x0, page_top, page_x1, page_bottom = page.bbox
        for img in page.images:
            # Clamp the image box to the page: pdfplumber rejects boxes that
            # extend past the page bounds, which embedded images often do.
            x0 = max(img["x0"], page_x0)
            top = max(img["top"], page_top)
            x1 = min(img["x1"], page_x1)
            bottom = min(img["bottom"], page_bottom)
            if x0 >= x1 or top >= bottom:
                # Nothing of the image is visible on the page; skip it.
                continue

            # Render the page region covered by the image and save it as PNG.
            img_name = os.path.join(images_folder, f"image_{image_counter}.png")
            image = page.within_bbox((x0, top, x1, bottom)).to_image()
            image.save(img_name, format="PNG")

            # Placeholder that a later cell replaces with the image description.
            text_parts.append(f"[IMAGE: {img_name}]\n")
            image_counter += 1

extracted_text = "".join(text_parts)

# Save the extracted text with placeholders to a text file
with open(txt_output_path, "w", encoding="utf-8") as txt_file:
    txt_file.write(extracted_text)

print("Extraction complete. Text has been saved, and images have been extracted to the 'Images' folder with placeholders.")


In [None]:
import os
import base64
import io
from PIL import Image
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# Chat model used to describe every extracted image.
chat = ChatOpenAI(model="gpt-4-vision-preview")

# Directory scanned for images.
image_folder = "Images"


def image_to_base64(image_path):
    """Return the image stored at ``image_path`` as a base64 string.

    Same contract as the definition earlier in the notebook: the image is
    re-saved in its detected format into memory and the bytes are encoded.
    """
    with Image.open(image_path) as source:
        payload = io.BytesIO()
        source.save(payload, format=source.format)
    return base64.b64encode(payload.getvalue()).decode('utf-8')

import mimetypes  # stdlib: used to label each data URL with the file's actual image type

# File extensions treated as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

# Maps image file name -> description returned by the model.
results = {}

# sorted() makes the processing order (and the printed output) deterministic.
for filename in sorted(os.listdir(image_folder)):
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue  # sub-directories and non-image files are ignored

    image_path = os.path.join(image_folder, filename)
    image_str = image_to_base64(image_path)

    # The data URL must declare the real media type: the folder holds PNG files,
    # so a hard-coded "image/jpeg" would mislabel them. Fall back to JPEG only
    # when the type cannot be guessed from the extension.
    mime_type = mimetypes.guess_type(filename)[0] or "image/jpeg"

    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt1},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_str}"
                        },
                    },
                ]
            )
        ]
    )

    # Store the response using the filename as the key
    results[filename] = msg.content

# Output the results
for image_id, description in results.items():
    print(f"{image_id}: {description}")

In [None]:
# CSV output file
import csv
csv_output_path = "image_descriptions.csv"

# Write a header row followed by one (image id, description) row per result.
with open(csv_output_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Image ID", "Description"])
    writer.writerows(results.items())

# Confirm where the descriptions were written.
print("Descriptions have been saved to " + csv_output_path)

In [None]:
# Regular expression pattern to match image placeholders like [IMAGE: Images/image_x.png]
import re

# Read back the text file produced by the extraction cell. The previous
# hard-coded name ("estrazione_testo.txt") is no longer what that cell writes,
# so it would fail or silently load text left over from an older run.
text_file_path = txt_output_path
with open(text_file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Matches placeholders such as [IMAGE: Images/image_3.png]; group 1 is the file name.
pattern = r"\[IMAGE: Images/(image_\d+\.png)\]"

def replace_image_placeholders(match, descriptions=None):
    """Return the replacement text for one image placeholder match.

    Args:
        match: regex match whose group 1 is the image file name.
        descriptions: mapping of image file name to description. When omitted,
            the notebook-level ``results`` dictionary is used, so the function
            can still be passed directly to ``re.sub``.

    Returns:
        - the matched placeholder unchanged when no (non-empty) description exists;
        - an empty string when the description starts with "Mi dispiace";
        - the description otherwise.
    """
    if descriptions is None:
        descriptions = results

    description = descriptions.get(match.group(1))
    if not description:
        # Keep the placeholder exactly as it appeared in the text.
        return match.group(0)
    if description.startswith("Mi dispiace"):
        return ""
    return description

# Substitute every placeholder using replace_image_placeholders.
updated_text = re.compile(pattern).sub(replace_image_placeholders, text)

print("Updated Text:\n")
print(updated_text)

# Persist the substituted text.
with open("updated_estrazione_testo.txt", "w", encoding="utf-8") as out_file:
    out_file.write(updated_text)

print("Text updated with image descriptions and saved to 'updated_estrazione_testo.txt'.")

In [None]:
# Save the updated text to a pipe-delimited CSV with file_name and updated_text columns.
csv_name = "Augmented_dataset.csv"
with open(csv_name, "w", encoding="utf-8", newline="") as csvfile:
    csv_writer = csv.writer(csvfile, delimiter="|")
    csv_writer.writerow(["file_name", "updated_text"])  # header
    csv_writer.writerow([file_name, updated_text])  # single data row

# Report the file that was actually written (the old message named 'updated_text.csv').
print(f"Text updated with image descriptions and saved to '{csv_name}'.")

# Retrieval

In [None]:
# Load the original dataset for RAG
import pandas as pd

csv_name = "Augmented_dataset.csv"
# Column names are supplied explicitly, so the file's own header line is read
# as row 0 and then dropped.
data_df = pd.read_csv(
    csv_name,
    delimiter="|",
    names=["file_name", "updated_text"],
).drop(index=0)
data_df

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import DataFrameLoader
import pprint

# Splitter configuration used by the chunking cell.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=500, chunk_size=3000)

# Load the dataframe rows as documents, taking the content from "updated_text".
loader = DataFrameLoader(data_df, page_content_column="updated_text")
docs_data = loader.load()

In [None]:
# Chunk every document and tag each chunk with its 1-based position inside its document.
splits = []
for doc in docs_data:
    for chunk_number, chunk in enumerate(text_splitter.split_documents([doc]), start=1):
        chunk.metadata["chunk_number"] = chunk_number
        splits.append(chunk)

# Show the first chunks and the total number of chunks.
pprint.pprint(splits[:6])
pprint.pprint(len(splits))

In [None]:
# Inspect the text of the second chunk (index 1); this raises IndexError when
# fewer than two chunks were produced.
pprint.pprint(splits[1].page_content)

In [None]:
from FlagEmbedding import BGEM3FlagModel
# Load the 'BAAI/bge-m3' embedding model with use_fp16=True; this instance is
# the one the M3EmbeddingFP16 wrapper calls.
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Adapter exposing the notebook-level ``model_fp16`` as an embedding object.

    It provides the two methods an embedding object is expected to offer
    (``embed_documents`` and ``embed_query``) and stays callable, so it can be
    used both where a method-based embedder and where a plain embedding
    function is expected.
    """

    def embed_documents(self, texts):
        """Return the dense vectors for a sequence of texts (one row per text)."""
        return model_fp16.encode(texts)['dense_vecs']

    def embed_query(self, text):
        """Return the dense vector for a single query string."""
        # Encode as a one-element batch and take its only row.
        return model_fp16.encode([text])['dense_vecs'][0]

    def __call__(self, texts):
        """Embed either a single string (query) or a sequence of strings."""
        if isinstance(texts, str):
            return self.embed_query(texts)
        return self.embed_documents(texts)

In [None]:
from langchain_community.vectorstores import FAISS

# Embed the chunks, index them with FAISS and persist the index locally.
embd = M3EmbeddingFP16()
vectorstore = FAISS.from_documents(splits, embd)
vectorstore.save_local("trial_VEN_index")

In [None]:
# Expose the vector store as a retriever; search_kwargs asks for k=4 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
# Sample user question (Italian) used to exercise retrieval and generation.
question = (
    "Come posso decidere se nel calcolo della percentuale di saturazione del contratto "
    "vadano considerate anche la quantità in previsione?"
)
retrieved_docs = retriever.invoke(question)
retrieved_docs

# Generation

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt template (Italian) for the RAG chain. It has two placeholders:
# {context} receives the retrieved documents and {question} the user question.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Evita troppe ripetizioni nella risposta fornita.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

# Build the chat prompt from the template; the bare name displays it as the cell output.
prompt = ChatPromptTemplate.from_template(template)
prompt

In [None]:
# LLM used for generation in this cell: gpt-3.5-turbo with temperature=0.
model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# TODO: no maximum token limit is configured on the model.

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Chain: retrieve and format the context, pass the question through unchanged,
# fill the prompt, call the model and parse the reply to a string.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = rag_inputs | prompt | model | StrOutputParser()

# Ask the sample question and pretty-print the answer.
rag_answer = rag_chain.invoke(question)
pprint.pprint(rag_answer)

In [None]:
# Same RAG chain as the previous cell, with gpt-4o (temperature=0) as the model.
model = ChatOpenAI(model_name="gpt-4o", temperature=0)

# format_docs is reused from the previous cell; it was redefined here
# identically, and a duplicate definition only shadows the first one.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Ask the sample question and pretty-print the answer.
pprint.pprint(rag_chain.invoke(question))

In [None]:
# Show the question currently bound to `question`.
pprint.pprint(question)

In [None]:
from langchain_ollama import ChatOllama

# Local llama3.2 chat model served through Ollama, with temperature=0.
model_llama = ChatOllama(model="llama3.2", temperature=0)

# Same chain shape as the OpenAI cells, with the Ollama chat model plugged in.
llama_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = llama_inputs | prompt | model_llama | StrOutputParser()

# Ask the sample question and pretty-print the answer.
pprint.pprint(rag_chain.invoke(question))

In [None]:
from langchain_ollama.llms import OllamaLLM
# llama3.2 3B instruct (fp16) served through Ollama, with temperature=0.
model_llama_instruct = OllamaLLM(model="llama3.2:3b-instruct-fp16", temperature=0)

# The chain was previously bound to the misspelled name `rag_chai`, so the
# invoke below silently ran the chain built in the previous cell instead of
# this model. Bind it to `rag_chain` so the instruct model is the one queried.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama_instruct
    | StrOutputParser()
)

pprint.pprint(rag_chain.invoke(question))

# Try synthetic data generation

In [None]:
from huggingface_hub import InferenceClient
import json

# Model repository queried for synthetic data, and the access token read from the environment.
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
token_pro = os.getenv('HUGGINGFACE_TOKEN')

# Inference client bound to that model, with a 120 timeout.
llm_client = InferenceClient(model=repo_id, token=token_pro, timeout=120)

def call_llm(inference_client: InferenceClient, prompt: str):
    """Send ``prompt`` as a text-generation request and return the generated text.

    The request asks for at most 1024 new tokens. The response bytes are
    decoded and parsed as JSON, and the ``"generated_text"`` field of the first
    item is returned.

    NOTE(review): relies on ``inference_client.post``; confirm that the
    installed huggingface_hub version still provides it.
    """
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 1024},
        "task": "text-generation",
    }
    raw_response = inference_client.post(json=payload)
    generations = json.loads(raw_response.decode())
    return generations[0]["generated_text"]

In [None]:
# Prompt (Italian) asking the model to write one question/answer pair from a
# context chunk. {context} is filled with str.format; the expected output
# layout is "Domanda: ..." followed by "Risposta: ...", which the generation
# loop relies on when splitting the model output.
QA_generation_prompt = """
Il tuo compito è scrivere una domanda e una risposta data un contesto.
La tua domanda deve essere rispondibile con un'informazione specifica dal contesto. Se nel contesto ci sono errori grammaticali o morfologici correggili nell'output fornito.
La tua domanda deve essere formulata nello stesso stile delle domande che gli utenti potrebbero porre ad un helpdesk, che si occupa di assistenza clienti per un software aziendale.
Questo significa che la tua domanda NON deve menzionare frasi come "secondo il passaggio" o "nel contesto". 
La tua domanda può menzionare frasi come "Ho un errore" o "Come posso sistemare il problema".

Domanda e risposta devono essere generate in italiano.

Fornisci la tua risposta come segue:

Output:::
Domanda: (la tua domanda)
Risposta: (la tua risposta alla domanda)

Ora ecco il contesto.

Contesto: {context}\n
Output:::"""


In [None]:
import random
from tqdm.auto import tqdm

N_GENERATIONS = 5

# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of available chunks.
n_samples = min(N_GENERATIONS, len(splits))

print(f"Generating {n_samples} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(splits, n_samples)):
    # Generate QA couple
    output_QA_couple = call_llm(llm_client, QA_generation_prompt.format(context=sampled_context.page_content))

    # Take the text after the last "Domanda: " marker (the returned text may
    # include the prompt, which contains the marker too), then split it into
    # question and answer at "Risposta: ".
    generated_pair = output_QA_couple.split("Domanda: ")[-1]
    qa_question, marker, qa_answer = generated_pair.partition("Risposta: ")
    qa_question, qa_answer = qa_question.strip(), qa_answer.strip()
    if not marker or not qa_question or not qa_answer:
        # Output does not follow the requested layout; report it and move on.
        print("Skipping a generation that does not follow the Domanda/Risposta layout.")
        continue

    # Local names are used on purpose: assigning to `question` here would
    # overwrite the question used by the RAG cells.
    outputs.append(
        {
            "question": qa_question,
            "answer": qa_answer,
            "context": sampled_context.page_content,
            # .get avoids a KeyError (formerly hidden by a bare except).
            "source_doc": sampled_context.metadata.get("file_name"),
        }
    )

In [None]:
# One row per generated QA couple; display the table.
generated_questions = pd.DataFrame(outputs)
display(generated_questions)

In [None]:
# Show the first generated question/answer pair as a tuple (needs at least one row).
generated_questions.iloc[0]['question'], generated_questions.iloc[0]['answer']