In [1]:
import pandas as pd
import numpy as np
import PyPDF2
import pdfplumber
import openpyxl
import os
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import faiss


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Path of the PDF file, passed as-is to ``pdfplumber.open``.

    Returns:
        The concatenation of each page's text, with a newline appended after
        every page (pages that yield no text contribute only the newline).
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may yield None/empty for pages with no text layer
            # (e.g. scanned images); treat that as an empty page instead of
            # crashing on `None + '\n'`.
            page_texts.append((page.extract_text() or '') + '\n')
    # Join once at the end rather than growing a string page by page.
    return ''.join(page_texts)


def chunk_text(text, chunk_size=100, overlap=10):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: Input string; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            smaller than ``chunk_size`` so the window always moves forward.

    Returns:
        A list of chunks, each one a string of words joined by single spaces.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step would never advance the window (infinite loop).
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = min(start + chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            # This chunk already reaches the last word; any further window
            # would only repeat words contained in it.
            break
    return chunks


# Source PDF, relative to the notebook's working directory.
pdf_path = './files/informe_telefonica.pdf'
raw_text = extract_text_from_pdf(pdf_path)
# Collapse every run of whitespace (including newlines) into a single space.
clean_text = " ".join(raw_text.split())

chunks = chunk_text(clean_text, chunk_size=500, overlap=50)
if not chunks:
    # Fail here with a clear message instead of an IndexError on chunks[0]
    # (or a confusing failure later when embedding an empty list).
    raise ValueError(f"No text could be extracted from {pdf_path!r}")

print("Number of chunks:", len(chunks))
print("First chunk length (words):", len(chunks[0].split()))


Number of chunks: 694
First chunk length (words): 500


In [3]:
# Sentence-embedding model used to vectorize the text chunks.
embed_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Alternative embedding models (disabled):
# embed_model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)
# embed_model.max_seq_length = 32768
# embed_model.tokenizer.padding_side="right"
# embed_model = SentenceTransformer("ibm-granite/granite-embedding-278m-multilingual")


# Generate one embedding per chunk
chunk_embeddings = embed_model.encode(chunks, convert_to_numpy=True)

# Create a FAISS index (flat, no compression, for simplicity)
dimension = chunk_embeddings.shape[1]  # dimension of each vector
faiss_index = faiss.IndexFlatL2(dimension)

# Add the embeddings to the index
faiss_index.add(chunk_embeddings)

# Keep the chunks in a list so search results can be mapped back to text
chunk_data = chunks  # chunk_data[i] corresponds to chunk_embeddings[i]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:

# np.save("chunk_data.npy", chunk_data, allow_pickle=True)


# chunk_embeddings = np.load("chunk_embeddings.npy")
# dimension = chunk_embeddings.shape[1]
# faiss_index = faiss.IndexFlatL2(dimension)
# faiss_index.add(chunk_embeddings)



In [7]:
# Workbook with the items to fill in, relative to the working directory.
excel_path = "./files/Inputs_TRI.xlsx"
wb = openpyxl.load_workbook(excel_path)
ws = wb.active  # the workbook's active sheet


# Materialize the rows as cell objects (values_only=False) so that values can
# be written back to them later; iteration starts at row 2, skipping row 1
# (presumably a header row - confirm against the spreadsheet).
rows_to_fill = list(ws.iter_rows(min_row=2,
                                 max_col=5,  # up to column E
                                 values_only=False))

In [21]:
# Checkpoint used to generate the answers. It is loaded below with
# AutoModelForSeq2SeqLM, so it must be an encoder-decoder (seq2seq) model.
# Other candidates (disabled):
# model_name = "google/flan-t5-base"
# NOTE(review): the two checkpoints below look like decoder-only models and
# would presumably need AutoModelForCausalLM instead - confirm before enabling.
# model_name = "meta-llama/Meta-Llama-3-8B"
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "google/flan-t5-large"


tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def answer_questions(question, context, max_length=128):
    """Generate an answer to ``question`` from ``context`` with the loaded LLM.

    Uses the module-level ``tokenizer`` and ``llm_model``.

    Args:
        question: The question text placed at the start of the prompt.
        context: Supporting text appended at the end of the prompt.
        max_length: ``max_length`` passed to ``llm_model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # The prompt is tokenized with truncation=True, so an over-long prompt is
    # cut by the tokenizer; the context goes last so the question and the
    # instructions come first in the prompt.
    prompt = (
        f"question: {question} "
        "Please answer concisely. "  # trailing space keeps the sentences apart
        "Do not copy the entire context verbatim."
        f" context: {context}"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = llm_model.generate(
        **inputs, max_length=max_length, num_beams=2, early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [22]:
def get_relevant_chunks(query, top_k=5):
    """Return the text chunks whose embeddings are closest to ``query``.

    Uses the module-level ``embed_model``, ``faiss_index`` and ``chunk_data``.

    Args:
        query: Text to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings in the order returned by the index search.
        It holds fewer than ``top_k`` items when the index cannot supply that
        many results.
    """
    # Embed the query as a single-row batch, as the index search expects.
    query_vector = embed_model.encode([query], convert_to_numpy=True)

    # indices[0] holds the positions of the nearest vectors for our one query.
    _distances, indices = faiss_index.search(query_vector, top_k)

    # FAISS pads missing results with -1; skip them, otherwise chunk_data[-1]
    # would silently return the last chunk as if it were a match.
    return [chunk_data[i] for i in indices[0] if i >= 0]


In [26]:

# Trial run for a single question

# rows_to_fill[7] is the eighth loaded row; each row holds five cells
# (columns A-E), unpacked below.
row = rows_to_fill[7]
tipo_cell, bloque_cell, subbloque_cell, definicion_cell, valor_cell = row
definicion = definicion_cell.value


# `question` is the instruction given to the LLM; `query` is the shorter text
# used only for the similarity search.
question = f"Encuentra esta información ({bloque_cell.value}, {subbloque_cell.value}): {definicion}"
query = f"({bloque_cell.value}, {subbloque_cell.value}) {definicion}"

# Retrieve the relevant chunks
relevant_chunks = get_relevant_chunks(query, top_k=10)
context_for_llm = " ".join(relevant_chunks)

# Call the LLM with the question + context
answer = answer_questions(question, context_for_llm)


print("\nPregunta: ", question, "\nAnswer: ", answer)


Pregunta:  Encuentra esta información (Strategy, Carbon pricing): Internal CO2 price 
Answer:  tCOe 1.811.155 1.329.268 536.737 353.346 337.119 -81 % 2 Alcance 1 + 2 (localización) tCO2e 2.155.701 1.993.719 1.395.404 1.133.998 1.158.997 -46 % Emisiones compensadas3 tCOe 63.018 35.537 33.711 NA 2 Alcance 34 tCOe 2.855.5445 2.855.544 2.072.159 1.930.051 1.970.5


In [27]:

# Answer every row that has a definition and write the answer into its
# fifth cell, then save the result as a new workbook.
for row in rows_to_fill:
    tipo_cell, bloque_cell, subbloque_cell, definicion_cell, valor_cell = row
    
    definicion = definicion_cell.value
    # Skip rows whose definition cell is empty
    if not definicion:
        continue
    
    question = f"{definicion} ¿Cuál es el valor según el documento?"
    
    # Retrieve the relevant chunks (the question doubles as the search query)
    relevant_chunks = get_relevant_chunks(question, top_k=3)
    context_for_llm = " ".join(relevant_chunks)
    
    # Call the LLM with the question + context
    answer = answer_questions(question, context_for_llm)
    
    # Store the answer in the corresponding cell
    valor_cell.value = answer

# Save the changes to a new Excel file (the input workbook is not overwritten)
wb.save("datos_completados.xlsx")

print("Done!")
