In [3]:
import gradio as gr
import requests
from bs4 import BeautifulSoup
import PyPDF2
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_ollama.llms import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate  # Importar PromptTemplate

# Función para extraer texto de una URL
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

# Función para extraer texto de un archivo PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Función para dividir el texto en fragmentos
def split_text(text, chunk_size=500):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Cargar el modelo Llama 3.2
llm = OllamaLLM(model="llama3.2:1b", server_url="http://localhost:11434")

# Configuración de embeddings y vectorstore
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory="./vectorstore", embedding_function=embedding_model)

# Crear el PromptTemplate
prompt_template = """
Eres un asistente útil. Usa el contexto a continuación para responder la pregunta del usuario:

{context}

Pregunta: {question}
Respuesta:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Configurar el retriever
retriever = vectorstore.as_retriever()

# Crear el chain de preguntas y respuestas
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}
)

# Función que maneja la carga de PDF, URL y la consulta
def load_and_ask(pdf_file, url, question):
    if not pdf_file and not url:
        return "Por favor, sube un archivo PDF o proporciona una URL."

    # Si se proporciona un PDF
    if pdf_file:
        page_content = extract_text_from_pdf(pdf_file.name)
    # Si se proporciona una URL
    elif url:
        page_content = extract_text_from_url(url)
    
    # Dividir el texto en fragmentos
    chunks = split_text(page_content)
    
    # Convertir los fragmentos en documentos y agregarlos al vectorstore
    documents = [Document(page_content=chunk) for chunk in chunks]
    vectorstore.add_documents(documents)
    
    # Hacer la consulta
    if question:
        response = qa_chain.run(question)
        return response
    else:
        return "Por favor ingresa una pregunta."

# Crear la interfaz con Gradio
iface = gr.Interface(
    fn=load_and_ask,
    inputs=[
        gr.File(label="Cargar PDF (Español)"),
        gr.Textbox(label="Introduce la URL (Inglés)"),
        gr.Textbox(label="Escribe tu pregunta")
    ],
    outputs="text",
    live=True
)

# Iniciar la interfaz
iface.launch()



* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




  response = qa_chain.run(question)
Traceback (most recent call last):
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
    )
    ^
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<11 lines>...
    )
    ^
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/blocks.py", line 2047, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<8 lines>...
    )
    ^
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/blocks.py", line 1594, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^

Created dataset file at: .gradio/flagged/dataset1.csv


Traceback (most recent call last):
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
    )
    ^
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<11 lines>...
    )
    ^
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/blocks.py", line 2047, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<8 lines>...
    )
    ^
  File "/home/alex/miniconda3/envs/rag/lib/python3.13/site-packages/gradio/blocks.py", line 1594, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^