# SET UP

In [1]:

import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); the return value
# is bound to `_` because it is not needed.
_ = load_dotenv(dotenv_path=find_dotenv())

# Index os.environ directly (rather than .get) so that a missing key raises
# KeyError here instead of failing later.
openai_api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from langchain_openai import ChatOpenAI

# Chat model shared by the extraction chain built in the cells below.
llm = ChatOpenAI(
    model="gpt-4o-mini",
)

In [3]:
from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field

class Person(BaseModel):
    """Information about a person."""

    # The docstring above is sent to the LLM as the description of the Person
    # schema, so its wording can influence extraction results.
    #
    # Conventions for the fields below:
    # 1. Every field is Optional with default None, so the model can decline to
    #    extract it. The system prompt asks for null when a value is unknown,
    #    and a required field would fail validation on a null answer.
    # 2. Every field carries a `description`, which the LLM also sees; a precise
    #    description can improve extraction results.

    name: Optional[str] = Field(
        default=None,
        description="Include only a person's first name. Do not include if they have a middle name",
    )
    lastname: Optional[str] = Field(
        default=None,
        description="The lastname of the person if known",
    )
    country: Optional[str] = Field(
        default=None,
        description="The country of the person if known",
    )


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or, if you are working in a code base that has not been fully upgraded to pydantic 2 yet, with the v1 compatibility namespace: `from pydantic.v1 import BaseModel`

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
from typing import Optional
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field

# Extraction prompt: a fixed system instruction followed by the text to analyze.
# Possible refinements:
#   - embed worked examples in the template to improve extraction quality;
#   - add extra template variables to take context into account (e.g. metadata
#     about the document from which the text was extracted).
system_instructions = (
    "You are an expert extraction algorithm. "
    "Only extract relevant information from the text. "
    "If you do not know the value of an attribute asked to extract, "
    "return null for the attribute's value."
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_instructions),
        ("human", "{text}"),
    ]
)

In [5]:
# Constrain the model's output to the Person schema, then feed it the prompt.
structured_llm = llm.with_structured_output(schema=Person)
chain = prompt | structured_llm



In [6]:
import pdfplumber

# Path to the PDF file to analyze.
ruta_pdf = 'docs/CV_Francisco_M_Martinez_Poyatos_2024.pdf'

# Collect the text of every page. Reassigning a single variable inside the page
# loop would leave only the last page's text available to later cells.
with pdfplumber.open(ruta_pdf) as pdf:
    # `or ""` guards against a page for which extract_text() yields nothing.
    textos_por_pagina = [pagina.extract_text() or "" for pagina in pdf.pages]

# Whole-document text, pages separated by a newline.
texto = "\n".join(textos_por_pagina)

ModuleNotFoundError: No module named 'pdfplumber'

In [None]:
# Run the extraction chain on the PDF text; as the cell's final expression,
# the result is displayed by the notebook.
chain.invoke({"text": texto})

# Prueba con datasets de Hugging Face

In [None]:
from datasets import load_dataset

# Fetch the dataset by its Hugging Face Hub identifier.
ds = load_dataset(
    "TrainingDataPro/ocr-text-detection-in-the-documents",
)

In [None]:
# Inspect the first record of the 'train' split to see which fields it has.
print(ds["train"][0])

In [None]:
import pdfplumber
import tempfile
from datasets import load_dataset

import os

# Load the dataset.
ds = load_dataset("TrainingDataPro/ocr-text-detection-in-the-documents")

# NOTE(review): assumes the first training record exposes the raw PDF bytes
# under the 'pdf' key -- adjust the field name if the dataset schema differs.
pdf_content = ds['train'][0]['pdf']

temp_pdf_path = None
try:
    # Write the bytes to a temporary file so pdfplumber can open it by path.
    # delete=False keeps the file on disk once the handle is closed, so it
    # has to be removed explicitly in the `finally` clause below.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        temp_pdf_path = temp_pdf.name
        temp_pdf.write(pdf_content)

    # Open the PDF with pdfplumber and print the text of each page.
    with pdfplumber.open(temp_pdf_path) as pdf:
        for num_pagina, pagina in enumerate(pdf.pages):
            texto = pagina.extract_text()
            print(f"Número de página: {num_pagina + 1}")
            print(f"Texto de la página {num_pagina + 1}:\n{texto}\n")
finally:
    # Clean up the temporary file even if writing or parsing failed.
    if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
        os.remove(temp_pdf_path)



