# Build an Extraction Chain
본 튜토리얼에서는 비정형 텍스트로부터 정형 데이터를 추출하는 실습을 진행합니다.

In [1]:
from langchain_openai import ChatOpenAI

# Chat model instance reused by the extraction chains built below.
llm = ChatOpenAI(model="gpt-3.5-turbo")

## The Schema
- 우선 텍스트에서 어떤 정보를 추출할지 정해야 한다.
- 여기서는 Pydantic으로 개인 정보를 추출하기 위한 스키마를 정의한다.


In [2]:
from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field

class Person(BaseModel):
    """Information about a person."""

    # The class docstring above doubles as the description of the Person
    # schema that is sent to the LLM, so a clear one can improve extraction.
    #
    # Conventions shared by the fields below:
    # - Every field is Optional with a None default, which lets the model
    #   decline to extract a value it cannot find.
    # - Every field carries a `description`; the LLM reads it, so a good
    #   description can improve the results.
    name: Optional[str] = Field(None, description="The name of the person")
    hair_color: Optional[str] = Field(
        None,
        description="The color of the person's hair if known",
    )
    height_in_meters: Optional[str] = Field(
        None,
        description="Height measured in meters",
    )

스키마를 정의할 때는 다음 두 가지 모범 사례를 따르는 것이 좋다.
1. 속성(attribute)과 스키마 자체를 문서화한다.
    이 정보는 LLM에 전달되어 추출 결과의 품질을 높이는 데 사용된다. (부연 설명 역할)
2. LLM이 정보를 지어내도록 강제해서는 안 된다. 위에서는 각 속성에 Optional을 사용해서, 답을 모르는 경우 LLM이 None을 반환할 수 있도록 했다.

# Extractor

In [3]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
# Custom prompt that carries the extraction instructions.
# Two ways to extend it:
#   - add examples to the template to improve extraction quality;
#   - add extra input variables for context (e.g. metadata about the
#     document the text was extracted from).
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. Only extract relevant "
            "information from the text. If you do not know the value of "
            "attribute asked to extract, return null for the attribute's value",
        ),
        # Reference examples would be inserted at this position with
        # MessagesPlaceholder('examples'); see the how-to about improving
        # performance with reference examples.
        ("human", "{text}"),
    ]
)

구조화된 출력을 얻으려면 function/tool calling을 지원하는 모델을 사용해야 한다.

In [6]:
# Bind the Person schema to the model, then pipe the prompt into it.
person_extractor = llm.with_structured_output(schema=Person)
runnable = prompt | person_extractor

In [8]:
# The sentence gives the height in feet, while the schema field asks for meters.
text = "Alan Smith is 6 feet tall and has blond hair."
runnable.invoke({"text": text})

Person(name='Alan Smith', hair_color='blond', height_in_meters='1.83')

# Multiple Entities
- 대부분의 경우에는 하나의 엔티티가 아니라 여러 개의 엔티티를 추출하게 된다.
- 이때는 Pydantic 모델 안에 다른 모델을 중첩하여(예: `List[Person]`) 처리한다.

In [9]:
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field

class Person(BaseModel):
    """Information about a person."""

    # The class docstring above is sent to the LLM as the description of the
    # Person schema, and it can help to improve extraction results.
    #
    # Note that:
    # 1. Each field is Optional with an explicit `default=None` -- this allows
    #    the model to decline to extract it.
    # 2. Each field has a `description` -- this description is used by the LLM.
    #    Having a good description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    # The keyword must be spelled `default`: pydantic v1's Field() accepts
    # unknown keywords as extra schema metadata instead of raising, so a
    # misspelling silently leaves the field without an explicit default.
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )

class Data(BaseModel):
    """Extracted data about people."""

    # Wrapper model: holding Person entries in a list is what allows
    # multiple entities to be extracted at once. `Field(...)` marks the
    # field as required, the same as a bare annotation with no default.
    people: List[Person] = Field(...)

In [10]:
# Same prompt as before, but the model is now bound to the Data wrapper schema.
people_extractor = llm.with_structured_output(schema=Data)
runnable = prompt | people_extractor

# Two people are mentioned; Anna's hair colour is given only by reference to Jeff's.
text = "My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me."
runnable.invoke({"text": text})

Data(people=[Person(name='Jeff', hair_color='black', height_in_meters='1.83'), Person(name='Anna', hair_color='black', height_in_meters=None)])