In [1]:
# heavily inspired by this course: https://learn.deeplearning.ai/functions-tools-agents-langchain/lesson/5/tagging-and-extraction 

# Tagging and Extraction Using OpenAI functions

In [None]:
# import os
# import openai

# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv()) # read local .env file
# openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from typing import List
from pydantic import BaseModel, Field

# Schema for the tagging function: one field, `urls`, holding a list of strings.
# NOTE: the class docstring and the Field description are not just documentation.
# convert_pydantic_to_openai_function copies both into the function schema
# (see the schema displayed by the next cell), so rewording them changes what
# is sent to the model.
class TaggingRefsUrls(BaseModel):
    """Extracting URLs from a paper references section."""
    urls: List[str] = Field(description="A list of urls from the references section of a paper")

In [3]:
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

# Display the OpenAI function schema generated from the pydantic model.
# The return value is not stored here; it is only shown as the cell output.
convert_pydantic_to_openai_function(TaggingRefsUrls)

{'name': 'TaggingRefsUrls',
 'description': 'Extracting URLs from a paper references section.',
 'parameters': {'description': 'Extracting URLs from a paper references section.',
  'properties': {'urls': {'description': 'A list of urls from the references section of a paper',
    'items': {'type': 'string'},
    'title': 'Urls',
    'type': 'array'}},
  'required': ['urls'],
  'title': 'TaggingRefsUrls',
  'type': 'object'}}

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

# Chat model shared by every chain in this notebook.
model = ChatOpenAI(temperature=0)

# Two-message prompt: a fixed system instruction, then the caller's text,
# which is substituted for the `{input}` placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Think carefully, and then tag the text as instructed"),
        ("user", "{input}"),
    ]
)

# The only function offered to the model is the TaggingRefsUrls schema.
tagging_functions = [convert_pydantic_to_openai_function(TaggingRefsUrls)]

# Bind both the function list and an explicit `function_call` naming
# TaggingRefsUrls, so these kwargs travel with every call made through
# `model_with_functions`.
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "TaggingRefsUrls"},
)

In [7]:
# Compose the prompt and the function-bound model into a single runnable.
tagging_urls_chain = prompt | model_with_functions

# Bare expression: displays the chain's repr as the cell output.
# WARNING: the ChatOpenAI repr includes the `openai_api_key` field, so clear
# or redact this cell's output before sharing or committing the notebook.
tagging_urls_chain

ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Think carefully, and then tag the text as instructed')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))])
| RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1170f0d10>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x11740ca50>, temperature=0.0, openai_api_key='sk-REDACTED', openai_proxy=''), kwargs={'functions': [{'name': 'TaggingRefsUrls', 'description': 'Extracting URLs from a paper references section.', 'parameters': {'description': 'Extracting URLs from a paper references section.', 'properties': {'urls': {'description': 'A list of urls from the references section of a paper', 'items': {'type': 'string'}, 'title': 'Urls', 'type': 'array'}}, 'required': ['urls'], 'title': 'TaggingRe

In [8]:
# Sample input: a paper's REFERENCES section kept verbatim, including URLs
# split across line breaks, stray diacritic marks and a trailing page number
# (apparently the result of copying from a PDF). Do not tidy it up: the messy
# text is exactly what the chain is being exercised on.
refs_string = """REFERENCES
Howard Chen, Huihan Li, Danqi Chen, and Karthik Narasimhan. Controllable text generation with
language constraints. arXiv preprint arXiv:2212.10466, 2022.
Xinyun Chen, Maxwell Lin, Nathanael Scharli, and Denny Zhou. Teaching large language models ¨
to self-debug. arXiv preprint arXiv:2304.05128, 2023.
Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser,
Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. Training verifiers to
solve math word problems. arXiv preprint arXiv:2110.14168, 2021.
Mingkai Deng, Jianyu Wang, Cheng-Ping Hsieh, Yihan Wang, Han Guo, Tianmin Shu, Meng Song,
Eric Xing, and Zhiting Hu. RLPrompt: Optimizing discrete text prompts with reinforcement
learning. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language
Processing, pp. 3369–3391, Abu Dhabi, United Arab Emirates, December 2022. Association for
Computational Linguistics. doi: 10.18653/v1/2022.emnlp-main.222. URL https://aclanthology.
org/2022.emnlp-main.222.
Chrisantha Fernando, Dylan Banarse, Henryk Michalewski, Simon Osindero, and Tim Rocktaschel. ¨
Promptbreeder: Self-referential self-improvement via prompt evolution. arXiv preprint
arXiv:2309.16797, 2023.
Noah Goodman. Meta-prompt: A simple self-improving language agent, 2023. URL https:
//noahgoodman.substack.com/p/meta-prompt-a-simple-self-improving.
Or Honovich, Uri Shaham, Samuel R. Bowman, and Omer Levy. Instruction induction: From few
examples to natural language task descriptions. In Proceedings of the 61st Annual Meeting of the
Association for Computational Linguistics (Volume 1: Long Papers), pp. 1935–1952, Toronto,
Canada, July 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.acl-long.
108. URL https://aclanthology.org/2023.acl-long.108.
1"""
# Without an output parser the chain returns the raw AIMessage: `content` is
# empty and the result sits in additional_kwargs['function_call']['arguments']
# as a JSON string.
tagging_urls_chain.invoke({"input": refs_string})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "urls": [\n    "arXiv:2212.10466",\n    "arXiv:2304.05128",\n    "arXiv:2110.14168",\n    "https://aclanthology.org/2022.emnlp-main.222",\n    "arXiv:2309.16797",\n    "https://noahgoodman.substack.com/p/meta-prompt-a-simple-self-improving",\n    "https://aclanthology.org/2023.acl-long.108"\n  ]\n}', 'name': 'TaggingRefsUrls'}})

In [10]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

# Rebuild the chain with JsonOutputFunctionsParser appended, so invoke()
# returns the function-call arguments as a plain dict instead of an AIMessage.
# This rebinds `tagging_urls_chain`; the parser-less version is discarded.
tagging_urls_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

# NOTE(review): the result mixes real URLs with bare arXiv identifiers
# (e.g. 'arXiv:2212.10466'), which are not URLs. Filter or convert them
# downstream if only resolvable links are wanted.
tagging_urls_chain.invoke({"input": refs_string})

{'urls': ['arXiv:2212.10466',
  'arXiv:2304.05128',
  'arXiv:2110.14168',
  'https://aclanthology.org/2022.emnlp-main.222',
  'arXiv:2309.16797',
  'https://noahgoodman.substack.com/p/meta-prompt-a-simple-self-improving',
  'https://aclanthology.org/2023.acl-long.108']}

## Extraction

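Extraction is similar to tagging, but instead of filling in a single set of fields it pulls multiple pieces of information out of the input text — here, a list of `Person` entries.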


In [None]:
from typing import Optional

# One extracted person. The docstring and Field descriptions are kept verbatim
# because they are presumably copied into the function schema sent to the
# model (the TaggingRefsUrls schema output earlier in the notebook shows this
# happening for that class).
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    # `default=None` makes the field optional on every pydantic version.
    # Under pydantic v2 an `Optional[int]` field without a default is still
    # *required* (it merely accepts None), which would push the model to
    # produce an age even when the text gives none, contradicting the
    # "do not guess" instruction used with this schema. Under pydantic v1
    # the implicit default was already None, so behaviour there is unchanged.
    age: Optional[int] = Field(default=None, description="person's age")

# Top-level extraction schema: a wrapper holding a list of Person entries, so
# one function call can report any number of people.
# NOTE: the docstring and the Field description are presumably copied into the
# generated function schema (the TaggingRefsUrls schema output earlier in the
# notebook shows this for that class), so treat them as part of the request.
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

# Generate the function schema for Information. The result is not stored, and
# because this is not the last expression of the cell it is presumably not
# displayed either. Move it to its own cell to inspect the schema.
convert_pydantic_to_openai_function(Information)

# Same pattern as the tagging section: offer a single function and name it
# explicitly in `function_call`.
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

# Call the bound model directly with a plain string (no prompt template yet).
# NOTE(review): this issues a real API request whose result is discarded
# unless this statement is the last one in its cell.
extraction_model.invoke("Joe is 30, his mom is Martha")

# Extraction prompt: the system message tells the model not to invent values
# that are missing from the text and to return partial records.
# NOTE(review): this rebinds `prompt`, the name the tagging chain was built
# from. Re-running the tagging cells after this point will pick up this
# extraction prompt instead.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Extract the relevant information, if not explicitly provided do not guess. Extract partial info",
        ),
        ("human", "{input}"),
    ]
)

# Sentence used to exercise each chain variant below.
sample_sentence = "Joe is 30, his mom is Martha"

# Variant 1: prompt + function-bound model, no output parser.
extraction_chain = prompt | extraction_model
extraction_chain.invoke({"input": sample_sentence})

# Variant 2: same chain with the JSON function-arguments parser appended.
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()
extraction_chain.invoke({"input": sample_sentence})

from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Final variant: JsonKeyOutputFunctionsParser with key_name="people".
# Presumably this returns only the value stored under "people" (the list of
# person records) rather than the whole arguments dict. No output is captured
# in the notebook to confirm, so verify on a run.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})