# Extraction with OpenAI Tools

Performing extraction has never been easier! OpenAI's tool-calling ability is a natural fit for it, because it lets you extract multiple elements of different types from the same piece of text.

Models from the 1106 release onward (e.g. `gpt-3.5-turbo-1106`, used below) use tools and support "parallel function calling", which makes this super easy.

In [None]:
!pip install langchain openai --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency 

In [None]:
import os
from google.colab import userdata
# Copy the key stored under 'OPENAI_API_KEY' in Colab's `userdata` into the
# process environment (presumably where the OpenAI client looks for it — confirm).
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [None]:
from typing import List, Optional

from langchain.chains.openai_tools import create_extraction_chain_pydantic
from langchain.chat_models import ChatOpenAI
from langchain.pydantic_v1 import BaseModel

In [None]:
# Make sure to use a recent model that supports tools
model = ChatOpenAI(model="gpt-3.5-turbo-1106")

In [None]:
# Pydantic is an easy way to define a schema
class Person(BaseModel):
    """Information about people to extract."""

    # NOTE(review): the docstring above appears to be forwarded to the model as
    # the tool description — confirm before rewording it.
    name: str  # required: no default is given
    age: Optional[int] = None  # optional: stays None when no age is supplied

In [None]:
chain = create_extraction_chain_pydantic(Person, model)

In [None]:
chain.invoke({"input": "jane is 2 and bob is 3"})

[Person(name='jane', age=2), Person(name='bob', age=3)]

In [None]:
# Let's define another element
class Class(BaseModel):
    """Information about classes to extract."""

    # Both fields are required (neither has a default).
    teacher: str
    students: List[str]  # plain strings, not nested Person objects

In [None]:
chain = create_extraction_chain_pydantic([Person, Class], model)

In [None]:
chain.invoke({"input": "jane is 2 and bob is 3 and they are in Mrs Sampson's class"})

[Person(name='jane', age=2),
 Person(name='bob', age=3),
 Class(teacher='Mrs Sampson', students=['jane', 'bob'])]

Example: extracting entities from the San Francisco 49ers Wikipedia article

In [None]:
# !pip install wikipedia --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


In [None]:
# Load Wikipedia documents for the query; each document's `page_content`
# text is what gets passed to the extraction chain.
from langchain.document_loaders import WikipediaLoader

docs = WikipediaLoader(query="San Francisco 49ers", load_max_docs=2).load()
# Preview the first 49 characters of the first document.
docs[0].page_content[:49]

'The San Francisco 49ers (also written as the San '

In [None]:
len(docs[0].page_content)

4000

In [None]:
docs[1].page_content[:49]

"The 2022 season was the San Francisco 49ers' 73rd"

In [None]:
chain.invoke({"input": docs[0].page_content})

[Person(name='Tony Morabito', age=47),
 Person(name='Victor Morabito', age=45),
 Class(teacher='Buck Shaw', students=['Frankie Albert', 'Dicky Moegle'])]

## Under the hood

Under the hood, this is a simple chain:

```python
from typing import Union, List, Type, Optional

from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain.utils.openai_functions import convert_pydantic_to_openai_tool
from langchain.schema.runnable import Runnable
from langchain.pydantic_v1 import BaseModel
from langchain.prompts import ChatPromptTemplate
from langchain.schema.messages import SystemMessage
from langchain.schema.language_model import BaseLanguageModel

_EXTRACTION_TEMPLATE = """Extract and save the relevant entities mentioned \
in the following passage together with their properties.

If a property is not present and is not required in the function parameters, do not include it in the output."""  # noqa: E501


def create_extraction_chain_pydantic(
    pydantic_schemas: Union[List[Type[BaseModel]], Type[BaseModel]],
    llm: BaseLanguageModel,
    system_message: str = _EXTRACTION_TEMPLATE,
) -> Runnable:
    """Assemble a prompt -> tool-bound model -> parser pipeline for extraction.

    Args:
        pydantic_schemas: One pydantic model class, or a list of them. A single
            class is wrapped in a one-element list.
        llm: Language model that the converted tools are bound to.
        system_message: Text of the system message placed before the user
            message; defaults to ``_EXTRACTION_TEMPLATE``.

    Returns:
        A runnable that formats the ``{input}`` variable into the prompt, calls
        the tool-bound model, and parses the result with ``PydanticToolsParser``
        configured with the same schemas.
    """
    # Normalise the argument so the rest of the function always sees a list.
    schemas = (
        pydantic_schemas
        if isinstance(pydantic_schemas, list)
        else [pydantic_schemas]
    )

    extraction_prompt = ChatPromptTemplate.from_messages(
        [("system", system_message), ("user", "{input}")]
    )

    # One OpenAI tool definition per schema, bound to the model.
    tool_specs = [convert_pydantic_to_openai_tool(schema) for schema in schemas]
    tool_bound_llm = llm.bind(tools=tool_specs)

    return extraction_prompt | tool_bound_llm | PydanticToolsParser(tools=schemas)
```

In [None]:
# create an extraction chain for american football data
from langchain.chains.openai_tools import create_extraction_chain_pydantic

# Load Wikipedia documents for the 2023 season query; each document's
# `page_content` is sent through the extraction chain later in this cell.
sf49ers2023 = WikipediaLoader(query="2023_San_Francisco_49ers_season", load_max_docs=2).load()

class GameSummaries(BaseModel):
    """Information about each week's game."""

    # Only `week` and `opponent` are required; every other field defaults to
    # None, so a game can be recorded even when those details are missing.
    week: str  # declared as a string, not an int
    opponent: str
    result: Optional[str] = None
    date: Optional[str] = None
    time: Optional[str] = None
    weather: Optional[str] = None
    attendance: Optional[int] = None

# Extraction chain that returns GameSummaries records for the text it is given.
# NOTE: despite its name, `niner_players` extracts game summaries, not players;
# the name is kept so any other cell that refers to it keeps working.
niner_players = create_extraction_chain_pydantic(GameSummaries, model)

# One entry per successfully processed document, in document order.
extracted_summaries = []
for i, doc in enumerate(sf49ers2023):
    try:
        # Keep the try body to the single call that can fail.
        summaries = niner_players.invoke({"input": doc.page_content})
    except Exception as exc:
        # Best-effort: skip a document that fails, but report why. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate so the cell can still be interrupted.
        print(f"Skipping {i}: {exc!r}")
        continue
    extracted_summaries.append(summaries)
extracted_summaries

[[GameSummaries(week='1', opponent='Pittsburgh Steelers', result=None, date=None, time=None, weather=None, attendance=None)],
 [GameSummaries(week='16', opponent='Washington Commanders', result='win', date=None, time=None, weather=None, attendance=None),
  GameSummaries(week='15', opponent='Seattle Seahawks', result='win', date=None, time=None, weather=None, attendance=None),
  GameSummaries(week='14', opponent='Tampa Bay Buccaneers', result=None, date=None, time=None, weather=None, attendance=None),
  GameSummaries(week='13', opponent='Miami Dolphins', result=None, date=None, time=None, weather=None, attendance=None),
  GameSummaries(week='12', opponent='New Orleans Saints', result=None, date=None, time=None, weather=None, attendance=None),
  GameSummaries(week='11', opponent='Arizona Cardinals', result=None, date=None, time=None, weather=None, attendance=None),
  GameSummaries(week='10', opponent='Los Angeles Chargers', result=None, date=None, time=None, weather=None, attendance=None

It's only able to get the week and opponent for 2023, but it is able to grab the result from last year's (2022) week 16 matchup. As of 12/5/23, only weeks 1 through 13 have occurred.

---

Music Data Extraction Example

In [None]:
# sample data creation
import pandas as pd

# Track listing for each album, in the order the rows should appear.
tracklists = {
    "Currents": [
        "Let It Happen",
        "Nangs",
        "The Moment",
        "Yes Im Changing",
        "Eventually",
        "Gossip",
        "The Less I Know the Better",
        "Past Life",
        "Disciples",
        "Cause Im a Man",
        "Reality in Motion",
        "Love/Paranoia",
        "New Person, Same Old Mistakes",
    ],
    "The Slow Rush": [
        "One More Year",
        "Instant Destiny",
        "Borderline",
        "Posthumous Forgiveness",
        "Breathe Deeper",
        "Tomorrows Dust",
        "On Track",
        "Lost in Yesterday",
        "Is It True",
        "It Might Be Time",
        "Glimmer",
        "One More Hour",
    ],
}

# Flatten into parallel columns: one row per song, album name repeated per song.
album_column = [album for album, songs in tracklists.items() for _ in songs]
song_column = [song for songs in tracklists.values() for song in songs]

# Column-oriented table data; every row belongs to the same artist.
data = {
    "Artist": ["Tame Impala"] * len(song_column),
    "Album": album_column,
    "Song": song_column,
}

# Build the DataFrame from the column data.
df = pd.DataFrame(data)

# Render the whole table as one plain-text string and show it.
df_str = df.to_string()
print(df_str)

         Artist          Album                           Song
0   Tame Impala       Currents                  Let It Happen
1   Tame Impala       Currents                          Nangs
2   Tame Impala       Currents                     The Moment
3   Tame Impala       Currents                Yes Im Changing
4   Tame Impala       Currents                     Eventually
5   Tame Impala       Currents                         Gossip
6   Tame Impala       Currents     The Less I Know the Better
7   Tame Impala       Currents                      Past Life
8   Tame Impala       Currents                      Disciples
9   Tame Impala       Currents                 Cause Im a Man
10  Tame Impala       Currents              Reality in Motion
11  Tame Impala       Currents                  Love/Paranoia
12  Tame Impala       Currents  New Person, Same Old Mistakes
13  Tame Impala  The Slow Rush                  One More Year
14  Tame Impala  The Slow Rush                Instant Destiny
15  Tame

In [None]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

import json

# musical system message
musical_system_message = """A text document will be passed to you. Extract from it all musical information about the artist, album and songs that are mentioned in this document.
Do not extract the name of the article itself. If no artist, album or songs are mentioned that's fine - you don't need to extract any! Just return an empty list.
Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# musical prompt: the system message above, then the `{input}` text as the human turn
musical_prompt = ChatPromptTemplate.from_messages([("system", musical_system_message), ("human", "{input}")])
# NOTE(review): bare expression in the middle of the cell — presumably left over
# from inspecting the prompt; it has no effect on the chain built below.
musical_prompt

# Function output schema
class Album(BaseModel):
    """Information about musical album."""

    # Both fields are required; each `description` is attached to the field via Field().
    album_name: str = Field(description="album name")
    album_songs: List[str] = Field(description="songs")  # song titles as plain strings

class Artist(BaseModel):
    """Information about musical artist."""

    artist_name: str = Field(description="artist name")
    # Nested schema: each entry is an Album with its own song list.
    # NOTE(review): `Optional` is not imported in this cell (only `List` is); it
    # resolves only because an earlier cell imported it from `typing`.
    # NOTE(review): no explicit default is given, so whether this field ends up
    # required in the generated schema may depend on the pydantic version — confirm.
    albums: Optional[List[Album]] = Field(description="albums")


# Use a recent model that supports function calling
model = ChatOpenAI(model="gpt-3.5-turbo-1106")


def _function_call_arguments(message):
    """Decode the JSON arguments of the function call carried by the model reply."""
    raw_arguments = message.additional_kwargs["function_call"]["arguments"]
    return json.loads(raw_arguments)


# OpenAI function definition derived from the Artist schema
function = [convert_pydantic_to_openai_function(Artist)]

# Bind the function definition and request the "Artist" function by name
artist_model = model.bind(functions=function, function_call={"name": "Artist"})

# prompt -> function-calling model -> parsed arguments (a plain dict)
chain = musical_prompt | artist_model | _function_call_arguments

extracted_response = chain.invoke({"input": df_str})

In [None]:
extracted_response

{'artist_name': 'Tame Impala',
 'albums': [{'album_name': 'Currents',
   'album_songs': ['Let It Happen',
    'Nangs',
    'The Moment',
    'Yes Im Changing',
    'Eventually',
    'Gossip',
    'The Less I Know the Better',
    'Past Life',
    'Disciples',
    'Cause Im a Man',
    'Reality in Motion',
    'Love/Paranoia',
    'New Person, Same Old Mistakes']},
  {'album_name': 'The Slow Rush',
   'album_songs': ['One More Year',
    'Instant Destiny',
    'Borderline',
    'Posthumous Forgiveness',
    'Breathe Deeper',
    'Tomorrows Dust',
    'On Track',
    'Lost in Yesterday',
    'Is It True',
    'It Might Be Time',
    'Glimmer',
    'One More Hour']}]}