# Tagging and Extraction Using OpenAI functions

In [None]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it; the return value of
# load_dotenv is deliberately discarded by binding it to `_`.
_ = load_dotenv(find_dotenv()) # read local .env file
# Direct indexing (not .get) means a missing OPENAI_API_KEY raises KeyError
# here instead of failing later on the first API call.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

In [None]:
# Schema for the tagging function: two string fields for the model to fill in.
# NOTE(review): the class docstring and the Field descriptions are presumably
# copied into the generated function definition, i.e. they act as prompt text —
# confirm before rewording any of them.
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    # Typed as a plain str: the allowed values (`pos`, `neg`, `neutral`) are
    # stated only in the description, nothing here rejects other values.
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    # Also a free-form str; the ISO 639-1 hint lives only in the description.
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [None]:
# Display-only cell: the converted function definition for Tagging is not
# assigned to anything here, it is just shown as the cell's output.
convert_pydantic_to_openai_function(Tagging)

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [None]:
# Shared chat model for the whole notebook. Only temperature is set; no model
# name is passed, so whichever default ChatOpenAI uses applies.
model = ChatOpenAI(temperature=0)

In [None]:
# Function definitions offered to the model: just the converted Tagging schema.
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]

In [None]:
# Two-message prompt: a fixed system instruction plus the caller's text, which
# is substituted for the {input} placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [None]:
# Attach the function list to every call made through this model and name
# "Tagging" in function_call, so the request targets that specific function
# instead of leaving the choice to the model.
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [None]:
# Pipe the formatted prompt into the function-bound model. No output parser
# yet, so invoking this returns the model's raw message.
tagging_chain = prompt | model_with_functions 

In [None]:
# English sample input.
tagging_chain.invoke({"input": "I love langchain"})

In [None]:
# Italian sample input, to exercise the `language` field.
tagging_chain.invoke({"input": "non mi piace questo cibo"})

In [None]:
# Parse the function-call JSON out of the AI message instead of reading it by hand:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

In [None]:
# Rebinds tagging_chain: same prompt and bound model as before, with the
# parser appended as the last step.
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

In [None]:
# French sample input.
tagging_chain.invoke({"input": "Cette nourriture est terrible!"})

# Extraction
## Extraction is similar to tagging, but is used to extract multiple pieces of information

In [None]:
from typing import Optional
# Schema for one extracted person.
# NOTE(review): the class docstring and Field descriptions are presumably
# copied into the generated function definition — confirm before rewording.
class Person(BaseModel):
    """Information about a person."""
    # Mandatory: no default is given, so every extracted person needs a name.
    name: str = Field(description="person's name")
    # Explicit default=None keeps `age` genuinely optional. Without it,
    # pydantic v2 treats an Optional annotation as "required but nullable",
    # which forces the model to supply an age even when the text gives none.
    # pydantic v1 already defaulted Optional fields to None, so this is
    # backward-compatible there.
    age: Optional[int] = Field(default=None, description="person's age")

In [None]:
# Top-level extraction schema: wraps any number of Person entries in a single
# `people` list, so one function call can carry several people.
# NOTE(review): the class docstring and Field description are presumably
# copied into the generated function definition — confirm before rewording.
class Information(BaseModel):
    """Information to extract."""
    # Nested model list; each element follows the Person schema defined above.
    people: List[Person] = Field(description="List of info about people")

In [None]:
# Display-only cell: show the function definition generated from the nested
# Information model.
convert_pydantic_to_openai_function(Information)

In [None]:
# Same binding pattern as the tagging model: offer a single function and name
# it in function_call, here "Information".
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [None]:
# Call the bound model directly with a raw string — no prompt template and no
# output parser are involved yet.
extraction_model.invoke("Joe is 30, his mom is Martha and she is 65.")

In [None]:
# Rebinds the module-level name `prompt`: from here on, any cell that uses
# `prompt` gets this extraction instruction rather than the earlier tagging one.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info and don't provide data that has no info."),
    ("human", "{input}")
])

In [None]:
# Prompt piped into the Information-bound model; no parser, so the result is
# the model's raw message.
extraction_chain = prompt | extraction_model

In [None]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha and she is 65."})

In [None]:
# Rebinds extraction_chain with the JSON function-arguments parser appended.
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [None]:
# Same input as above, to compare the raw and the parsed output.
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha and she is 65."})

In [None]:
# This parser looks for one particular key in the parsed output and returns
# only the value stored under it:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [None]:
# Rebinds extraction_chain again; key_name="people" matches the single field
# declared on the Information model.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [None]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha and she is 65."})

# Applying and doing it for real

In [None]:
# Loading a blog post and extracting tag information from a sub-set of the text:
from langchain.document_loaders import WebBaseLoader
# The loader is only configured with the URL here; load() is what produces the
# documents (presumably fetching the page over the network at that point).
loader = WebBaseLoader("https://www.cio.com/article/480408/making-sense-of-sap-rise-4-key-considerations.html")
documents = loader.load()

In [None]:
# Only the first loaded document is used in the rest of the notebook.
doc = documents[0]

In [None]:
# Keep the first 10,000 characters of the document. Slicing a str counts
# characters, not words; a shorter text is simply returned whole.
page_content = doc.page_content[:10000]

In [None]:
# Sanity check: show the first 1,000 characters of the truncated text.
print(page_content[:1000])

In [None]:
# Schema for a whole-passage overview: three free-form string fields.
# NOTE(review): the class docstring and Field descriptions are presumably
# copied into the generated function definition — confirm before rewording.
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    # Unlike Tagging.language, no particular code format is requested here.
    language: str = Field(description="Provide the language that the content is written in.")
    # Declared as a single str, not a list: the keywords come back as one string.
    keywords: str = Field(description="Provide keywords related to the content.")

In [None]:
# One-element function list holding the converted Overview schema.
overview_tagging_function = [
    convert_pydantic_to_openai_function(Overview)
]
# Bind that list to the shared model and name "Overview" in function_call.
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)
# Rebinds tagging_chain (previously the sentiment/language tagging chain).
# NOTE(review): `prompt` is whatever the notebook bound last. When the cells
# run top to bottom that is the "Extract the relevant information..." prompt
# from the extraction section, not the original tagging prompt — confirm this
# is intended.
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [None]:
# Run the overview chain on the truncated article text.
tagging_chain.invoke({"input": page_content})

In [None]:
# Schema for one key consideration found in the article. Both fields are plain
# required strings with no Field description, so the only descriptive text on
# this model is the class docstring.
# NOTE(review): that docstring is presumably copied into the generated
# function definition — confirm before rewording.
class Considerations(BaseModel):
    """Information about the key considerations mentioned."""
    key_consideration: str
    summary: str


# Top-level schema for the article extraction: a list of Considerations items.
# The field name `key_considerations` is the same string later passed as
# key_name to JsonKeyOutputFunctionsParser, so the two must stay in sync.
class Info(BaseModel):
    """Information to extract"""
    key_considerations: List[Considerations]
    

In [None]:
# One-element function list holding the converted Info schema.
considerations_extraction_function = [
    convert_pydantic_to_openai_function(Info)
]
# Rebinds extraction_model: it previously targeted "Information" (people) and
# now targets "Info" (key considerations).
extraction_model = model.bind(
    functions=considerations_extraction_function, 
    function_call={"name":"Info"}
)
# Rebinds extraction_chain too. key_name must match the field declared on
# Info. `prompt` is still whichever prompt was bound last in the notebook.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="key_considerations")

In [None]:
# Run the extraction on the truncated article text.
extraction_chain.invoke({"input": page_content})

In [None]:
# Task-specific system message for the SAP RISE article. The text is sent to
# the model verbatim, so it is left untouched here.
template = """A article will be passed to you. Extract from it all the key considerations about SAP RISE that are mentioned by this article. 

Provide a clear and concise summary of all key considerations involved. 

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# Rebinds `prompt` once more, replacing the generic extraction prompt with the
# article-specific template above.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [None]:
# The chain has to be rebuilt: the previous extraction_chain still holds the
# old prompt object, so rebinding `prompt` alone does not change it.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="key_considerations")

In [None]:
extraction_chain.invoke({"input": page_content})

In [None]:
# Splitting into smaller chunks of text:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Only chunk_overlap is set (to 0); the chunk size is not passed, so the
# library default applies.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [None]:
# Split the full page text, not the 10,000-character `page_content` prefix.
splits = text_splitter.split_text(doc.page_content)

In [None]:
# Number of chunks produced.
len(splits)

In [None]:
def flatten(matrix):
    """Concatenate the rows of ``matrix`` into a single new list.

    Rows are visited in order and each row's items keep their order. An empty
    ``matrix`` yields an empty list. Only one level of nesting is removed.
    """
    return [item for row in matrix for item in row]

In [None]:
# Quick check of flatten on a small nested list.
flatten([[1, 2], [3, 4]])

In [None]:
# Inspect the first chunk produced by the splitter.
print(splits[0])

In [None]:
from langchain.schema.runnable import RunnableLambda

In [None]:
# Turn one long string into a list of {"input": chunk} dicts, i.e. the input
# shape extraction_chain expects. The comprehension variable `doc` is scoped to
# the comprehension, so it does not rebind the module-level `doc` document.
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [None]:
# Short input to show the shape of prep's output.
prep.invoke("what is SAP RISE?")

In [None]:
# Split -> extract per chunk -> merge. map() presumably applies
# extraction_chain to every element of prep's list, giving one result list per
# chunk, which flatten then merges into a single list.
# NOTE(review): `flatten` is a plain function; this relies on `|` accepting a
# callable as a chain step — confirm with the LangChain version in use.
chain = prep | extraction_chain.map() | flatten

In [None]:
# Run on the full page text (not the truncated page_content).
chain.invoke(doc.page_content)