# KOR practise


## SETUP


- https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/Expert%20Structured%20Output%20(Using%20Kor).ipynb -> greg notebook
- https://github.com/eyurtsev/kor -> github repo for KOR
- https://eyurtsev.github.io/kor/tutorial.html -> KOR documentation
- https://www.youtube.com/watch?v=xZzvwR9jdPA -> video as how KOR is used by Greg on youtube


In [35]:
# kor
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# langchain
from langchain_openai import ChatOpenAI
from langchain.llms import OpenLM

# standard helpers
import pandas as pd
import requests
import time
import json
from datetime import datetime

# text helpers
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# for token counting
from langchain.callbacks import get_openai_callback

In [36]:
def printOutput(output):
    print(json.dumps(output, sort_keys=True, indent=3))

In [37]:
# get openai api key
from dotenv import find_dotenv, load_dotenv
import os

_ = load_dotenv(find_dotenv())
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# OPENAI_API_KEY

# llm
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    # model_name="gpt-4",
    temperature=0,
    max_tokens=2000,
    openai_api_key=OPENAI_API_KEY,
)

llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x11fd14530>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x11fd18da0>, temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='', max_tokens=2000)

## KOR - HELLO WORLD


In [38]:
# create an object that holds the info about the fields we need to extract
person_schema = Object(
    id="person",  # this is the parent in the output
    # describe about this object
    description="Personal information about a person",
    # what fields to capture from the text
    attributes=[
        Text(id="first-name", description="The first name of a person.")  # key
    ],
    # some examples what we need
    examples=[
        ("Alice and Bob are friends", [
         {"first_name": "Alice"}, {"first_name": "Bob"}])
    ],
    many=True,
)

In [39]:
# create a chain to extract information
chain = create_extraction_chain(llm, person_schema)
chain

LLMChain(prompt=ExtractionPromptTemplate(input_variables=['text'], output_parser=KorParser(encoder=<kor.encoders.csv_data.CSVEncoder object at 0x11e9ae570>, schema_=Object(id='person', description='Personal information about a person', many=True, attributes=[Text(id='first-name', description='The first name of a person.', many=False, examples=())], examples=[('Alice and Bob are friends', [{'first_name': 'Alice'}, {'first_name': 'Bob'}])])), encoder=<kor.encoders.csv_data.CSVEncoder object at 0x11e9ae570>, node=Object(id='person', description='Personal information about a person', many=True, attributes=[Text(id='first-name', description='The first name of a person.', many=False, examples=())], examples=[('Alice and Bob are friends', [{'first_name': 'Alice'}, {'first_name': 'Bob'}])]), type_descriptor=<kor.type_descriptors.TypeScriptDescriptor object at 0x11f3ff2f0>, instruction_template=PromptTemplate(input_variables=['format_instructions', 'type_description'], template="Your goal is to

In [40]:
# text to extract the info from
text = """
    My name is Bobby.
    My sister's name is Rachel.
    My brother's name Joe. My dog's name is Spot
"""

text

"\n    My name is Bobby.\n    My sister's name is Rachel.\n    My brother's name Joe. My dog's name is Spot\n"

In [43]:
output = chain.run(text=(text))["data"]
printOutput(output)
# output

{
   "person": [
      {
         "first-name": "Bobby"
      },
      {
         "first-name": "Rachel"
      },
      {
         "first-name": "Joe"
      }
   ]
}


In [46]:
output = chain.run(text=("The dog went to the park"))["data"]
output

{'person': [{'first-name': ''}]}

## MULTIPLE FIELDS


In [47]:
# define schema
plant_schema = Object(
    id="plant",
    description="Information about a plant",
    attributes=[
        Text(
            id="plant_type",
            description="The common name of the plant"
        ),
        Text(
            id="color",
            description="The color of plant"
        ),
        Number(
            id="rating",
            description="The rating of the plant"
        ),
    ],
    examples=[
        (
            "Roses are red, lilies are white and an 8 out of 10.",
            [
                {"plant_type": "Roses", "color": "red"},
                {"plant_type": "Lily", "color": "white", "rating": 8}
            ]
        )
    ],
)

In [49]:
plant_schema

Object(id='plant', description='Information about a plant', many=False, attributes=[Text(id='plant_type', description='The common name of the plant', many=False, examples=()), Text(id='color', description='The color of plant', many=False, examples=()), Number(id='rating', description='The rating of the plant', many=False, examples=())], examples=[('Roses are red, lilies are white and an 8 out of 10.', [{'plant_type': 'Roses', 'color': 'red'}, {'plant_type': 'Lily', 'color': 'white', 'rating': 8}])])

In [51]:
text = "Palm trees are brown with a 6 rating. Sequoia trees are green"
chain = create_extraction_chain(llm, plant_schema)

In [52]:
output = chain.run(text=text)["data"]
printOutput(output)

{
   "plant": [
      {
         "color": "brown",
         "plant_type": "Palm trees",
         "rating": "6.0"
      },
      {
         "color": "green",
         "plant_type": "Sequoia trees",
         "rating": ""
      }
   ]
}


## WORKING WITH LISTS