## Basic Extraction

### Basic Extraction from Retriever

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.docarray import DocArrayInMemorySearch
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.document_loaders.web_base import WebBaseLoader

In [13]:
# Load the Wikipedia article for Charles Biddle; the result is used as `documents` below.
URL = "https://en.wikipedia.org/wiki/Charles_Biddle"
loader = WebBaseLoader(URL)
documents = loader.load()

In [14]:
# NOTE(review): `text_splitter` is created here but never applied in the visible
# notebook, so the loaded documents are indexed unsplit (the recorded output
# below shows a single document).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)
embeddings = OpenAIEmbeddings()
print(len(documents))

# Build an in-memory vector store over the loaded documents and expose it as a retriever.
db = DocArrayInMemorySearch.from_documents(documents=documents, embedding=embeddings)
retriever = db.as_retriever()

# Sanity check: query the retriever directly and display what comes back.
retriever.get_relevant_documents("first name")


1


[Document(page_content='\n\n\n\nCharles Biddle - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\nLanguages\n\nLanguage links are at the top of the page.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\n Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1Early life\n\n\n\n\n\n\n\n2Career\n\n\n\nToggle Career subsection\n\n\n\n\n\n2.1American Rev

In [15]:
# Function-calling spec for the extraction model: a single required
# `basic_info` object holding the person's first/last name (required) and
# birth/death year (optional), all typed as strings.
basic_info_function = {
    "name": "Basic_Information",
    "description": "Basic information to extract about the person subject of a text.",
    "parameters": {
        "title": "Basic Information",
        "description": "Basic information to extract about the person subject of a text.",
        "type": "object",
        "properties": {
            "basic_info": {
                "title": "Basic Info",
                "description": "Basic info about the person subject.",
                "type": "object",
                "properties": {
                    "first_name": dict(title="First Name", description="persons first name", type="string"),
                    "last_name": dict(title="Last Name", description="persons last name", type="string"),
                    "birth_year": dict(title="Birth Year", description="persons birth year", type="string"),
                    "death_year": dict(title="Death Year", description="persons death year", type="string"),
                },
                "required": ["first_name", "last_name"],
            },
        },
        "required": ["basic_info"],
    },
}

# List form expected by `.bind(functions=...)` further down.
extraction_functions = [basic_info_function]

In [16]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser, JsonOutputFunctionsParser
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.runnables import RunnableMap
from langchain_openai.chat_models import ChatOpenAI

In [17]:
# System prompt: tells the model to extract only what is stated in the article.
template = """A article will be passed to you. Extract from it the basic information from the article about the person who is the subject. 

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# Two-message chat prompt; the retrieved content is substituted for {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])


# Chat model (temperature 0) with the extraction function specs bound to every call.
basic_info_model = ChatOpenAI(temperature=0.0).bind(functions=extraction_functions)


# Pipeline: retrieve documents for x["question"] -> fill the prompt -> call the
# model -> parse the function-call output (a dict, per the recorded output below).
# NOTE(review): the retriever result is handed to the prompt as-is (whatever
# `get_relevant_documents` returns), not as joined text — confirm this is intended.
extraction_chain = (
    RunnableMap({
        "input": lambda x: retriever.get_relevant_documents(query=x["question"])
    }) 
    | prompt 
    | basic_info_model 
    | JsonOutputFunctionsParser()
)



In [18]:
# Run the retrieval-backed chain; the "question" value is used as the retriever query.
extraction_chain.invoke({"question": "Charles Biddle"})

{'basic_info': {'first_name': 'Charles',
  'last_name': 'Biddle',
  'birth_year': '1745',
  'death_year': '1821'}}

### Let's Pass a URL and Schema...

In [19]:
from langchain.text_splitter import CharacterTextSplitter

In [20]:
# Wikipedia biography pages used as extraction targets.
PREVOST_URL = "https://en.wikipedia.org/wiki/Jacques_Marcus_Prevost"
BOUQUET_URL = "https://en.wikipedia.org/wiki/Henry_Bouquet"
DICKINSON_URL = "https://en.wikipedia.org/wiki/John_Dickinson"

# Caller-supplied entity schema handed to the extraction chain: four string
# fields, of which only the first and last name are required.
BASIC_INFO_SCHEMA = {
    "properties": {
        "first_name": dict(title="First Name", description="persons first name", type="string"),
        "last_name": dict(title="Last Name", description="persons last name", type="string"),
        "birth_year": dict(title="Birth Year", description="persons birth year", type="string"),
        "death_year": dict(title="Death Year", description="persons death year", type="string"),
    },
    "required": ["first_name", "last_name"],
}

In [21]:
# Embedding model read as a module-level global by get_embeddings_for_query below.
embeddings = OpenAIEmbeddings()

def get_embeddings_for_query(x):
    """Load a web page, index its chunks, and fetch the top chunk per schema field.

    Args:
        x: Mapping with two keys:
            ``'url'``: address of the page to load with ``WebBaseLoader``.
            ``'schema'``: dict whose ``'properties'`` keys are used, one at a
            time, as retrieval queries (the property values are not read).

    Returns:
        ``{"input": [...]}`` where the list holds the ``page_content`` of the
        best-matching chunk for each property name, in schema order. A chunk
        that is the top hit for several properties is included only once, and
        a property whose query returns no documents contributes nothing.
    """
    wiki_url = x['url']
    schema = x['schema']

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    raw_docs = WebBaseLoader(wiki_url).load()
    split_docs = splitter.split_documents(raw_docs)

    # `embeddings` is the module-level embedding model defined in this notebook.
    retriever = DocArrayInMemorySearch.from_documents(
        documents=split_docs, embedding=embeddings
    ).as_retriever()

    rel_doc_list = []
    seen_contents = set()
    for key in schema['properties']:
        docs = retriever.get_relevant_documents(query=key)
        if not docs:
            # Nothing retrieved for this field; skip instead of failing on docs[0].
            continue
        top_content = docs[0].page_content
        if top_content in seen_contents:
            # Same chunk already selected for an earlier field; don't repeat it in the prompt.
            continue
        seen_contents.add(top_content)
        rel_doc_list.append(top_content)
    return {"input": rel_doc_list}

In [22]:
# Try the retrieval step on its own. Only the property names are used as
# queries, so the property values here are empty placeholders.
get_embeddings_for_query({"url": PREVOST_URL, "schema": {"properties": {"first_name": "", "last_name": "", "year_of_birth":"", "year of death": ""}}})

Created a chunk of size 1509, which is longer than the specified 1000
Created a chunk of size 1265, which is longer than the specified 1000


{'input': ['This page was last edited on 9 March 2023, at 10:55\xa0(UTC).\nText is available under the Creative Commons Attribution-ShareAlike License 4.0;\nadditional terms may apply.  By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.\n\n\nPrivacy policy\nAbout Wikipedia\nDisclaimers\nContact Wikipedia\nCode of Conduct\nDevelopers\nStatistics\nCookie statement\nMobile view\n\nToggle limited content width',
  "Marriage and family[edit]\nWhile in New York City convalescing, Jacques Marc had met Theodosia Stillwell Bartow. They married in Trinity Church in 1763. They had five children together, including Augustine James Frederick Prevost (1765–1842) and John Bartow Prevost (1766–1825).\nWhile Jacques was away fighting for the British in the West Indies, his wife Theodosia formed a relationship with an American colonel named Aaron Burr, who was ten years younger than her.

In [23]:
def get_extraction_function(entity_schema: dict) -> dict:
    """Wrap an entity schema in a function-calling spec named ``basic_info``.

    Args:
        entity_schema: Dict with a ``"properties"`` mapping and, optionally,
            a ``"required"`` list.

    Returns:
        A function spec whose single required parameter ``info`` is an array
        of objects shaped by ``entity_schema``.
    """
    item_schema = _convert_schema(entity_schema)
    parameters = {
        "type": "object",
        "properties": {"info": {"type": "array", "items": item_schema}},
        "required": ["info"],
    }
    return {
        "name": "basic_info",
        "description": "Basic information to extract about the person subject of a text.",
        "parameters": parameters,
    }

def _convert_schema(schema: dict) -> dict:
    """Turn ``{"properties": ..., "required": ...}`` into an object schema.

    Each property gets a ``"title"`` defaulting to its own key; ``"required"``
    defaults to an empty list when the input does not provide one.
    """
    properties = {}
    for field_name, field_spec in schema["properties"].items():
        # The field's own keys are spread last, so an explicit "title" wins over the default.
        properties[field_name] = {"title": field_name, **field_spec}
    return {
        "type": "object",
        "properties": properties,
        "required": schema.get("required", []),
    }

def get_llm_kwargs(function: dict) -> dict:
    """Build model keyword arguments that offer ``function`` and force its use.

    Args:
        function: A function spec dict; its ``"name"`` entry is read.

    Returns:
        A dict with ``"functions"`` (a one-element list holding ``function``)
        and ``"function_call"`` (``{"name": <function name>}``).
    """
    forced_call = {"name": function["name"]}
    return {"functions": [function], "function_call": forced_call}

In [24]:
def invoke_model_with_schema(x):
    """Call a fresh ChatOpenAI model, forcing the extraction function built from ``x["schema"]``.

    Args:
        x: Mapping with ``"schema"`` (entity schema passed to
            ``get_extraction_function``) and ``"input"`` (an object exposing
            ``.messages``, sent to the model).

    Returns:
        Whatever ``ChatOpenAI.invoke`` returns for those messages.
    """
    llm = ChatOpenAI(temperature=0.0)
    extraction_fn = get_extraction_function(x["schema"])
    messages = x['input'].messages
    return llm.invoke(messages, **get_llm_kwargs(extraction_fn))

In [25]:
# Best way I know how for now... - rough draft

from operator import itemgetter
from langchain_core.runnables import RunnableLambda

# Two parallel branches feed the model step:
#   "input"  -> retrieve page chunks for the schema fields, then render the prompt
#   "schema" -> pass the caller's schema through unchanged
# The model response is then parsed by JsonOutputFunctionsParser.
person_extraction_chain = (
    RunnableMap({
        "input": RunnableLambda(get_embeddings_for_query) | prompt,
        "schema": itemgetter("schema"),
    })
    | RunnableLambda(invoke_model_with_schema)
    | JsonOutputFunctionsParser()
)

In [26]:
# Extract the basic-info fields from the Jacques Marcus Prevost page.
person_extraction_chain.invoke({"url": PREVOST_URL, "schema": BASIC_INFO_SCHEMA})

Created a chunk of size 1509, which is longer than the specified 1000
Created a chunk of size 1265, which is longer than the specified 1000


{'info': [{'first_name': 'Jacques',
   'last_name': 'Prevost',
   'birth_year': '1736',
   'death_year': ''}]}

In [27]:
# Same chain and schema, run against the John Dickinson page.
person_extraction_chain.invoke({"url": DICKINSON_URL, "schema": BASIC_INFO_SCHEMA})

Created a chunk of size 3149, which is longer than the specified 1000
Created a chunk of size 3049, which is longer than the specified 1000
Created a chunk of size 4015, which is longer than the specified 1000
Created a chunk of size 2917, which is longer than the specified 1000
Created a chunk of size 1607, which is longer than the specified 1000
Created a chunk of size 2538, which is longer than the specified 1000
Created a chunk of size 2927, which is longer than the specified 1000
Created a chunk of size 2358, which is longer than the specified 1000
Created a chunk of size 1984, which is longer than the specified 1000
Created a chunk of size 4204, which is longer than the specified 1000
Created a chunk of size 2346, which is longer than the specified 1000
Created a chunk of size 1441, which is longer than the specified 1000
Created a chunk of size 7517, which is longer than the specified 1000
Created a chunk of size 2004, which is longer than the specified 1000


{'info': [{'first_name': 'John',
   'last_name': 'Dickinson',
   'birth_year': '1732',
   'death_year': '1808'}]}

In [30]:
# Create more complex schema

# Richer entity schema: name fields, a list of children, cause of death, and a
# required list of per-war views.
MORE_COMPLEX_INFO_SCHEMA = {
    'properties': {
        'first_name': {
            'title': 'First Name',
            'description': 'persons first name',
            'type': 'string'
        },
        'last_name': {
            'title': 'Last Name',
            'description': 'persons last name',
            'type': 'string'
        },
        'children': {
            'title': 'Children',
            'description': 'persons children',
            'type': 'array',
            'items': {
                'type': 'object',
                'title': 'Child',
                'description': 'Child of person',
                'properties': {
                    'first_name': {
                        'title': 'First Name',
                        'description': 'childs first name',
                        'type': 'string'
                    },
                    'last_name': {
                        'title': 'Last Name',
                        'description': 'childs last name',
                        'type': 'string'
                    }
                },
                'required': ['first_name', 'last_name']
            }
        },
        'cause_of_death': {
            'title': 'Cause of Death',
            'description': 'persons cause of death',
            'type': 'string'
        },
        'views_on_war': {
            'title': 'Views on War',
            'description': 'persons views on war',
            # `items` only applies to arrays; this was declared as 'object',
            # which left the {war, view} item schema without effect.
            'type': 'array',
            'items': {
                'type': 'object',
                'title': 'View on War',
                'description': 'View on War',
                'properties': {
                    'war': {
                        'title': 'War',
                        'description': 'war',
                        'type': 'string'
                    },
                    'view': {
                        'title': 'View',
                        'description': 'view on war',
                        'type': 'string'
                    }
                },
                'required': ['war', 'view']
            }
        },
    },
    'required': ['first_name', 'last_name', 'views_on_war']
}

# Run the extraction chain on the John Dickinson page with the more complex schema.
person_extraction_chain.invoke({"url": DICKINSON_URL, "schema": MORE_COMPLEX_INFO_SCHEMA})

Created a chunk of size 3149, which is longer than the specified 1000
Created a chunk of size 3049, which is longer than the specified 1000
Created a chunk of size 4015, which is longer than the specified 1000
Created a chunk of size 2917, which is longer than the specified 1000
Created a chunk of size 1607, which is longer than the specified 1000
Created a chunk of size 2538, which is longer than the specified 1000
Created a chunk of size 2927, which is longer than the specified 1000
Created a chunk of size 2358, which is longer than the specified 1000
Created a chunk of size 1984, which is longer than the specified 1000
Created a chunk of size 4204, which is longer than the specified 1000
Created a chunk of size 2346, which is longer than the specified 1000
Created a chunk of size 1441, which is longer than the specified 1000
Created a chunk of size 7517, which is longer than the specified 1000
Created a chunk of size 2004, which is longer than the specified 1000


{'info': [{'first_name': 'John',
   'last_name': 'Dickinson',
   'children': [{'first_name': 'William', 'last_name': 'Dickinson'},
    {'first_name': 'Walter', 'last_name': 'Dickinson'},
    {'first_name': 'Samuel', 'last_name': 'Dickinson'},
    {'first_name': 'Elizabeth', 'last_name': 'Dickinson'},
    {'first_name': 'Henry', 'last_name': 'Dickinson'},
    {'first_name': 'Elizabeth', 'last_name': 'Dickinson'},
    {'first_name': 'Rebecca', 'last_name': 'Dickinson'},
    {'first_name': 'Rachel', 'last_name': 'Dickinson'}],
   'cause_of_death': 'unknown'}]}

This implementation is a start. Next steps: developing better retrieval strategies, testing models, making sequential calls for the schema, token tracking, and prompt engineering.

In [33]:
# Minimal schema isolating the nested field: a required list of {war, view} entries.
SIMPLE_COMPLEX_INFO_SCHEMA = {
    'properties': {
        'views_on_war': {
            'title': 'Views on War',
            'description': 'persons views on war',
            # `items` only applies to arrays; this was declared as 'object',
            # which left the {war, view} item schema without effect.
            'type': 'array',
            'items': {
                'type': 'object',
                'title': 'View on War',
                'description': 'View on War',
                'properties': {
                    'war': {
                        'title': 'War',
                        'description': 'war',
                        'type': 'string'
                    },
                    'view': {
                        'title': 'View',
                        'description': 'view on war',
                        'type': 'string'
                    }
                },
                'required': ['war', 'view']
            }
        },
    },
    'required': ['views_on_war']
}

# Run the extraction chain on the John Dickinson page with only the nested views_on_war field.
person_extraction_chain.invoke({"url": DICKINSON_URL, "schema": SIMPLE_COMPLEX_INFO_SCHEMA})

Created a chunk of size 3149, which is longer than the specified 1000
Created a chunk of size 3049, which is longer than the specified 1000
Created a chunk of size 4015, which is longer than the specified 1000
Created a chunk of size 2917, which is longer than the specified 1000
Created a chunk of size 1607, which is longer than the specified 1000
Created a chunk of size 2538, which is longer than the specified 1000
Created a chunk of size 2927, which is longer than the specified 1000
Created a chunk of size 2358, which is longer than the specified 1000
Created a chunk of size 1984, which is longer than the specified 1000
Created a chunk of size 4204, which is longer than the specified 1000
Created a chunk of size 2346, which is longer than the specified 1000
Created a chunk of size 1441, which is longer than the specified 1000
Created a chunk of size 7517, which is longer than the specified 1000
Created a chunk of size 2004, which is longer than the specified 1000


{'info': [{'source': 'Ehrlich, Eugene and Gorton Carruth. The Oxford Illustrated Literary Guide to the United States. New York: Oxford University Press, 1982. p. 217. ISBN 0-19-503186-5'},
  {'source': "Lincoln's Little War by Webb B. Garrison, pg. 60"},
  {'source': 'The Thirteen Colonies: Travel Historic America pg. 62'},
  {'source': '"Student finds letter \'a link to Jefferson\'". CNN.com. December 8, 2009. Retrieved May 6, 2010.'},
  {'source': '"Odd Wisconsin Archives". Wisconsinhistory.org. March 29, 2006. Archived from the original on April 23, 2006. Retrieved September 12, 2012.'},
  {'source': 'Rabinowitz, Chloe. "EXCEPT MR. DICKINSON World Premiere to be Presented by 15th Street Friends". BroadwayWorld.com.'},
  {'source': 'Colbourn, Trevor H. (1959). "John Dickinson, Historical Revolutionary". The Pennsylvania Magazine of History and Biography. 83 (3): 271–292. JSTOR 20089207.'},
  {'source': 'Powell, John H. "John Dickinson and the Constitution." The Pennsylvania Magazine 