In [23]:
# imports
import asyncio
import html
import textwrap
from pathlib import Path

# packages
# Kelvin does not require the use of Pandas, but it's a great way to consume and organize results.
import pandas
from IPython.display import HTML, Markdown, display

# kelvin clients
from kelvin.api.document_index.async_client import KelvinDocumentIndexAsyncClient
from kelvin.api.document_index.commands.load_folder import load_folder
from kelvin.api.nlp.async_client import KelvinNLPAsyncClient

In [2]:
# set up the clients, both constructed with no arguments:
#   doc_client - document index client used below for listing, summarizing,
#                question answering, and searching documents
#   nlp_client - NLP client used below to extract monetary amounts from text
doc_client = KelvinDocumentIndexAsyncClient()
nlp_client = KelvinNLPAsyncClient()

In [14]:
# upload the files in the sample data folder
# (the progress bars printed by this cell report a queuing phase and an upload phase)
data_folder = Path("../data/bluth-sample")
await load_folder(data_folder, num_workers=4, progress=True)

Queuing files: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 2989.29it/s]
Uploading files: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 87.77it/s]


## Upload and deduplicate to centralized storage

Kelvin makes it easy to upload, organize, deduplicate, and share documents across a deal team.

In [34]:
# list the files we loaded
document_data = []
documents = await doc_client.get_documents()

for document in documents:
    for document_instance in await doc_client.get_document_instances(document["id"]):
        document_data.append(document_instance)

        
display(HTML("<h1>Document File Instances (deduplicated)</h1>"))
display(pandas.DataFrame(document_data).head())

Unnamed: 0,id,document_id,file_name,created_at,updated_at
0,1,1,List of Liabilities.docx,2023-04-21T18:08:53.569800,2023-04-21T18:08:53.569800
1,2,2,Employee Stock Ownership Plan for the Bluth Co...,2023-04-21T18:08:53.585452,2023-04-21T18:08:53.585452
2,3,3,Bluth Banana Cloud - Open Source.docx,2023-04-21T18:08:53.585917,2023-04-21T18:08:53.585917
3,4,4,Cornballer Patent.docx,2023-04-21T18:08:53.748395,2023-04-21T18:08:53.748395
4,5,5,SWOT Analysis.docx,2023-04-21T18:08:59.048763,2023-04-21T18:08:59.048763


## Automate summarization

Use Kelvin to automatically summarize documents across a variety of common formats, including Word, PDF, Excel, PowerPoint, emails, WordPerfect, images, and more.

In [37]:
# summarize the contents of all files
for document in documents[:5]:
    # get the first found file name (multiple names are possible with duplicate files)
    document_instances = await doc_client.get_document_instances(document["id"])
    file_name = document_instances[0]["file_name"]
    
    # get a default summarization of the document using GPT-3.5
    display(HTML(f"<h3>Summary: {file_name} (id={document['id']})</h3>"))
    try:
        summary = await doc_client.get_document_summary(document["id"], engine="gpt-3.5-turbo")
        display(HTML(f"<pre>{summary['summary']}</pre>"))
    except ValueError:
        display(HTML(f"<pre>No summary available.</pre>"))
    

## Answer questions about specific documents

You can use Kelvin to answer questions about specific documents.  Just pick a registered LLM engine and ask your question.

In [60]:
# answer questions
question = "How many debt instruments does the company have?"
answer = await doc_client.get_document_answer(1, "gpt-3.5-turbo", question)
display(HTML(f"<strong>Question:</strong> {question}"))
display(HTML(f"<strong>Answer:</strong> {answer['answer']}"))

In [89]:
# answer questions
question = "How do employees vest?"
answer = await doc_client.get_document_answer(2, "gpt-3.5-turbo", question)
display(HTML(f"<strong>Question:</strong> {question}"))
display(HTML(f"<strong>Answer:</strong> {answer['answer']}"))

## Build custom search workflows

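Write the exact search you want: the example below combines full-text search, LLM question answering, and monetary-amount extraction to build a salary table.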

In [90]:
# salary data
salary_data = []

# find all sentences or paragraphs containing salary
segments = await doc_client.search_document_segment_contents("salary")
for segment in segments:
    # get the first unique file name
    document_instances = await doc_client.get_document_instances(segment["document_id"])
    file_name = document_instances[0]["file_name"]
    
    # get the name of the employee
    employee_name = await doc_client.get_document_answer(segment["document_id"], "gpt-3.5-turbo", "Respond with only the name of the employee.")
    employee_bonus = await doc_client.get_document_answer(segment["document_id"], "gpt-3.5-turbo", "Does the employee have a bonus plan?.")
    
    # get the text and extract monetary amounts
    segment_text = await doc_client.get_document_segment(segment["id"])
    money_values = await nlp_client.get_money(segment_text['text'])
    
    for money in money_values['moneys']:
        salary_data.append({
            "file_name": file_name,
            "name": employee_name["answer"],
            "bonus": employee_bonus["answer"],
            "text": money['text'],
            "quantity": money['quantity'],
            "currency": money['currency'],
        })

display(HTML("<h3>Salary Table</h3>"))
display(pandas.DataFrame(salary_data))

Unnamed: 0,file_name,name,bonus,text,quantity,currency
0,Employment Agreement - Lucille Bluth.docx,Lucille Bluth.,"Yes, the employee has a bonus plan based on th...","$500,000,",500000.0,USD
1,Employment Agreement - Michael Bluth.docx,Michael Bluth.,"Yes, the employee has a bonus plan.","$450,000,",450000.0,USD
2,Employment Agreement - Buster Bluth.docx,Buster Bluth,"Yes, the employee is eligible to earn annual b...","$290,000,",290000.0,USD


## Semantic Similarity with Kelvin Vector and Kelvin Embeddings

Search using Kelvin Vector and Kelvin's legal-specific embeddings.

In [103]:
query = """Retention bonuses are taxable income to the employee and must be added to the employee's compensation 
in the year in which they are awarded. In view of this, the company, as an additional retention incentive, will
provide a "gross up" to employee income by paying the taxes for retention bonuses so that employees will receive 
the full amount indicated above "net of taxes."""

segments = await doc_client.search_document_segment_vectors(query=query, vector_type="en-001-small", k=3, threshold=10)
for segment in segments:
    # get the first unique file name
    document_instances = await doc_client.get_document_instances(segment["document_segment"]["document_id"])
    file_name = document_instances[0]["file_name"]
    
    # get the text and extract monetary amounts
    segment_text = await doc_client.get_document_segment(segment["document_segment"]["id"])
    
    display(HTML(f"<strong>{file_name}:</strong>"))
    display(HTML(f"<pre>{textwrap.fill(segment_text['text'], 80)}</pre><br />"))


## Semantic Similarity with Kelvin Vector and External Embeddings

You can also use Kelvin Vector with external embeddings, such as those from OpenAI or from Hugging Face models like T5. In general, we have found our own embeddings to outperform these generic embeddings.

In [104]:
query = """Retention bonuses are taxable income to the employee and must be added to the employee's compensation 
in the year in which they are awarded. In view of this, the company, as an additional retention incentive, will
provide a "gross up" to employee income by paying the taxes for retention bonuses so that employees will receive 
the full amount indicated above "net of taxes."""

segments = await doc_client.search_document_segment_vectors(query=query, vector_type="text-embedding-ada-002", k=3, threshold=10)
for segment in segments:
    # get the first unique file name
    document_instances = await doc_client.get_document_instances(segment["document_segment"]["document_id"])
    file_name = document_instances[0]["file_name"]
    
    # get the text and extract monetary amounts
    segment_text = await doc_client.get_document_segment(segment["document_segment"]["id"])
    
    display(HTML(f"<strong>{file_name}:</strong>"))
    display(HTML(f"<pre>{textwrap.fill(segment_text['text'], 80)}</pre><br />"))
