In [7]:
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain  # noqa E501
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import create_qa_with_sources_chain, RetrievalQA

import itertools as it
from langchain.callbacks import get_openai_callback


from llama_index import VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index import Document
from llama_index import LLMPredictor
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

import pandas as pd
import json
import re

In [131]:
from openai import OpenAI
client = OpenAI()

print(client.models.list().data)

[Model(id='text-search-babbage-doc-001', created=1651172509, object='model', owned_by='openai-dev'), Model(id='gpt-4', created=1687882411, object='model', owned_by='openai'), Model(id='gpt-3.5-turbo-16k', created=1683758102, object='model', owned_by='openai-internal'), Model(id='curie-search-query', created=1651172509, object='model', owned_by='openai-dev'), Model(id='text-davinci-003', created=1669599635, object='model', owned_by='openai-internal'), Model(id='text-search-babbage-query-001', created=1651172509, object='model', owned_by='openai-dev'), Model(id='babbage', created=1649358449, object='model', owned_by='openai'), Model(id='babbage-search-query', created=1651172509, object='model', owned_by='openai-dev'), Model(id='text-babbage-001', created=1649364043, object='model', owned_by='openai'), Model(id='text-similarity-davinci-001', created=1651172505, object='model', owned_by='openai-dev'), Model(id='gpt-3.5-turbo-1106', created=1698959748, object='model', owned_by='system'), Mo

In [22]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import json
import openai
import os

with open('../local_settings.json') as f:
    data = json.load(f)

# Set form recogniser client
credential = AzureKeyCredential(data["FORM_KEY"])
document_analysis_client = DocumentAnalysisClient(data["FORM_ENDPOINT"], credential)



# This example also requires an OpenAI API key
os.environ['OPENAI_API_KEY'] = data['OPENAI_API_KEY']
openai.api_key = os.environ['OPENAI_API_KEY']

from azure.ai.formrecognizer import FormRecognizerClient
form_recognizer_client = FormRecognizerClient(data["FORM_ENDPOINT"], credential)

In [23]:
path = "../data/example/Elastacloud-Brand-Book.pdf"

# Analyze the document
with open(path, "rb") as f:
    poller = document_analysis_client.begin_analyze_document("prebuilt-document", f)
    result = poller.result()

In [106]:

map_prompt = """
Consider the following text as a collection of potential unstructured data:
"{text}"
###
Report back a list of possible data fields to extract ordered by relevance.
FIELDS:
"""
map_prompt_template = PromptTemplate(template=map_prompt,
                                     input_variables=["text"])


reduce_prompt = """
Make a curated list from the possible fields delimited by triple backquotes extracted from a text.
Return a list with only the name of the Fields that could be extracted, separated by commas.
```{text}```
"""
reduce_prompt_template = PromptTemplate(template=reduce_prompt,
                                        input_variables=["text"])


def summarization_chain(verbose=False):
    llm = OpenAI(temperature=0, max_tokens=512)

    map_chain = LLMChain(llm=llm, prompt=map_prompt_template, verbose=verbose)
    reduce_chain = LLMChain(llm=llm,
                            prompt=reduce_prompt_template,
                            verbose=verbose)

    combine_document_chain = StuffDocumentsChain(
        llm_chain=reduce_chain,
        document_variable_name="text",
        verbose=verbose,
    )

    mapreduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        combine_document_chain=combine_document_chain,
        document_variable_name=combine_document_chain.document_variable_name,
        verbose=verbose
    )

    return mapreduce_chain


def split_text(text,
               separators=["\n\n", "\n", " "],
               chunk_size=3000,
               chunk_overlap=500):
    text_splitter = RecursiveCharacterTextSplitter(
        separators=separators + [""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    docs = text_splitter.create_documents([text])
    return docs


def split_in_chuncks(docs, number_of_chunks):

    chunk_size = (len(docs) // number_of_chunks)

    groups = [
        [group for _, group in enumerated_group]
        for _, enumerated_group in it.groupby(
                enumerate(docs),
                key=lambda e: e[0] // chunk_size
                )
            ]
    return groups


def summarize_text(text, number_of_chunks, include_costs=False):
    with get_openai_callback() as cb:
        chain = summarization_chain()
        docs = split_text(text)
        groups = split_in_chuncks(docs, number_of_chunks)
        summaries = [chain.run(group) for group in groups]
        joined_summary = "\n\n".join(summaries)

        if include_costs:
            return joined_summary, cb
        return joined_summary


In [107]:
fields = summarize_text(result.content, 1)
print(fields)


Brand Name, Date, Vision, Mission, Values, Brand Positioning, Brand Proposition, Brand Strap Line, Elevator Pitch, Core Messages, Business Objectives, Knowledge, Expertise, Intelligence-led Data Solutions, Global Brand Leaders, Talent, Cutting-edge, Ground-breaking Data Science and Intelligence, Global Community, Culture of Pioneering, Freedom to Experiment, Big Challenges, Problem-solving attitude, Disruptive/Maverick behavior, Encouragement of experimentation, Promotion of collaboration, Emphasis on creativity, Recognition of the importance of play, Participation in company-wide tournaments, Commitment to giving back, Investment in projects that benefit humanity, Sponsorship of computer labs, Promotion of inclusivity, Support of collaboration, Nurturing of employees, Valuing of employees, Encouragement of a healthy work-life balance, Location, Mentoring activities, Values, Sponsorship, Culture, Core competencies, Strategic advisors, Team Members, Company Culture, Logo, Icon, Brand S

In [108]:
fields.split(",")

['\nBrand Name',
 ' Date',
 ' Vision',
 ' Mission',
 ' Values',
 ' Brand Positioning',
 ' Brand Proposition',
 ' Brand Strap Line',
 ' Elevator Pitch',
 ' Core Messages',
 ' Business Objectives',
 ' Knowledge',
 ' Expertise',
 ' Intelligence-led Data Solutions',
 ' Global Brand Leaders',
 ' Talent',
 ' Cutting-edge',
 ' Ground-breaking Data Science and Intelligence',
 ' Global Community',
 ' Culture of Pioneering',
 ' Freedom to Experiment',
 ' Big Challenges',
 ' Problem-solving attitude',
 ' Disruptive/Maverick behavior',
 ' Encouragement of experimentation',
 ' Promotion of collaboration',
 ' Emphasis on creativity',
 ' Recognition of the importance of play',
 ' Participation in company-wide tournaments',
 ' Commitment to giving back',
 ' Investment in projects that benefit humanity',
 ' Sponsorship of computer labs',
 ' Promotion of inclusivity',
 ' Support of collaboration',
 ' Nurturing of employees',
 ' Valuing of employees',
 ' Encouragement of a healthy work-life balance',
 ' 

In [77]:
fields[10:].replace("\n", "").replace(" ", "")[-20:]

'lationships","\'Magic'

In [111]:
json_template = json.dumps({"Results": [dict(zip(fields.split(","), ["<answer>"]*len(fields.split(","))))]})


system = f"""
You are an assistant that given a text extracted using OCR from a document will extract user provided data fields.
Fields can have multiple formats.
Write your output as a JSON with the format {json_template}.
If there is a field that you can not find, set it a null.
If there is any additional information of feedback from the infromation extraction, add a {{"notes": "<additional-information>"}}
"""   # noqa E501


def extract_unstructured(extracted_text, system):

    documents = [Document(text=extracted_text.content)]

    node_parser = SimpleNodeParser.from_defaults(chunk_size=4096,
                                                 chunk_overlap=200)
                                                 
    llm = ChatOpenAI(temperature=0, max_tokens=512)
    llm_predictor = LLMPredictor(llm=llm)
    service_context = ServiceContext.from_defaults(node_parser=node_parser,
                                                   llm_predictor=llm_predictor)

    index = VectorStoreIndex.from_documents(documents,
                                            service_context=service_context)
    query_engine = index.as_query_engine()
    response = query_engine.query(system)

    return response.response

In [113]:
response = extract_unstructured(result, system)

In [114]:
print(response)

{"Results": [{"\nBrand Name": "Elastacloud", " Date": "March 2023", " Vision": "Global industry transformation through data and next generation Al", " Mission": "To pioneer through data for the greater good", " Values": "Pioneering through data and next generation Al\nTrust and transparency\nAccountability\nCommunity and sustainability\nDiversity and inclusion", " Brand Positioning": "Elastacloud are data pioneers and industry-leading innovators who challenge the limits of possibility to accelerate transformation and deliver game-changing insights and outcomes to some of the world's leading brands. With a culture of pioneering for the greater good, and utilising cutting-edge data science and intelligence, we unleash the power and opportunity hidden within data through the design and implementation of world-class, enterprise-grade solutions.", " Brand Proposition": "Utilising pioneering techniques, data science, intelligence, and innovation, we will unleash the power and opportunity hid

## Per page approach

In [None]:
def automatic_extract_unstructured(extracted_text):
    text = extracted_text.content

    json_template = json.dumps({
    "Results" : {"field 1 name":"<field 1 content>", "field 2 name":"<field 2 content>", "field 3 name":"<field 3 content>", "field 4 name":"<field 4 content>", "field n name":"<field n content>"}
    })

    system = f"""
    Consider a text input as a collection of unstructured data.
    Your task is to identify data fields.
    Report back a list of n possible data fields with the format {json_template} 
    """

    from openai import OpenAI
    client = OpenAI()

    completion = client.chat.completions.create(
    model="gpt-3.5-turbo-16k",
    messages=[
        {"role": "system", "content": system},
        {"role": "user", "content": text}
    ]
    )

    return completion.choices[0].message.content


In [128]:
path = "../data/example/Elastacloud-Brand-Book.pdf"

# Analyze the document
with open(path, "rb") as f:
    poller = document_analysis_client.begin_analyze_document("prebuilt-document", f)
    result = poller.result()

text = result.content

json_template = json.dumps({
  "Results" : {"field 1 name":"<field 1 content>", "field 2 name":"<field 2 content>", "field 3 name":"<field 3 content>", "field 4 name":"<field 4 content>", "field n name":"<field n content>"}
})

system = f"""
Consider a text input as a collection of unstructured data.
Your task is to identify data fields.
Report back a list of n possible data fields with the format {json_template} 
"""

from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
  model="gpt-3.5-turbo-16k",
  messages=[
    {"role": "system", "content": system},
    {"role": "user", "content": text}
  ]
)

print(completion.choices[0].message)



ChatCompletionMessage(content='{"Results": {"Brand Name": "ELASTACLOUD", "Vision": "Global industry transformation through data and next generation AI", "Mission": "To pioneer through data for the greater good - utilizing cutting-edge data science and intelligence to create a better, more connected, and sustainable world", "Values": "Trust and transparency, Accountability, Community and sustainability, Diversity and inclusion", "Brand Positioning": "Elastacloud are data pioneers and industry-leading innovators who challenge the limits of possibility to accelerate transformation and deliver game-changing insights and outcomes to some of the world\'s leading brands", "Brand Proposition": "Utilizing pioneering techniques, data science, intelligence, and innovation, we will unleash the power and opportunity hidden within your data to accelerate transformation and deliver game-changing insights and outcomes to your business, fast", "Brand Strapline": "Pioneering Through Data", "Elevator Pit

In [129]:
json.loads(completion.choices[0].message.content)

{'Results': {'Brand Name': 'ELASTACLOUD',
  'Vision': 'Global industry transformation through data and next generation AI',
  'Mission': 'To pioneer through data for the greater good - utilizing cutting-edge data science and intelligence to create a better, more connected, and sustainable world',
  'Values': 'Trust and transparency, Accountability, Community and sustainability, Diversity and inclusion',
  'Brand Positioning': "Elastacloud are data pioneers and industry-leading innovators who challenge the limits of possibility to accelerate transformation and deliver game-changing insights and outcomes to some of the world's leading brands",
  'Brand Proposition': 'Utilizing pioneering techniques, data science, intelligence, and innovation, we will unleash the power and opportunity hidden within your data to accelerate transformation and deliver game-changing insights and outcomes to your business, fast',
  'Brand Strapline': 'Pioneering Through Data',
  'Elevator Pitch': 'We can unlea

In [132]:
# This example also requires an OpenAI API key
os.environ['OPENAI_API_KEY'] = data['OPENAI_API_KEY']
openai.api_key = os.environ['OPENAI_API_KEY']

from openai import OpenAI
client = OpenAI()

In [134]:
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
    {"role": "system", "content": "You are a chatbot"},
    {"role": "user", "content": "Hi, how are you?"}
]
)

In [144]:
completion.choices[0].message.content

"Hello! As a chatbot, I don't have feelings, but I'm here to help you. How can I assist you today?"

In [164]:
embeddings = client.embeddings.create(
  model="text-embedding-ada-002",
  input="Barack Hussein Obama II (/bəˈrɑːk huːˈseɪn oʊˈbɑːmə/ ⓘ bə-RAHK hoo-SAYN oh-BAH-mə;[1] born August 4, 1961) is an American politician who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, he was the first African-American president. Obama previously served as a U.S. senator representing Illinois from 2005 to 2008, as an Illinois state senator from 1997 to 2004, and as a civil rights lawyer and university lecturer.",
  encoding_format="float"
)

In [155]:
embeddings.data[0].embedding

[-0.0071006604,
 -0.017430507,
 -0.009666266,
 -0.030631134,
 -0.012618665,
 0.0031245034,
 -0.004992818,
 -0.041220024,
 -0.014563273,
 -0.021348111,
 0.019247366,
 0.050730154,
 -0.0012570759,
 0.0025904458,
 -0.03840957,
 -0.0060751275,
 0.035513945,
 -0.0047089336,
 0.002324304,
 -0.013512901,
 -0.0189209,
 0.009034623,
 0.01589753,
 -0.008644282,
 -0.014577467,
 0.0071432428,
 0.013065782,
 -0.01334257,
 0.0029523985,
 0.004843779,
 0.0039672856,
 -0.016820155,
 -0.01579817,
 -0.043036886,
 -0.027096773,
 -0.004290204,
 0.007948765,
 -0.009957247,
 0.021986851,
 -0.009055914,
 0.0049253954,
 0.00028388447,
 -0.012128964,
 0.013094171,
 -0.0038395375,
 0.006926781,
 -0.022100406,
 -0.004435695,
 0.0013803883,
 0.01393163,
 0.0025620572,
 0.008161678,
 -0.011334088,
 0.0102553265,
 -0.0050992747,
 0.0027998106,
 0.007920377,
 -0.0126825385,
 0.013214822,
 0.0023349498,
 0.015869142,
 0.004123422,
 -0.0018328291,
 0.022668175,
 0.011177951,
 -0.003442099,
 -0.0074093845,
 0.000422278

In [167]:
embeddings2 = client.embeddings.create(
  model="text-embedding-ada-002",
  input="Joseph Robinette Biden Jr. (/ˈbaɪdən/ ⓘ BY-dən; born November 20, 1942) is an American politician who is the 46th and current president of the United States. Ideologically a moderate member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.",
  encoding_format="float"
)

In [None]:
embeddings.data[0].embedding

In [168]:
import numpy as np

def cosine_similarity(embedding1, embedding2):
    embedding1 = np.array(embedding1)
    embedding2 = np.array(embedding2)
    
    dot_product = np.dot(embedding1, embedding2)
    norm_embedding1 = np.linalg.norm(embedding1)
    norm_embedding2 = np.linalg.norm(embedding2)
    similarity = dot_product / (norm_embedding1 * norm_embedding2)
    
    return similarity

# For example
embedding1 = embeddings.data[0].embedding
embedding2 = embeddings2.data[0].embedding
print(cosine_similarity(embedding1, embedding2))

0.885412531532096
