In [1]:
import openai

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(),override=True)

os.environ.get('PINECONE_API_KEY')

'79f63113-35b5-4c67-9cce-54c95a104938'

In [3]:
def load_document(file):
     from langchain.document_loaders import PyPDFLoader
     print(f'Loading{file}')
     loader= PyPDFLoader(file)
     data=loader.load()
     return data

In [4]:
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter= RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks= text_splitter.split_documents(data)
    return chunks

In [5]:
def print_embedding_cost(texts):
    import tiktoken
    enc= tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens= sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000*.004:.6f}')

In [6]:
def insert_or_fetch_embeddings(index_name):
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings
    
    embeddings = OpenAIEmbeddings()
    
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    
    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        
    return vector_store


In [7]:
def delete_pinecone_index(index_name='all'):
    import pinecone
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    
    if index_name == 'all':
        indexes = pinecone.list_indexes()
        print('Deleting all indexes ... ')
        for index in indexes:
            pinecone.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('Ok')

In [8]:
def ask_and_get_answer(vector_store, q):
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    
    answer = chain.run(q)
    return answer
    
    
def ask_with_memory(vector_store, question, chat_history=[]):
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI
    
    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})
    
    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))
    
    return result, chat_history
    

Running Code


In [9]:
data= load_document(r'F:\01. AI\NLP\LLM\2023 End-to-End Handwritten Paragraph Text Recognition.pdf')
print(data[1].page_content)

LoadingF:\01. AI\NLP\LLM\2023 End-to-End Handwritten Paragraph Text Recognition.pdf
also highlight that the reading order is deﬁned by hand, based
on the coordinates of the text regions; this could lead to some
errors in the case of rather slanted lines.
This paper aims at providing a model freed from all of
these constraints. We suggest using a segmentation-free
model that processes whole handwritten paragraphs using
an attention process. The use of a line-segmentation-free
approach has already been proposed for other tasks such as
probabilistic keyword spotting and indexing for example
[2]. However, in [2], the segmentation is superseded by a
ﬁxed sliding window process over the vertical axis. On the
contrary, in the model we propose, character recognition
and implicit line segmentation are learned in an end-to-end
fashion, so as to optimize both processes altogether.
Most of the contributions of the literature have success-
fully used neural networks for line segmentation and text
l

In [10]:
print(len(data))

17


In [11]:

chunks= chunk_data(data)
print(len(chunks))
print(chunks[12])

422
page_content='over, they have rarely been studied and optimized together\nin one single trainable system.\nHistorically, early works have applied segmentation at\ncharacter level and each character was then classiﬁed. Later' metadata={'source': 'F:\\01. AI\\NLP\\LLM\\2023 End-to-End Handwritten Paragraph Text Recognition.pdf', 'page': 0}


In [12]:
print_embedding_cost(chunks)

Total Tokens: 24379
Embedding Cost in USD: 0.097516


In [13]:
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes ... 
Ok


In [14]:
index_name= 'brailledoc'
vector_store= insert_or_fetch_embeddings(index_name)

Creating index brailledoc and embeddings ...Ok


In [19]:
q = 'summarize the pdf'
answer = ask_and_get_answer(vector_store, q)
print(answer)

The PDF discusses the process of recognizing text in scanned images and highlights the challenge of determining the reading order for slanted lines. The paper aims to provide a model that eliminates such errors. For more information, the reader is directed to the Digital Library at www.computer.org/csdl.
