In [2]:
import os

from dotenv import load_dotenv, find_dotenv

In [3]:
load_dotenv(find_dotenv(), override=True)

True

In [4]:
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### Loading Documents

In [5]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader

        print(f'Loading {file}')

        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader

        print(f'Loading {file}')

        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader

        loader = TextLoader(file)
    else:
        print('Document format is not supported!')

        return None

    data = loader.load()

    return data  

In [6]:
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader

    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()

    return data

### Chunking Data

In [7]:
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)

    return chunks    

### Calculating Cost

In [8]:
def print_embedding_cost(texts):
    import tiktoken

    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens * 0.02:.6f}')

### Embedding and Uploading to a Vector Database (Pinecone)

In [9]:
def insert_or_fetch_embeddings(index_name, chunks):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec

    pc = pinecone.Pinecone()
        
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    # loading from existing index
    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')

        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            ) 
        )

        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        
    return vector_store
    

In [10]:
def delete_pinecone_index(index_name='all'):
    import pinecone
    pc = pinecone.Pinecone()
    
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ... ')
        for index in indexes:
            pc.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pc.delete_index(index_name)
        print('Ok')
    

### Asking and Getting Answers

In [11]:
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    
    answer = chain.invoke(q)
    return answer


### Running Code

#### Ask a PDF

In [12]:
data = load_document('files/us_constitution.pdf')

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 1137 characters in the page


In [13]:
# data = load_document('files/the_great_gatsby.docx')
# print(data[0].page_content)

In [14]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

In [15]:
chunks = chunk_data(data)

print(len(chunks))

190


In [16]:
print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 334.220000


In [17]:
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes ... 
Ok


In [18]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index askadocument and embeddings ...Ok


In [19]:
q = 'What is the Bill of Rights?'
answer = ask_and_get_answer(vector_store, q)

print(answer)

{'query': 'What is the Bill of Rights?', 'result': 'The Bill of Rights refers to the first ten amendments to the United States Constitution. It guarantees individual rights, such as freedom of speech, religion, and the right to bear arms, among others.'}


#### While Loop for Asking Questions

In [20]:
import time

i = 1
print('Write Quit or Exit to quit.')

while True:
    q = input(f'Question #{i}: ')
    i = i + 1

    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    
    answer = ask_and_get_answer(vector_store, q)

    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

Write Quit or Exit to quit.

Answer: {'query': 'Hello', 'result': 'Hello! How can I assist you today?'}

 -------------------------------------------------- 


Answer: {'query': 'What are you doing?', 'result': 'I am providing information related to the Constitution of the United States, based on the text provided. If you have any questions or need clarification, feel free to ask.'}

 -------------------------------------------------- 

Quitting ... bye bye!


#### Ask Wikipedia

In [21]:
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [22]:
data = load_from_wikipedia('Google Gemini', 'de')
chunks = chunk_data(data)

In [23]:
chunks

[Document(metadata={'title': 'Google Gemini', 'summary': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des Sommers in weiteren Ländern verfügbar wurde. Google Gemini ist in 40 Sprachen verfügbar.', 'source': 'https://de.wikipedia.org/wiki/Google_Gemini'}, page_content='Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des'),
 Document(metadata={'title': 'Google Gemini', 'summary': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkte

In [24]:
index_name = 'gemini'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index gemini and embeddings ...Ok


In [25]:
q = 'Was ist Google Gemini?'
answer = ask_and_get_answer(vector_store, q)

print(answer)

{'query': 'Was ist Google Gemini?', 'result': "I'm not sure."}


In [26]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [35]:
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0, api_key=OPENAI_API_KEY)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [36]:
crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",
    verbose=True
)

In [37]:
def ask_question(q, chain):
    return chain.invoke({'question': q})

In [38]:
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [39]:
data = load_document('files/us_constitution.pdf')
chunks = chunk_data(data)
vector_store = insert_or_fetch_embeddings('us-constitution', chunks)

Loading files/us_constitution.pdf
Creating index us-constitution and embeddings ...Ok


In [40]:
q = "What is the Bill of Rights?"

answer = ask_question(q, crc)

print(answer)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
The
United
States
Constitution
W e
the
People
of
the
United
States,
in
Order
to
form
a
more
perfect
Union,
establish
Justice,
insure
domestic
T ranquility ,
provide
for
the
common
defence,
promote
the
general
W elfare,
and
secure
the
Blessings
of
Liberty

Government
for
a
redress
of
grievances.
Second
Amendment
A
well
regulated
Militia,
being
necessary
to
the
security
of
a
free
State,
the
right
of
the
people
to
keep
and
bear
Arms,
shall
not
be
infringed.
Third
Amendment
No
Soldier
shall,
in
time
of
peace

Bill.
Section
8:
Powers
of
Congress
The
Congress
shall
have
Power
T o
lay
and
collect
T axes,
Duties,
Imposts
and
Excises,
to
pay
the
Debts
and
provide
for
the
common
De

In [41]:
q = "Multiply the number of amendments by 3."

answer = ask_question(q, crc)

print(answer)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: What is the Bill of Rights?
Assistant: The Bill of Rights refers to the first ten amendments to the United States Constitution. These amendments outline specific rights and protections for individuals, such as freedom of speech, religion, and the right to bear arms.
Follow Up Input: Multiply the number of amendments by 3.
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Convention
for
proposing
Amendments,
which,
in
either


# Using Chroma

In [48]:
def create_embeddings_chroma(chunks, persist_directory="./chroma"):
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)

    return vector_store

In [58]:
def load_embeddings_chroma(persist_directory="./chroma"):
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    return vector_store

In [54]:
data = load_document('files/us_constitution.pdf')
chunks = chunk_data(data)
vector_store = create_embeddings_chroma(chunks)

Loading files/us_constitution.pdf


In [55]:
q = "What is the Bill of Rights?"

answer = ask_and_get_answer(vector_store, q)

print(answer)

{'query': 'What is the Bill of Rights?', 'result': 'The Bill of Rights refers to the first ten amendments to the United States Constitution that outline specific individual rights and freedoms, such as freedom of speech, religion, and the right to a fair trial.'}


In [60]:
db = load_embeddings_chroma()

In [62]:
db.search("What is the Bill of Rights?", search_type='similarity', k=3)

[Document(metadata={'page': 0, 'source': 'files/us_constitution.pdf'}, page_content='The\nUnited\nStates\nConstitution\nW e\nthe\nPeople\nof\nthe\nUnited\nStates,\nin\nOrder\nto\nform\na\nmore\nperfect\nUnion,\nestablish\nJustice,\ninsure\ndomestic\nT ranquility ,\nprovide\nfor\nthe\ncommon\ndefence,\npromote\nthe\ngeneral\nW elfare,\nand\nsecure\nthe\nBlessings\nof\nLiberty'),
 Document(metadata={'page': 0, 'source': 'files/us_constitution.pdf'}, page_content='The\nUnited\nStates\nConstitution\nW e\nthe\nPeople\nof\nthe\nUnited\nStates,\nin\nOrder\nto\nform\na\nmore\nperfect\nUnion,\nestablish\nJustice,\ninsure\ndomestic\nT ranquility ,\nprovide\nfor\nthe\ncommon\ndefence,\npromote\nthe\ngeneral\nW elfare,\nand\nsecure\nthe\nBlessings\nof\nLiberty'),
 Document(metadata={'page': 7, 'source': 'files/us_constitution.pdf'}, page_content='Bill.\nSection\n8:\nPowers\nof\nCongress\nThe\nCongress\nshall\nhave\nPower\nT o\nlay\nand\ncollect\nT axes,\nDuties,\nImposts\nand\nExcises,\nto\npay\nt