In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [2]:
pip install pypdf -q

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install docx2txt -q

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
pip install wikipedia -q

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
def load_document(file):
    
    import os
    name, extensions = os.path.splitext(file)
    
    if extensions == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading{file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading{file}')
        loader = Docx2txtLoader(file)
    else:
        print("Document not valid, please introduce only PDF'S or Docx formats")
        return None
    
    data = loader.load()
    return data

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data

In [6]:
def chunk_data(data,chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks

In [7]:
def print_embedding_cost(text):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in text])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/ 1000*0.004:.6f}')

### Embedding and Uploading to a Vector Database(Pinecone)

In [23]:
def insert_or_fetch_embeddings(index_name):
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings
    
    embeddings = OpenAIEmbeddings()
    
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    
    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        
    return vector_store

In [9]:
def delete_pinecone_index(index_name='all'):#the pinecone free only supports 1 index so migth be necessery to delete it frecuently
    import pinecone
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    
    if index_name == 'all':
        indexes = pinecone.list_indexes()
        print(f'Deleting all indexes...')
        for index in indexes:
            pinecone.delete_index(index)
        print('ok')
    else:
        print(f'Deleting index {index_name} ...',end='')
        pinecnone.delete_index(index_name)
        print('ok')

In [31]:
def ask_and_get_answer(vector_store, q):
    
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    
    answer = chain.run(q)
    return answer

def ask_with_memory(vector_store, question, chat_history=[]):
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI
    
    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})
    
    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))
    
    return result, chat_history
    

### Running code

In [11]:
data = load_document('document.pdf')
print(data[1].page_content)
print(data[3].metadata)
print(f'you have {len(data)} in your data')

Loadingdocument.pdf
.ࠂReinforcement Learning
Introduction
Imagine you are lost in a labyrinth and have to ˪nd your way out. As you are there for
the ˪rst time, you do not know which way to choose to reach the door to leave. More-
over, there are dangerous ˪elds  on the labyrinth and you should avoid stepping on
them.
You will have four actions you can perform: move up, down, left, or right. As you do not
know the labyrinth, the only way to ˪nd your way out is to see what happens when you
perform random actions. Within the learning process, you will ˪nd out that there are
˪elds  on the labyrinth that will reward you by letting you escape the labyrinth. How-
ever, there are also ˪elds  where you will receive a negative reward as they are danger-
ous to step on. After some time, you will manage to ˪nd your way out without stepping
on the dangerous ˪elds  from the experience you have made walking around. This proc-
ess of learning by reward is called reinforcement learning.
In this unit, y

In [12]:
data_2 = load_from_wikipedia('GPT-4')
print(data_2[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was initially released on March 14, 2023, and has been made publicly available via the paid chatbot product ChatGPT Plus, and via OpenAI's API.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4 is also capable of taking images as input, though this feature has not been made available since launch. OpenAI has declined to reveal various technical details 

In [13]:
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[11].page_content)

71


In [14]:
print_embedding_cost(chunks)

Total Tokens: 3628
Embedding Cost in USD: 0.014512


In [15]:
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes...
ok


In [16]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index askadocument and embeddings ...ok


In [17]:
q = 'Make a 3 lines resume about the document'
answer = ask_and_get_answer(vector_store, q)
print(answer)

This document introduces the principles of reinforcement learning and explains Markov decision processes. It also covers the Q-learning algorithm. Completing this unit will enable understanding and application of these concepts in the field of reinforcement learning.


In [36]:
import time
i = 1
print('Write Quit or Exit to quit')

while True:
    q=input(f'Question #{i} ')
    i = i+1
    if q.lower() in ['quit', 'exit']:
        print('Quitting....bye bye!')
        time.sleep(2)
        break
    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}') #This and the line after is just for readibility
    print(f'\n {"-" * 50}\n')

Write Quit or Exit to quit
Question #1 Make a summary of this document

Answer: I apologize, but I don't have access to the document you're referring to. Could you please provide the document's content or give me more information about it?

 --------------------------------------------------

Question #2 quit
Quitting....bye bye!


In [19]:
delete_pinecone_index()

Deleting all indexes...
ok


In [26]:
data = load_from_wikipedia('ChatGPT', 'es')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name)

Index chatgpt already exists. Loading embeddings ... Ok


In [28]:
q = 'Que es chat-gpt3'
answer = ask_and_get_answer(vector_store, q)
print(answer)

Chat GPT-3 es un modelo de lenguaje de inteligencia artificial desarrollado por OpenAI. GPT-3 significa "Generative Pre-trained Transformer 3", y se refiere a la tercera generación de modelos GPT. Es conocido por su capacidad para generar texto coherente y natural en respuesta a preguntas o instrucciones dadas por los usuarios. GPT-3 ha demostrado ser muy poderoso y versátil en una amplia gama de aplicaciones, desde asistencia en la escritura hasta generación de código.


In [32]:
# asking with memory
chat_history = []
question = 'Wat is the markov property?'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

The Markov property is a key characteristic of Markov processes. It states that the future state of a process is independent of its past history, given its present state. In other words, the probability distribution of future states depends only on the current state and not on how the system arrived at that state.
[('Wat is the markov property?', 'The Markov property is a key characteristic of Markov processes. It states that the future state of a process is independent of its past history, given its present state. In other words, the probability distribution of future states depends only on the current state and not on how the system arrived at that state.')]


In [35]:
question = 'Make it shorter'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

The Markov property states that the future state of a system depends only on its current state, and not on the past states or how the system arrived at that state.
[('Wat is the markov property?', 'The Markov property is a key characteristic of Markov processes. It states that the future state of a process is independent of its past history, given its present state. In other words, the probability distribution of future states depends only on the current state and not on how the system arrived at that state.'), ('Make it shorter', 'The Markov property states that the future state of a system depends only on its current state, and not on the past states or how the system arrived at that state.')]
