### 1. Set up environment

####Install packages

In [1]:
# Install the notebook's dependencies.
# %pip (instead of !pip) runs pip for the interpreter backing this kernel,
# and -q keeps the installation log short.
# TODO: pin the remaining packages so a fresh run resolves the same versions.
%pip install -q orjson==3.9.14
%pip install -q pdfplumber==0.9.0
%pip install -q unidecode
%pip install -q groq
%pip install -q ollama
%pip install -q pinecone-client
%pip install -q langchain
%pip install -q langchain_community
%pip install -q langchain_pinecone
%pip install -q langchain_groq
%pip install -q python-dotenv
%pip install -q pymupdf
%pip install -q colab-xterm

Collecting orjson==3.9.14
  Downloading orjson-3.9.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading orjson-3.9.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/139.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.0/139.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: orjson
Successfully installed orjson-3.9.14
Collecting pdfplumber==0.9.0
  Downloading pdfplumber-0.9.0-py3-none-any.whl.metadata (35 kB)
Collecting pdfminer.six==20221105 (from pdfplumber==0.9.0)
  Downloading pdfminer.six-20221105-py3-none-any.whl.metadata (4.0 k

####Connect servers


In [17]:
# Setting up ollama server
# Load the colab-xterm extension installed in the previous cell.
%load_ext colabxterm
# Download the official Ollama install script and run it with sh.
!curl -fsSL https://ollama.com/install.sh | sh
# Start the Ollama server in the background; stdout and stderr go to ollama.log.
!ollama serve > ollama.log 2>&1 &
# Pull the embedding model that the OllamaEmbeddings calls in later cells refer to.
# NOTE(review): this presumably needs the server started above to be ready already;
# check ollama.log if the pull fails right after start-up.
!ollama pull nomic-embed-text

The colabxterm extension is already loaded. To reload it, use:
  %reload_ext colabxterm
>>> Downloading ollama...
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% ▕▏ 274 MB                         
pulling c71d239df917... 100% ▕▏  11 KB                         
pulling ce4a164fc046... 100% ▕▏   17 B                         
pulling 31df23ea7daa... 100% ▕▏  420 B                         
verifying sha256 digest 
writing manifest ⠋ [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G

###2.Import required tools

####Warnings

In [3]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported below; narrow the filter while debugging.
warnings.filterwarnings("ignore")

####Tools

In [4]:
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from unidecode import unidecode
from langchain.schema import SystemMessage
from pinecone import Pinecone
from pinecone import Vector
from tqdm import tqdm
import pdfplumber
import os

###3.Set up API Keys

In [5]:
# Register the API keys required by Groq and Pinecone in the process environment.
# NOTE: replace the placeholders locally and never commit real keys with the notebook.
os.environ.update({
    "GROQ_API_KEY": "<Insert you groq api key here>",
    "PINECONE_API_KEY": "<Insert you pinecone api key here>",
})

# Expose the keys as module-level constants for the cells below
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

###4.Create database

####Utilities

In [6]:
# Function to load pdf documents
def load_document(docs_dir, doc_name):
  """Extract the text of a PDF file as a single string.

  Args:
    docs_dir: Directory that contains the document.
    doc_name: File name of the PDF inside ``docs_dir``.

  Returns:
    The text of all pages joined with newlines. A page whose extraction
    yields nothing contributes an empty string.
  """
  # create a document path
  doc_path = os.path.join(docs_dir, doc_name)
  # load pdf file and extract text
  with pdfplumber.open(doc_path) as pdf:
    # extract_text() can yield None for a page without a text layer in some
    # pdfplumber releases; coerce to "" so the join below cannot raise TypeError
    pages_text = [page.extract_text() or "" for page in pdf.pages]
  # join all pages' text into a single string
  return "\n".join(pages_text)


# Function to chunk a document
def split_document(text, chunk_size=500, chunk_overlap=100):
  """Split a text into overlapping chunks.

  Args:
    text: Text to split.
    chunk_size: Value forwarded to the splitter's ``chunk_size`` (default 500).
    chunk_overlap: Value forwarded to the splitter's ``chunk_overlap`` (default 100).

  Returns:
    The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
  """
  # define a splitter strategy
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  )
  # split the text document into chunks
  return text_splitter.split_text(text)


# Function to embed the chunks of a pre-chunked document
def embed_document(chunks):
  """Embed every chunk with the 'nomic-embed-text' Ollama model.

  Args:
    chunks: Text chunks to embed.

  Returns:
    Whatever ``OllamaEmbeddings.embed_documents`` returns for ``chunks``.
  """
  return OllamaEmbeddings(model='nomic-embed-text').embed_documents(chunks)


# Function to create pinecone vectors
def create_vectors(doc_name, chunks, embeddings):
  """Build one ``Vector`` per chunk of a document.

  Args:
    doc_name: Document file name; its ``unidecode`` form is used in the
      vector id and stored as the ``source`` metadata.
    chunks: Text chunks of the document.
    embeddings: Embeddings indexed in parallel with ``chunks``.

  Returns:
    A list of ``Vector`` instances with ids of the form ``"<position>-<name>"``.
  """
  def build(position, chunk_text):
    # transliterate the document name for the id and the metadata
    ascii_name = unidecode(doc_name)
    return Vector(
        id=f"{position}-{ascii_name}",
        values=embeddings[position],
        metadata={"source": ascii_name, "text": chunk_text},
    )

  return [build(position, chunk_text) for position, chunk_text in enumerate(chunks)]


# Function to upsert a document's vectors into a pinecone DB
def upsert_document(doc_name, vectors, batch_size=100, index_name='test'):
  """Upsert the vectors of one document into a Pinecone index in batches.

  Args:
    doc_name: Document name, used only in progress and status messages.
    vectors: Sequence of vectors to upsert.
    batch_size: Number of vectors sent per upsert request (default 100).
    index_name: Name of an already existing Pinecone index (default 'test').
  """
  # connect to a pinecone DB
  # the keyword is `api_key`; the previous `apikey` spelling was not a
  # recognised argument, so the key was only picked up from the environment
  pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
  # get a pre-existing pinecone index
  pinecone_index = pinecone_client.Index(name=index_name)
  # define the total number of vectors
  total_vectors = len(vectors)
  # upsert all vectors, one slice of `batch_size` at a time
  for start in tqdm(range(0, total_vectors, batch_size), desc=f"Upserting {doc_name}"):
    pinecone_index.upsert(
        vectors=vectors[start:start + batch_size],
        show_progress=False,
    )
  print(f'Document {doc_name} was succesfully loaded!')

####Database

In [7]:
# Function to process documents and upload them into a DB
def create_database(docs_dir):
  """Load, chunk, embed and upsert every PDF found in a directory.

  Args:
    docs_dir: Directory that holds the PDF documents.

  A missing directory, a directory without PDF files, or a PDF without any
  extractable text is reported with a message instead of raising or being
  reported as a successful load.
  """
  # header
  print(" 🗃️Loading documents...")
  print('')
  # a missing directory would make os.listdir raise and end the app loop
  if not os.path.isdir(docs_dir):
    print(f"Documents directory not found: {docs_dir}")
    print('')
    return
  # get the list of pdf documents (sorted so runs are deterministic)
  docs_names = sorted(name for name in os.listdir(docs_dir) if name.lower().endswith('.pdf'))
  if not docs_names:
    print(f"No PDF documents found in: {docs_dir}")
    print('')
    return
  # process all documents
  for name in docs_names:
    # load document
    text = load_document(docs_dir=docs_dir, doc_name=name)
    # create chunks
    chunks = split_document(text=text)
    # nothing to embed or upsert for a document without text
    if not chunks:
      print(f"Document {name} has no extractable text, skipping it.")
      continue
    # convert chunks into embeddings
    embeddings = embed_document(chunks=chunks)
    # create vector instances
    vectors = create_vectors(doc_name=name, chunks=chunks, embeddings=embeddings)
    # upload the document into a pinecone DB
    upsert_document(doc_name=name, vectors=vectors)
  # footer
  print('')
  print("Your documents were succesfully loaded!")
  print('')


###5.Create chatbot

####Utils


In [8]:
# Function to load a LLM
def load_llm():
  """Create the Groq chat model used by the assistant.

  Returns:
    A ``ChatGroq`` instance configured with the module-level ``GROQ_API_KEY``,
    model 'llama-3.1-8b-instant', temperature 0.25 and max_tokens 1024.
  """
  settings = {
      "groq_api_key": GROQ_API_KEY,
      "model_name": 'llama-3.1-8b-instant',
      "temperature": 0.25,
      "max_tokens": 1024,
  }
  return ChatGroq(**settings)


# Function to create a vectorstore from a pinecone DB
def create_vectorstore(index_name='test'):
  """Wrap an existing Pinecone index in a LangChain vectorstore.

  Args:
    index_name: Name of an already existing Pinecone index (default 'test').

  Returns:
    A ``LangchainPinecone`` vectorstore that reads document text from the
    "text" metadata field and embeds queries with 'nomic-embed-text'.
  """
  # connect to a pinecone DB
  # the keyword is `api_key`; the previous `apikey` spelling was not a
  # recognised argument, so the key was only picked up from the environment
  pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
  # get a pre-existing pinecone index
  pinecone_index = pinecone_client.Index(name=index_name)
  # define an embedding model
  embedding_model = OllamaEmbeddings(model='nomic-embed-text')
  # create a vectorstore; hand over the Embeddings object itself rather than
  # its bound embed_query method, which the wrapper treats as deprecated usage
  vectorstore = LangchainPinecone(pinecone_index, embedding_model, "text")
  return vectorstore


# Function to create a memory instance
def create_memory():
  """Create the conversation buffer shared by the prompt and the QA chain.

  Returns:
    A ``ConversationBufferMemory`` keyed on "chat_history" that stores the
    chain's "answer" output and returns message objects.
  """
  memory_options = {
      "memory_key": "chat_history",
      "output_key": 'answer',
      "return_messages": True,
  }
  return ConversationBufferMemory(**memory_options)


# Function to create a refined prompt
def create_prompt(memory):
  """Build the chat prompt for the QA chain and seed the memory.

  Args:
    memory: Memory object; the system instruction is appended to its
      ``chat_memory`` as a ``SystemMessage``.

  Returns:
    A ``ChatPromptTemplate`` made of a system message and a human message
    that uses the ``context``, ``chat_history`` and ``question`` variables.
  """
  # persona and language instruction shared by the memory and the system prompt
  system_instruction = "Your name is Boto and you are my personal AI assistant, you were created to help in anything. You always respond in English."
  # record the instruction as the first message of the conversation memory
  memory.chat_memory.add_message(SystemMessage(content=system_instruction))
  # system prompt: the instruction followed by the answering guideline
  system_template = (
    system_instruction
    + "\n\n"
    + "Based on the following retrieved information, chat history, and user query, provide comprehensive and accurate responses."
  )
  # human prompt: one labelled line per input, then the answer cue
  human_template = "\n".join([
    "retrieved information: {context}",
    "chat history: {chat_history}",
    "user query: {question}",
    "Detailed answer:",
  ])
  # assemble the conversation chain prompt from both message templates
  prompt_messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(human_template),
  ]
  return ChatPromptTemplate.from_messages(prompt_messages)


def create_qa_chain(llm, vectorstore):
  """Create the conversational retrieval chain used by the chat loop.

  Args:
    llm: Chat model that generates the answers.
    vectorstore: Vectorstore whose ``as_retriever()`` provides the retriever.

  Returns:
    The chain built by ``ConversationalRetrievalChain.from_llm``; it is asked
    to return source documents and to expose its output under "answer".
  """
  # memory first: create_prompt appends the system message to it
  chat_memory = create_memory()
  prompt = create_prompt(memory=chat_memory)
  retriever = vectorstore.as_retriever()
  chain_options = {
      "llm": llm,
      "return_source_documents": True,
      "retriever": retriever,
      "memory": chat_memory,
      "verbose": False,
      "chain_type": 'stuff',
      "output_key": 'answer',
      "combine_docs_chain_kwargs": {"prompt": prompt},
  }
  return ConversationalRetrievalChain.from_llm(**chain_options)


# display chat header
def header():
  """Print the chat greeting surrounded by blank lines."""
  banner_lines = ('', 'Hey! (Type "exit" to finish this conversation)', '')
  print(*banner_lines, sep='\n')


# user query
def user_query():
  """Prompt the user for the next message and return the typed text."""
  return input("😎You: ")


# exit message
def exit_query(query):
  """Tell whether the user asked to leave the chat.

  Args:
    query: Text typed by the user.

  Returns:
    True (after printing a goodbye message) when the text is "exit",
    ignoring letter case and surrounding whitespace; False otherwise.
  """
  if query.strip().lower() == "exit":
    print("See you soon!")
    print('')
    return True
  # explicit False instead of falling through and returning None
  return False


# Function to handle queries
def handle_query(query, qa_chain):
  """Run a user query through the QA chain.

  Args:
    query: Question typed by the user.
    qa_chain: Chain whose ``invoke`` result contains the 'answer' and
      'source_documents' keys.

  Returns:
    A tuple ``(answer, src_documents)`` where ``src_documents`` is the set of
    source file names without their extension.
  """
  # get query result
  result = qa_chain.invoke({"question": query})
  # extract assistant answer
  answer = result['answer']
  # extract unique source document names; splitext drops only the final
  # extension, so names that contain dots (e.g. "report.v2.pdf") stay intact
  src_documents = {
      os.path.splitext(doc.metadata['source'])[0]
      for doc in result['source_documents']
  }
  return answer, src_documents


# display assistant answer
def assistant_answer(answer, src_documents):
  """Print the assistant's answer followed by its reference documents.

  Args:
    answer: Text generated by the assistant.
    src_documents: Iterable of source document names, printed one per line.
  """
  # answer first, then a blank line and the references title
  print("🤖 Assistant:", answer)
  for heading in ('', 'Reference documents:'):
    print(heading)
  # one source document per line
  for source_name in src_documents:
    print(source_name)



####chat

In [9]:
def chat():
    """Run the interactive chat loop until the user asks to exit.

    Builds the LLM, the vectorstore and the QA chain, prints the header and
    then answers each user query, showing its reference documents.
    """
    # build the chain from a freshly loaded LLM and vectorstore
    qa_chain = create_qa_chain(llm=load_llm(), vectorstore=create_vectorstore())
    # display chat header
    header()

    def pending_queries():
        # yield user queries until exit_query reports an exit request
        while True:
            text = user_query()
            if exit_query(text):
                return
            yield text

    # conversation logic
    for query in pending_queries():
        # get chat answer and its sources
        reply, sources = handle_query(query=query, qa_chain=qa_chain)
        # display bot answer
        assistant_answer(answer=reply, src_documents=sources)
        print('')

###6.App

####Utils

In [10]:
# Function to give a welcome message
def welcome():
  """Print the app's welcome message."""
  greeting = "Welcome! my name is Boto, I am your virtual assistant🤖"
  print(greeting)


# Function to display an app menu
def options():
  """Show the app menu and return the option chosen by the user.

  Returns:
    1, 2 or 3. The user is asked again until one of those numbers is typed.
  """
  # display the numbered menu entries
  menu = ('Load documents', 'Chat with Boto', 'Exit')
  for position in range(len(menu)):
    print(f'{position + 1}. {menu[position]}')
  # shown for non-numeric input and for numbers outside the menu
  hint = "Type (1) or (2) to select and option and (3) to exit"
  # keep asking until a valid option is typed
  while True:
    raw_choice = input('What would you like to do?')
    try:
      choice = int(raw_choice)
    except ValueError:
      print(hint)
      continue
    if choice in (1, 2, 3):
      return choice
    print(hint)



####app

In [11]:
def app(docs_dir='/content/documents/'):
    """Run the menu-driven Boto application until the user chooses to exit.

    Args:
        docs_dir: Directory where the PDF documents are stored; it is passed
            to ``create_database`` when the user selects option 1. Defaults
            to '/content/documents/'.
    """
    # welcome message
    welcome()
    # app menu manager logic
    while True:
        # display options menu
        option = options()
        if option == 1:
            # create pinecone db from the documents directory
            create_database(docs_dir=docs_dir)
        elif option == 2:
            # chat with boto
            chat()
        else:
            # exit
            break

###7.Run app

In [19]:
# run Boto-AI assistant app
# Interactive: shows the menu in a loop until the "Exit" option (3) is chosen.
app()

Welcome! my name is Boto, I am your virtual assistant🤖
1. Load documents
2. Chat with Boto
3. Exit
What would you like to do?1
 🗃️Loading documents...



Upserting cv_RigobertoRinconBallesteros.pdf: 100%|██████████| 1/1 [00:00<00:00,  7.78it/s]


Document cv_RigobertoRinconBallesteros.pdf was succesfully loaded!

Your documents were succesfully loaded!

1. Load documents
2. Chat with Boto
3. Exit
What would you like to do?2

Hey! (Type "exit" to finish this conversation)

😎You: Hello, how are you today?
🤖 Assistant: Hello! I'm Boto, your personal AI assistant. I'm functioning within normal parameters and ready to assist you with any questions or tasks you may have. It's great to start our conversation today! How can I help you?

Reference documents:
cv_RigobertoRinconBallesteros

😎You: do you know Rigoberto Rincón?
🤖 Assistant: I have access to information about Rigoberto Rincón Ballesteros. According to the retrieved information, Rigoberto Rincón is a biochemical engineer with a strong background in quality control laboratories and biotechnological research. He has experience working in multidisciplinary teams and has skills in programming languages such as Python, Matlab, and SQL basics, as well as version control using Git a