<a href="https://colab.research.google.com/github/Daithi333/colab-notebooks/blob/main/query_docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Dependencies

In [None]:
!pip -q install langchain
!pip -q install bitsandbytes accelerate xformers einops
!pip -q install datasets loralib sentencepiece
!pip -q install pypdf
!pip -q install sentence_transformers

In [None]:
!pip -q install chromadb

In [None]:
!pip -q install openai
!pip -q install tiktoken

In [4]:
import os
import sys

import torch
from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from langchain import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI

## Load Documents and extract text

In [5]:
!mkdir docs

In [22]:
document = []

for file in os.listdir("docs"):
  path = f"./docs/{file}"
  if file.endswith(".pdf"):
    loader = PyPDFLoader(path)
  elif file.endswith(".docx") or file.endswith(".doc"):
    loader = Docx2txtLoader(path)
  elif file.endswith(".docx") or file.endswith(".doc"):
    loader = TextLoader(path)
  else:
    raise ValueError(f"Unrecognised extension on {file}")

  document.extend(loader.load())

In [13]:
document

In [None]:
len(document)

## Split Document into chunks

In [None]:
document_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
document_chunks = document_splitter.split_documents(document)
len(document_chunks)

In [None]:
document_chunks[0]

## Download Embeddings

In [None]:
# Cost associated with using OpenAI
openai_api_key = input(f"Enter OpenAI key:")
os.environ["OPENAI_API_KEY"] = openai_api_key

embeddings = OpenAIEmbeddings()

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
embeddings

## Set up Chroma Vector Db

In [26]:
vectordb = Chroma.from_documents(document_chunks, embedding=embeddings, persist_directory="./data")

In [27]:
vectordb.persist()

## Using HuggingFace

### Login to HuggingFace Hub to download model

In [None]:
notebook_login()

### Download Llama 2 7B LLM

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=True,
    )

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=True,
    torch_dtype=torch.float16,
    # load_in_8bit=True,
    load_in_4bit=True,
)

Create HuggingFace Pipeline

In [27]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    max_new_tokens=512,
    min_new_tokens=1,
    top_k=30
)

In [45]:
llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature': 0})

## Using OpenAI

In [15]:
llm = ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo')

In [None]:
llm

## Create memory object to store conversation history

In [28]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

## Create Conversation retrieval QA chain

The ConversationalRetrievalQA chain builds on RetrievalQAChain to provide a chat history component.

In [29]:
docs_qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": 6}),
    verbose=False,
    memory=memory
)

In [30]:
result = docs_qa({"question": "Summarise briefly the process for taking an AWS Associate exam"})

In [33]:
result["answer"]

'The provided context does not mention any specific pre-requisites for taking the AWS Certified Solutions Architect - Associate (SAA-C03) exam. However, it is mentioned that the target candidate should have at least 1 year of hands-on experience designing cloud solutions that use AWS services. It is always recommended to review the official AWS certification website for the most up-to-date information on any pre-requisites or requirements for the exam.'

In [32]:
print("---------------------------------")
print("Welcome to the Document Query bot")
print("---------------------------------")

while True:
  query = input(f"Prompt:")
  if query in ["exit", "quit" "q", "f"]:
    print("Exiting")
    sys.exit()
  if query == "":
    continue
  result = docs_qa({"question": query})
  print(f"Answer: {result['answer']}")

---------------------------------
Welcome to the Document Query bot
---------------------------------
Prompt:How much experience is required to take the AWS Solutions Architect exam?
Answer: The required level of experience to take the AWS Solutions Architect - Associate (SAA-C03) exam is at least 1 year of hands-on experience designing cloud solutions that use AWS services.
Prompt:What other pre-requisites are there?
Answer: The provided context does not mention any specific pre-requisites for taking the AWS Certified Solutions Architect - Associate (SAA-C03) exam. However, it is mentioned that the target candidate should have at least 1 year of hands-on experience designing cloud solutions that use AWS services. It is always recommended to review the official AWS certification website for the most up-to-date information on any pre-requisites or requirements for the exam.
Prompt:exit
Exiting


SystemExit: ignored