# sageBot Pipeline
### This notebook is the pipeline for team sageBot, built for participation in the Tunga AI Hackathon

In [None]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Locate the local .env file and load it before the API key is read below.
_ = load_dotenv(find_dotenv()) # read local .env file
# Indexing os.environ raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
# Model name handed to ChatOpenAI when the chat model is created later in the notebook.
# NOTE(review): this looks like a dated snapshot identifier — confirm it is still served.
llm_model = "gpt-3.5-turbo-0301" # llm model

### Document to perform Q&A 
### The AWS Lambda function documentation is used for this pipeline

In [None]:
# Download the document over a URL (skipped when a local copy already exists)
import os
from urllib.request import urlretrieve

url = "https://primarywater.blob.core.windows.net/tunga/lambda-dg-1-500.pdf"
filename = "document.pdf"

if not os.path.exists(filename):
    # Fetch to a temporary name first so an interrupted download never leaves
    # a truncated file that a later run would mistake for the finished PDF.
    partial = filename + ".part"
    urlretrieve(url, partial)
    os.replace(partial, filename)

In [None]:
# Read the downloaded PDF into LangChain documents
from langchain.document_loaders import PyPDFLoader

file = "document.pdf"
docs = PyPDFLoader(file).load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for chunks of size 1500 with an overlap of 150 between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Break the loaded documents into chunks ready for embedding.
splits = text_splitter.split_documents(docs)


In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding object created with default arguments; it is passed to Chroma
# when the vector store is built further down.
embedding = OpenAIEmbeddings()

In [None]:
# path to database/vectorstores
import shutil

persist_directory = 'chroma/'

# Remove old database files, if any, so the store is rebuilt from scratch.
# shutil.rmtree replaces the `!rm -rf` shell escape: it works on platforms
# without `rm`, follows persist_directory instead of a second hard-coded path,
# and ignore_errors=True keeps it a no-op when the directory does not exist.
shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
from langchain.vectorstores import Chroma

# Embed every chunk and store the vectors in a Chroma database under persist_directory.
vectordb = Chroma.from_documents(
    splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

# Write the embeddings to disk.
vectordb.persist()

In [None]:
# Re-open the persisted document embeddings from the on-disk vector store
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embedding = OpenAIEmbeddings()
persist_directory = 'chroma/'
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Build prompt
from langchain.prompts import PromptTemplate

# {context} and {question} are placeholders filled in when the template is formatted.
# The wording keeps the original intent (best-effort answer plus a disclaimer) but
# fixes the garbled instruction and closes the quotation around the disclaimer, so
# "at the end" is an instruction rather than part of the sentence to repeat.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, try to give an answer that is as accurate as possible. If there is code, enclose it in triple backticks. Always say "I am an experiment, my answers may be inaccurate" at the end.
{context}
Question: {question}
Helpful Answer:"""

print(template)

In [None]:
# Prompt template object exposing the two variables used in the template text.
QA_CHAIN_PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model used by the Q&A chain; temperature is set to 0.
llm = ChatOpenAI(
    temperature=0,
    model_name=llm_model,
)

In [None]:
# Conversation memory; the history is stored under the "chat_history" key
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

In [None]:
# create a retrieval chain for Q&A

from langchain.chains import ConversationalRetrievalChain

retriever = vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory,
    # Hand the custom prompt to the document-combining step. Without this the
    # chain falls back to LangChain's default QA prompt and QA_CHAIN_PROMPT,
    # although defined earlier in the notebook, is never used.
    combine_docs_chain_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [None]:
# Question 1
# Markdown is not imported anywhere else in the notebook, so bring it (and
# display) into scope here; otherwise this cell raises NameError on a fresh kernel.
from IPython.display import Markdown, display

question = "Title of the context given to you"
result = qa({"question": question})

display(Markdown(result["answer"]))

In [None]:
# Question 2
# Imported locally so this cell renders its answer without relying on another cell.
from IPython.display import Markdown, display

question = "Give a brief summary of the context"
result = qa({"question": question})

# Show the answer like the other question cells do; previously it was computed but never displayed.
display(Markdown(result["answer"]))

In [None]:
# Question 3
# Markdown is not imported anywhere else in the notebook, so bring it (and
# display) into scope here; otherwise this cell raises NameError on a fresh kernel.
from IPython.display import Markdown, display

question = "Using the context, summarise lambda in simple terms please."
result = qa({"question": question})

display(Markdown(result["answer"]))