### Imports


In [1]:
import os
import openai
import tiktoken
import wandb
from pprint import pprint
from getpass import getpass
from wandb.integration.openai import autolog

from pathlib import Path
from pprint import pprint

from rich.markdown import Markdown
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential, # for exponential backoff
)

from dotenv import load_dotenv

In [2]:
from dotenv import load_dotenv
# load in API key from .env file
load_dotenv()

openai.api_key  = os.environ.get('OPENAI_API_KEY')

if not openai.api_key :
    raise ValueError("API key not found. Ensure your .env file is correctly set up.")

In [4]:
# we need a single line of code to start tracing langchain with W&B
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"

# wandb documentation to configure wandb using env variables
# https://docs.wandb.ai/guides/track/advanced/environment-variables
# here we are configuring the wandb project name
os.environ["WANDB_PROJECT"] = "ai_assurance_app"

In [5]:
assert os.getenv("WANDB_PROJECT", "") == "ai_assurance_app", "This doesn't look like a valid W&B project"

In [6]:
MODEL_NAME = "text-davinci-003"

In [None]:
# # load in pdf document using langchain
# import langchain
# lc = langchain()
# lc.add_pdf("Data/guide-to-se-and-p3m-processes.pdf")

In [7]:
# load in html data
from langchain.document_loaders import UnstructuredHTMLLoader

In [None]:
# loader = UnstructuredHTMLLoader("../data/challenges.html")
# loader.load()

In [8]:
# looks like this strips out the html tags, but needs checking
from langchain.document_loaders import DirectoryLoader

def load_docs_from_directory(directory:str, extension:str=".html")->list:
    """
    Loads all documents from a directory with a given extension using langchain
    :param directory:str file path to directory containing documents
    :param extension:str file extension of documents to load
    :return: a list of documents loaded using the loader function
    """
    loader = DirectoryLoader(directory, f"**/*.{extension}")
    return loader.load()

In [9]:
documents = load_docs_from_directory("../data", "html")

In [10]:
# We will need to count tokens in the documents, and for that we need the tokenizer
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)

In [11]:
def count_tokens(documents):
    token_counts = [len(tokenizer.encode(document.page_content)) for document in documents]
    return token_counts

count_tokens(documents)

[985,
 1603,
 1702,
 789,
 666,
 889,
 486,
 1379,
 1320,
 3926,
 1989,
 440,
 1060,
 1736,
 285,
 553,
 789,
 3655,
 1231,
 715,
 1078]

In [12]:
len(documents)

21

In [None]:
## we need to split down the html - determine whether this is possible, or whether converting to text is better (or both)

In [None]:
# from langchain.text_splitter import CharacterTextSplitter
# text_splitter = CharacterTextSplitter(
#     separator = "\n\n",
#     chunk_size = 1000,
#     chunk_overlap  = 200,
#     length_function = len,
#     #is_separator_regex = False,
# )
#
# texts = text_splitter.create_documents(documents)
# print(documents[0])

In [13]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# We will use the OpenAIEmbeddings to embed the text, and Chroma to store the vectors
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings)

In [None]:
# from langchain.document_loaders import PyPDFLoader
# loader = PyPDFLoader("../data/guide-to-se-and-p3m-processes.pdf")
# pages = loader.load_and_split()

In [None]:
# We will need to count tokens in the documents, and for that we need the tokenizer
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)

In [None]:
# tokenizer

In [None]:
# function to count the number of tokens in each document
def count_tokens(documents):
    token_counts = [len(tokenizer.encode(document.page_content)) for document in documents]
    return token_counts

In [None]:
count_tokens(pages)

### Notes
* what is the maximum number of tokens that can be inputted to the model?
* what is going on here (above)? how did we split the pdf into pages? how will this then be passed to the model?
* how do you inspect your output when doc splitting? (add to design pattern as an unknown)
* interested to know how `pages = loader.load_and_split()` works

In [None]:
# pages[0].page_content


In [None]:
# Markdown(pages[1].page_content)

### Embeddings
* this code use embeddings with a vector database retriever to find relevant documents for a query.
* why vector dbs? why not just numpy?

In [None]:
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.vectorstores import Chroma
#
# # We will use the OpenAIEmbeddings to embed the text, and Chroma to store the vectors
# embeddings = OpenAIEmbeddings()
# db = Chroma.from_documents(pages, embeddings)

### Notes
* look up what is going on with retrievers and db stores (add to design pattern as an unknown)
* here is a question: what is stored in vector dbs: just the embeddings? or the tokens too?

We can create a retriever from the db now, we can pass the `k` param to get the most relevant sections from the similarity search

In [17]:
retriever = db.as_retriever(search_kwargs=dict(k=3))

In [None]:
# retriever

In [18]:
query = "What are the principles of AI Assurance?"
docs = retriever.get_relevant_documents(query)

[34m[1mwandb[0m: Streaming LangChain activity to W&B at https://wandb.ai/dan-h/ai_assurance_app/runs/cf7j1fzz
[34m[1mwandb[0m: `WandbTracer` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `langchain`.


In [19]:
# Let's see the results
for doc in docs:
    print(doc.metadata["source"])

../data/governance.html
../data/applying-assurance-techniques.html
../data/what-is-assurance.html


### Notes
* look into this: i guess the docs go into the db as embeddings, but retain their link to the source doc? so its interesting to understand how th db works in this regard and what available metadata and normal data is available

In [21]:
for doc in docs:
    print(doc)
    print('/n-------------------/n')

page_content='Table of Contents\n\nCompliance with regulation\n\nManaging risk and building trust\n\nAI assurance services are a distinctive and important aspect of broader AI governance. AI governance covers all the means by which the development, use, outputs and impacts of AI can be shaped, influenced and controlled, whether by the government or by those who design, develop, deploy, buy or use these technologies. AI governance includes regulation but also tools like assurance and standards and statements of principles and practice, often referred to as AI ethics.\n\nRegulation, standards and other statements of principles and practice define what trustworthy AI looks like. Alongside this, AI assurance services provide the ‘infrastructure’ for checking, assessment and verification against these criteria. Assurance services are needed to evaluate and communicate reliable evidence about the trustworthiness of AI systems against the criteria set out by regulations, standards, principles

In [20]:
for doc in docs:
    print(doc.metadata)

{'source': '../data/governance.html'}
{'source': '../data/applying-assurance-techniques.html'}
{'source': '../data/what-is-assurance.html'}


## Stuff Prompt

We'll now take the content of the retrieved documents, stuff them into prompt template along with the query, and pass into an LLM to obtain the answer.

In [22]:
from langchain.prompts import PromptTemplate

prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

context = "\n\n".join([doc.page_content for doc in docs])
prompt = PROMPT.format(context=context, question=query)

Use langchain to call openai chat API with the question

In [23]:
from langchain.llms import OpenAI

llm = OpenAI()
response = llm.predict(prompt)
Markdown(response)



InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 5530 tokens (5274 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.

### Notes
* How does the prompt generated here differ from the prompt generated using the chain

In [None]:
query = "What are the principles of P3M?"

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

llm = OpenAI()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
result = qa.run(query)

Markdown(result)

### Notes
* need to look at prompt template more thoroughly
* understand the `retrieval qa chain` more thoroughly

## Gradio app

In [None]:
def retrieval_response(message, history):
    qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
    return qa.run(message)

In [None]:
import random

def random_response(message, history):
    return random.choice(["Yes", "No"])

In [None]:
import gradio as gr

gr.ChatInterface(retrieval_response,
                 title="Ask about P3M",
                chatbot=gr.Chatbot(height=300),
                textbox=gr.Textbox(placeholder="Ask a question about P3M", container=False, scale=7),
                #description="Ask Yes Man any question",
                theme="soft",
                examples=["What are the key principles of P3M?", "What are the limitations of P3M?"],
                cache_examples=True,
                retry_btn=None,
                undo_btn="Delete Previous",
                clear_btn="Clear",
            ).launch()


