# ENV Dependencies

In [1]:
# TGI ependencies
!pip install -q text-generation \
                langchain \
                transformers
# Chatbot dependencies
!pip install -q streamlit \
                streamlit-chat \
                langchain \
                langchainhub \
                huggingface_hub \
                transformers \
                pypdf \
                sentence_transformers \
                chromadb \
                tiktoken
# Streamlit UI Dependencies
!npm install -q localtunnel


[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
updated 1 package and audited 36 packages in 0.454s

3 packages are looking for funding
  run `npm fund` for details

found 2 [93mmoderate[0m severity vulnerabilities
  run `npm audit fix` to fix them, or `npm audit` for details
[K[?25h

# Ingest.py File
Script for ingesting a PDF and converting it into a vector database

In [2]:
%%writefile ingest.py

# Ingest packages
import os
import torch
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain import HuggingFaceHub


# Embedding model used to vectorise document chunks for the Chroma store
# NOTE(review): 'BAAI/bge-reranker-large' looks like a reranker checkpoint rather
# than an embedding checkpoint -- confirm it is the intended model. It is left
# unchanged because any vector store already persisted on disk was built with it.
embedd_model = 'BAAI/bge-reranker-large'
# Fall back to CPU so that importing this module does not fail without a GPU.
model_kwargs = {"device": 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=embedd_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

def ingest_doc(doc_path, file_name):
    """Build, or reopen, the Chroma vector store for one PDF.

    Args:
        doc_path: Path of the PDF file to load with ``PyPDFLoader``.
        file_name: Name of the store's sub-directory under
            ``./backend/vector_databases/``.

    Returns:
        The ``Chroma`` vector store persisted in that sub-directory. When the
        sub-directory already exists it is reopened and the PDF is not read.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    outdir = "./backend/vector_databases/"
    os.makedirs(outdir, exist_ok=True)

    # Creating database path
    db_path = os.path.join(outdir, file_name)
    print('Db Path: ', db_path)

    # A store persisted by an earlier run is reused as-is
    if os.path.exists(db_path):
        return Chroma(persist_directory=db_path, embedding_function=embeddings)

    # Loading doc
    raw_doc = PyPDFLoader(doc_path).load()

    # Split into chunks of at most 1000 characters without overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=["\n\n", "\n", " ", ""],
    )
    all_splits = text_splitter.split_documents(raw_doc)

    # Creating vector store
    return Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory=db_path)

def create_doc_obj(doc_path, file_name):
    """Load the PDF at ``doc_path`` with ``PyPDFLoader`` and return the result.

    Side effects: the vector-database root directory is created when it is
    missing, and the database path derived from ``file_name`` is printed.
    """
    vector_db_root = "./backend/vector_databases/"
    root_missing = not os.path.exists(vector_db_root)
    if root_missing:
        os.makedirs(vector_db_root)

    # The path is only reported here; this function never writes to it
    print('Db Path: ', os.path.join(vector_db_root, file_name))

    # Document object handed back to the caller
    return PyPDFLoader(doc_path).load()


Overwriting ingest.py


# Core.py File
Script for querying the LLM and returning the response and source documents

In [3]:
%%writefile core.py


from typing import Any, List, Dict

#Summary and checklist packages
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
# Chat packages
import torch
import os
from langchain.llms import HuggingFaceTextGenInference
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.callbacks import streaming_stdout
from langchain.chains.question_answering import load_qa_chain
from langchain import hub


# Global variables
# Secrets must not be stored in the notebook: the token is read from the
# environment (None when the variable is unset). Any token that was previously
# hard-coded here should be treated as leaked and revoked.
huggingfacehub_api_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
zephyr_repo = 'HuggingFaceH4/zephyr-7b-beta'
rag_prompt = hub.pull("rlm/rag-prompt-mistral")


# TGI URL (can be overridden with the TGI_INFERENCE_URL environment variable)
inference_url = os.environ.get(
    "TGI_INFERENCE_URL",
    "https://wp021uax7a.execute-api.us-west-2.amazonaws.com/default/mistralrequest",
)

# Embedding model (same checkpoint name as the one configured in ingest.py)
# NOTE(review): 'BAAI/bge-reranker-large' looks like a reranker checkpoint rather
# than an embedding checkpoint -- confirm it is the intended model.
embedd_model = 'BAAI/bge-reranker-large'
# Fall back to CPU so that importing this module does not fail without a GPU.
model_kwargs = {"device": 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=embedd_model, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


# Sampling settings shared by both text-generation clients
_generation_settings = dict(
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
)


# Chat LLM (answers capped at 512 new tokens)
chat_llm = HuggingFaceTextGenInference(
    inference_server_url=inference_url,
    max_new_tokens=512,
    **_generation_settings,
)


# Summarization and checklist LLM (outputs capped at 1000 new tokens)
sum_check_llm = HuggingFaceTextGenInference(
    inference_server_url=inference_url,
    max_new_tokens=1000,
    **_generation_settings,
)


# Shared map-reduce pipeline used by the summary and checklist entry points
def _run_map_reduce(document_object: Any, map_template: str, reduce_template: str) -> str:
    """Run a map-reduce LLM pass over ``document_object`` and return the final text.

    Args:
        document_object: Documents accepted by ``CharacterTextSplitter.split_documents``.
        map_template: Prompt applied to every chunk; must contain ``{docs}``.
        reduce_template: Prompt applied to the combined map outputs; must
            contain ``{doc_summaries}``.

    Returns:
        The output of ``MapReduceDocumentsChain.run`` on the split documents.
    """
    map_chain = LLMChain(llm=sum_check_llm, prompt=PromptTemplate.from_template(map_template))
    reduce_chain = LLMChain(llm=sum_check_llm, prompt=PromptTemplate.from_template(reduce_template))

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=4000,
    )

    # Combining documents by mapping a chain over them, then combining results
    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Do not return the results of the map steps in the output
        return_intermediate_steps=False,
    )

    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0)
    split_docs = text_splitter.split_documents(document_object)

    return map_reduce_chain.run(split_docs)


# Call LLM for summary
def run_llm_summarize(document_object: Any):
    """Return a consolidated summary of ``document_object`` produced by map-reduce."""

    # Map
    map_template = """<s> [INST] The following is a collection of excerpts from a compliance document:[/INST] </s>
    {docs}
    [INST] Based on the provided excerpts, summarize the main theme.
    Helpful Answer:[/INST]"""

    # Reduce
    reduce_template = """<s> [INST] The following is set of summaries:[/INST] </s>
    {doc_summaries}
    [INST] Take these and distill it into a final, consolidated summary. Ensure the final output is concise and easy to read.
    Helpful Answer:[/INST]"""

    return _run_map_reduce(document_object, map_template, reduce_template)


# Call LLM for checklist
def run_llm_checklist(document_object: Any):
    """Return a consolidated list of suggestions for ``document_object`` produced by map-reduce."""

    # Map
    map_template = """<s> [INST] The following is a collection of guidance from a compliance document:[/INST] </s>
    {docs}
    [INST] Based on the provided guidance, please create a list of suggestions.
    Helpful Answer:[/INST]"""

    # Reduce
    reduce_template = """<s> [INST] The following is a colection of suggestions from a compliance document:[/INST] </s>
    {doc_summaries}
    [INST] Take these and distill them into a final, consolidated list of suggestions to comply with the guidance provided in the document.
    Helpful Answer:[/INST]"""

    return _run_map_reduce(document_object, map_template, reduce_template)


# Call LLM for chat
def run_llm_chat(vector_database: Any, question: str):
    """Answer ``question`` from the chunks retrieved out of ``vector_database``.

    Args:
        vector_database: Object exposing ``as_retriever``.
        question: The user's question; used both for retrieval and in the prompt.

    Returns:
        A tuple ``(output, sources)``: the chain's ``output_text`` and the sorted
        ``page`` metadata values of the documents passed to the chain.
    """
    # Vector DB retriever: up to 10 chunks whose similarity score passes 0.8
    retriever = vector_database.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": .8, 'k': 10},
    )
    docs = retriever.get_relevant_documents(question)

    # Chain: all retrieved chunks are stuffed into a single prompt
    chain = load_qa_chain(chat_llm, chain_type="stuff", prompt=rag_prompt)
    # Run
    response = chain({"input_documents": docs, "question": question}, return_only_outputs=False)

    output = response['output_text']

    # Chunks without a 'page' entry are skipped instead of raising KeyError
    sources = sorted(
        doc.metadata['page']
        for doc in response['input_documents']
        if 'page' in doc.metadata
    )

    return output, sources


Overwriting core.py


# Utilities

In [4]:
%%writefile utilities.py


import os
import re
import streamlit as st
from typing import Set
from core import run_llm_summarize, run_llm_checklist

####################
# Utility functions
####################

# Function to list files in upload directory
def list_files():
    """Return the sorted names of the regular files in ``./backend/uploads/``.

    The directory is created when it is missing, so the first call on a fresh
    checkout returns ``[]``. Sub-directories are not listed. The result is
    sorted because ``os.listdir`` returns entries in arbitrary order.
    """
    outdir = "./backend/uploads/"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(outdir, exist_ok=True)

    return sorted(f for f in os.listdir(outdir) if os.path.isfile(os.path.join(outdir, f)))

# Saving a copy of PDF for vectorization
def save_upload(file):
    """Store an uploaded file under ``./backend/uploads/`` unless it already exists.

    Args:
        file: Object with a ``name`` attribute and a ``read()`` method returning bytes.

    Returns:
        A tuple ``(file_path, file_name)``: the path inside the uploads
        directory and the bare file name used for it.

    Raises:
        ValueError: If ``file.name`` has no usable final path component.
    """
    # Keep only the final path component so that a crafted name such as
    # "../../x.pdf" cannot be written outside the uploads directory.
    file_name = os.path.basename(file.name)
    if not file_name:
        raise ValueError(f"Invalid upload file name: {file.name!r}")

    # Creating the uploads directory when it is missing
    outdir = "./backend/uploads/"
    os.makedirs(outdir, exist_ok=True)

    # An existing copy is kept untouched; only new files are written
    file_path = os.path.join(outdir, file_name)
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(file.read())

    return file_path, file_name

# Return response sources in formatted string
def create_sources_string(source_urls: Set[str]) -> str:
    """Format the given sources as a sorted, comma-separated list.

    Args:
        source_urls: Collection of source identifiers; its items must be
            mutually sortable.

    Returns:
        ``""`` for an empty collection, otherwise ``"Pulled from pages:\\n "``
        followed by the sorted items joined with ``", "`` (no trailing comma).
    """
    if not source_urls:
        return ""
    return "Pulled from pages:\n " + ", ".join(str(source) for source in sorted(source_urls))

# Return file name for subheader
@st.cache_resource()
def clean_name(doc_name):
    """Turn a document file name into a display name.

    A trailing ``.pdf`` extension (any letter case) is removed and every
    remaining dot is replaced by a space.
    """
    # The dot is escaped and the pattern anchored at the end: an unescaped,
    # unanchored '.pdf' would also strip "<any char>pdf" from inside the name.
    cleaned_name = re.sub(r'\.pdf$', '', doc_name, flags=re.IGNORECASE)
    return cleaned_name.replace('.', ' ')

# Creating or loading summarization
@st.cache_data(show_spinner="Hey! 🤖👋 I'm diving into every page of your document to craft your summary. Depending on how many pages there are, this might take a few minutes. It's the perfect moment to grab yourself a coffee and relax for a bit!")
def create_or_load_summ(_doc_object, doc_name):
    """Return the summary for ``doc_name``, generating and saving it on first use.

    Args:
        _doc_object: Document object passed to ``run_llm_summarize``. The leading
            underscore is kept from the original signature.
        doc_name: Document file name; a trailing ``.pdf`` becomes ``.txt`` to
            name the file under ``./backend/summary/``.

    Returns:
        The summary text, read from disk when a saved copy exists.
    """
    # Creating the summary directory when it is missing
    outdir = "./backend/summary/"
    os.makedirs(outdir, exist_ok=True)
    # Escaped and anchored so only a real trailing extension is replaced
    file_name = re.sub(r'\.pdf$', '.txt', doc_name, flags=re.IGNORECASE)
    file_path = "./backend/summary/"+file_name

    # Loading saved summary
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    # Generating summary
    summary = run_llm_summarize(document_object=_doc_object)
    # Saving to file; explicit UTF-8 so non-ASCII model output cannot fail
    # under a narrower platform default encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(summary)
    return summary

# Creating or loading checklist
@st.cache_data(show_spinner="Hi there! 🤖👋 I'm currently compiling a list of suggestions based on each page in your document to create your personalized checklist. Depending on the number of pages, this process might take a little while. Feel free to take a break and grab a coffee while I work on this for you!")
def create_or_load_checklist(_doc_object, doc_name):
    """Return the checklist for ``doc_name``, generating and saving it on first use.

    Args:
        _doc_object: Document object passed to ``run_llm_checklist``. The leading
            underscore is kept from the original signature.
        doc_name: Document file name; a trailing ``.pdf`` becomes ``.txt`` to
            name the file under ``./backend/checklist/``.

    Returns:
        The checklist text, read from disk when a saved copy exists.
    """
    # Creating the checklist directory when it is missing
    outdir = "./backend/checklist/"
    os.makedirs(outdir, exist_ok=True)
    # Escaped and anchored so only a real trailing extension is replaced
    file_name = re.sub(r'\.pdf$', '.txt', doc_name, flags=re.IGNORECASE)
    file_path = "./backend/checklist/"+file_name

    # Loading saved checklist
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    # Generating checklist
    checklist = run_llm_checklist(document_object=_doc_object)
    # Saving to file; explicit UTF-8 so non-ASCII model output cannot fail
    # under a narrower platform default encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(checklist)
    return checklist

Overwriting utilities.py


# App.py File
Script for running the streamlit interface

In [5]:
%%writefile app.py

import re
import time
import os
import streamlit as st
from typing import Set
from streamlit_chat import message

from ingest import ingest_doc, create_doc_obj
from utilities import (list_files,
                       save_upload,
                       create_sources_string,
                       clean_name,
                       create_or_load_summ,
                       create_or_load_checklist
                       )
from core import run_llm_summarize, run_llm_checklist, run_llm_chat


####################
# Utility functions
####################

# # Function to list files in upload directory
# def list_files():
#      # Checking if the uploads directory exists, and create it if it doesn't
#     outdir = "./backend/uploads/"
#     if not os.path.exists(outdir):
#         os.makedirs(outdir)

#     return [f for f in os.listdir(outdir) if os.path.isfile(os.path.join(outdir, f))]

# # Saving a copy of PDF for vectorization
# def save_upload(file):
#     file_name = file.name

#     # Checking if the uploads directory exists, and create it if it doesn't
#     outdir = "./backend/uploads/"
#     if not os.path.exists(outdir):
#         os.makedirs(outdir)

#     # Checking if the file already exists, and saving it if it doesn't
#     file_path = os.path.join(outdir, file_name)
#     if not os.path.exists(file_path):
#         # Saving the file
#         with open(os.path.join(outdir, file_name), "wb") as f:
#             f.write(file.read())

#     return file_path, file_name

# # Return response sources in formatted string
# def create_sources_string(source_urls: Set[str]) -> str:
#     if not source_urls:
#         return ""
#     sources_list = list(source_urls)
#     sources_list.sort()
#     sources_string = "Pulled from pages:\n"
#     for i, source in enumerate(sources_list):
#         sources_string += f" {source},"
#     return sources_string

# # Return file name for subheadder
# @st.cache_resource()
# def clean_name(doc_name):

#     cleaned_name = re.sub(r'.pdf', '', doc_name, flags=re.IGNORECASE)
#     cleaned_name = re.sub(r'\.', ' ', cleaned_name)
#     return cleaned_name

####################
# Global Variables
####################

# Creating list of available saved documents
saved_docs = list_files()


####################
# Streamlit interface
####################

import streamlit as st
import time

# Placeholders so the title and sub-header can be replaced once a document is loaded
title = st.empty()
sub_header = st.empty()

title.title('Welcome to Sibyl 🤖')
sub_header.subheader('Your AI assistant for document review!')

# Each history list is initialised on its own: a combined `and` check would
# skip all three as soon as any single key already exists.
for _history_key in ("chat_answers_history", "user_prompt_history", "chat_history"):
    if _history_key not in st.session_state:
        st.session_state[_history_key] = []


# Function for selecting saved document and returning vectorized database
@st.cache_resource(show_spinner='Pulling document from database...')
def select_document_sidebar(file):
    """Load the vector store and document object for a saved upload.

    Args:
        file: File name inside ``./backend/uploads/``, or a falsy value when
            nothing is selected.

    Returns:
        ``(True, vector_store, raw_docs)`` when ``file`` is given, otherwise
        ``(False, None, None)``.
    """
    # Decide on the argument itself rather than on a module-level widget value
    if not file:
        return False, None, None

    outdir = './backend/uploads/'
    file_path = os.path.join(outdir, file)
    # Temporary notice shown while the document is being ingested
    loading_message_container = st.empty()
    loading_message_container.info('Hangtight while I search for the document...', icon="🔎")
    vectore_store = ingest_doc(file_path, file)
    raw_docs = create_doc_obj(file_path, file)
    loading_message_container.empty()
    return True, vectore_store, raw_docs

# Function for uploading and vectorizing document
@st.cache_resource(show_spinner='Processing the document...')
def upload_document_sidebar(file):
    """Save an uploaded file, then build its vector store and document object.

    Args:
        file: Uploaded file object accepted by ``save_upload``, or a falsy value
            when nothing has been uploaded.

    Returns:
        ``(True, vector_store, raw_docs)`` when ``file`` is given, otherwise
        ``(False, None, None)``.
    """
    # Decide on the argument itself rather than on a module-level widget value
    if not file:
        return False, None, None

    file_path, file_name = save_upload(file)
    # Temporary notice shown while the document is being ingested
    loading_message_container = st.empty()
    loading_message_container.info("Hangtight, I'm giving the document a quick translation into a computer-friendly language. This shouldn't take more than a minute!",
                                   icon="📑")
    vectore_store = ingest_doc(file_path, file_name)
    raw_docs = create_doc_obj(file_path, file_name)
    loading_message_container.empty()
    return True, vectore_store, raw_docs



# Sidebar for selecting/uploading document.
# This section sets `sidebar_completed` (always) and, once a document has been
# chosen, `doc_name`, `st.session_state.vectore_store` and
# `st.session_state.document_object`, which the sections below read.
upload_placeholder = st.empty()

with upload_placeholder.info(" 👈 Select document or upload your own to start chat"):
    st.sidebar.header("Select a File or Upload New Document")
    with st.sidebar:

        # Reset the control state; it is set again below only when a document
        # is actually chosen
        sidebar_completed = False
        st.session_state.vectore_store = None

        # Adding empty line for spacing
        st.markdown("")

        # Checkbox for user confirmation with agreement link
        agreement_link = "[User Agreement](https://google.com)"
        user_confirmation = st.checkbox(label=f"I confirm that I have read and understood the {agreement_link}.")

        # Adding empty line for spacing
        st.markdown("")

        if user_confirmation:  # User confirmed, allow document selection/upload

            document_selection = st.radio("Would you like to upload a document or select a saved document?",
                             ["Upload", "Select"],
                             captions = ["Load a new document", "Browse preprocessed documents"])

            # Adding empty line for spacing
            st.markdown("")

            if document_selection == "Upload":
                # Widget to upload new document
                file_uploaded = st.file_uploader("Upload your PDF file", type="pdf", key='FileUpload')
                upload_sidebar_completed, uploaded_vectore_store, raw_document_object = upload_document_sidebar(file_uploaded)

                if file_uploaded:
                    # The uploaded file's name is what later sections use as doc_name
                    doc_name = file_uploaded.name
                    # Successful message
                    message_container = st.empty()
                    message_container.success('Document processed successfully!', icon="✅")
                    st.session_state.message_container = message_container

                    # Changing control variable to enable chatting
                    st.session_state.vectore_store = uploaded_vectore_store
                    st.session_state.document_object = raw_document_object
                    sidebar_completed = upload_sidebar_completed
                    # upload_placeholder.empty()

            elif document_selection == "Select":
                # Create a dropdown menu in the sidebar for file selection
                # (index=None: no file is preselected)
                file_selected = st.sidebar.selectbox(label="Select a File", options=saved_docs, placeholder='Choose an option', index=None )
                select_sidebar_completed, selected_vectore_store, raw_document_object = select_document_sidebar(file_selected)

                if file_selected:
                    # The selected file name is what later sections use as doc_name
                    doc_name = file_selected
                    # Successful message
                    message_container = st.empty()
                    message_container.success('Document loaded!', icon="✅")
                    st.session_state.message_container = message_container

                    # Changing control variable to enable chatting
                    st.session_state.vectore_store = selected_vectore_store
                    st.session_state.document_object = raw_document_object
                    sidebar_completed = select_sidebar_completed
                    # upload_placeholder.empty()
            else:
                # Reached only when the radio value is neither "Upload" nor "Select"
                st.info("Let's pick a document to review!", icon="☝️")


# Summarize or chat selection
# Both flags default to "not pressed" so that the checks further down never
# reference an undefined name while no document is loaded.
summarize = False
quick_list = False

if sidebar_completed:
    title.title('Sibyl 🤖 is ready to go!')
    sub_header.subheader(f"Let's talk about {clean_name(doc_name)}")
    upload_placeholder.info("I've got your document ready!", icon="👇")
    st.divider()
    st.write("Get a summary or a personalized checklist of action items by simply clicking the buttons provided below.")

    # One button per column; each is True only on the rerun triggered by its click
    col1, col2 = st.columns(2)
    with col1:
        summarize = st.button("Summarize")
    with col2:
        quick_list = st.button("Quick Guidance")



# A document is ready only when the sidebar finished AND a vector store is stored.
# (`"vectore_store" in st.session_state is not None` is a chained comparison that
# tests key presence only, never the stored value.)
document_ready = sidebar_completed and st.session_state.get("vectore_store") is not None

# Summarizing document
if document_ready and summarize:
    summary = create_or_load_summ(_doc_object=st.session_state.document_object, doc_name=doc_name)
    st.write(summary)

# Checklist list
if document_ready and quick_list:
    checklist = create_or_load_checklist(_doc_object=st.session_state.document_object, doc_name=doc_name)
    st.write(checklist)


# Starting chat
# The stored value is checked explicitly: `"vectore_store" in st.session_state is
# not None` is a chained comparison that only tests key presence.
if sidebar_completed and st.session_state.get("vectore_store") is not None:
    vectore_store = st.session_state.vectore_store

    # Adding empty line for spacing
    st.markdown("")
    st.caption("Start chatting by entering your question in the query queue at the bottom of the page!")
    st.divider()
    prompt = st.chat_input(placeholder="Enter your question here...")

    if prompt:
        with st.spinner("Searching document for the answer..."):
            generated_response, sources = run_llm_chat(vector_database=vectore_store, question=prompt)
            formatted_response = (f"{generated_response} \n\n {create_sources_string(set(sources))}")

        # Record the exchange so it is replayed on every rerun
        st.session_state.chat_history.append((prompt, generated_response))
        st.session_state.user_prompt_history.append(prompt)
        st.session_state.chat_answers_history.append(formatted_response)

    # Replaying the conversation, oldest exchange first (no output when empty)
    for answer_text, question_text in zip(
        st.session_state["chat_answers_history"],
        st.session_state["user_prompt_history"],
    ):
        with st.chat_message("user", avatar="🤔"):
            st.write(question_text)
        with st.chat_message("ai", avatar="🤖"):
            st.write(answer_text)


Overwriting app.py


# Run Streamlit App
Code to run the Streamlit app in Colab

In [6]:
# Start the Streamlit app in the background with its output redirected to
# /content/logs.txt, then print this machine's public IPv4 address.
!streamlit run /content/app.py &>/content/logs.txt & curl ipv4.icanhazip.com
# Open a localtunnel tunnel to local port 8501 and print its public URL.
# NOTE(review): presumably 8501 is the port the Streamlit app listens on -- confirm.
!npx localtunnel --port 8501


34.83.125.52
[K[?25hnpx: installed 22 in 6.345s
your url is: https://six-roses-kneel.loca.lt
^C


# Old Code