<a href="https://colab.research.google.com/github/imkiding/OpenROAD/blob/master/SemiGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Installation

In [None]:
# Install Streamlit
!pip install streamlit

# Download Ngrok
# Fetches the Linux amd64 ngrok archive and unpacks it into the current
# directory; a later cell in this notebook runs the binary as ./ngrok.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

In [None]:
# Install the libraries imported by app.py. chromadb is pinned to 0.3.29;
# streamlit is listed again here although the previous cell already installs it.
!pip install openai langchain streamlit chromadb==0.3.29 unstructured[pdf] tiktoken

# App Definition


In [12]:
%%writefile app.py

import os
import streamlit as st
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader

# Variables for model selection and setting
# The trailing "# @param" annotations are Colab form markers.

# NOTE(review): this placeholder is not read by any code visible in this file;
# the key actually used is the one typed into the sidebar, which
# get_openai_key_in_sidebar() exports to the OPENAI_API_KEY environment variable.
OPENAI_API_KEY='sk-' # @param {type:"string"}

# Chat model name passed to ChatOpenAI(model_name=...).
Model_Name='gpt-3.5-turbo' # @param ["gpt-3.5-turbo", "2nd option", "3rd option"]

# Passed to ChatOpenAI(max_tokens=...).
Max_Tokens=1000 # @param {type:"integer"}

# Passed to ChatOpenAI(temperature=...).
Temperature = 0 # @param {type:"integer"}

class UserInterface:
  """Streamlit widgets of the app: page title, sidebar inputs and answer rendering."""

  def get_ui_title(self):
    """Render the page header and the hint that points at the sidebar uploader."""
    st.header("Upload Your Files and Ask Questions")
    st.subheader(':open_file_folder: Upload (PDF, DOCX, TXT) Files from the Sidebar :point_left:')

  def get_openai_key_in_sidebar(self):
    """Ask for the OpenAI API key in the sidebar and export it to the environment.

    While the field is empty an info message is shown and the script run is
    halted with ``st.stop()``, so nothing after this call executes.
    """
    with st.sidebar:
      openai_api_key = st.text_input("OpenAI API Key", type="password")

    if not openai_api_key:
      st.info("Please add your OpenAI API key to continue.")
      st.stop()

    # Export only a non-empty key so that an empty text box never overwrites
    # an OPENAI_API_KEY value that is already present in the environment.
    os.environ["OPENAI_API_KEY"] = openai_api_key

  def update_docs_in_sidebar(self):
    """Show the multi-file uploader in the sidebar and return the uploaded files."""
    with st.sidebar:
      uploaded_files = st.file_uploader("Please upload your files", accept_multiple_files=True, type=None)
      st.info("Please refresh the browser if you decided to upload more files to reset the session", icon="🚨")
    return uploaded_files

  def display_assistant_result(self, result):
    """Render the assistant's answer and append it to the chat history.

    Args:
      result: mapping returned by the retrieval chain; only ``result["answer"]``
        is read.
    """
    full_response = result["answer"]

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
      st.markdown(full_response)
    print(full_response)

    # Keep the answer in the session history so that later reruns can replay
    # the conversation and later questions can use it as context.
    st.session_state.messages.append({"role": "assistant", "content": full_response})

class OpenAIModel:
  """Builds the conversational retrieval chain over the uploaded documents."""

  def create_openai_retrieval_chain(self, document_chunks):
    """Return a ConversationalRetrievalChain backed by a Chroma vector store.

    The vector store is built from ``document_chunks`` the first time this
    runs in a session and kept in ``st.session_state.processed_data``; later
    calls reuse the stored one and do not look at ``document_chunks``.
    """
    if "processed_data" in st.session_state:
      # Reuse the store that was built earlier in this session.
      store = st.session_state.processed_data["vectorstore"]
    else:
      store = Chroma.from_documents(document_chunks, OpenAIEmbeddings())
      # Remember both the chunks and the store for subsequent reruns.
      st.session_state.processed_data = {
        "document_chunks": document_chunks,
        "vectorstore": store,
      }

    # Chat model configured from the module-level settings.
    chat_model = ChatOpenAI(temperature=Temperature, max_tokens=Max_Tokens, model_name=Model_Name, streaming=True)
    return ConversationalRetrievalChain.from_llm(chat_model, store.as_retriever())

  # TODO: create_rag_chain(self) is not implemented yet.



class UserQuestion:
  """A single question typed into the chat input."""

  def __init__(self, question):
    self.question = question

  @staticmethod
  def _pair_chat_history(messages):
    """Collapse role-tagged messages into ``(question, answer)`` tuples.

    ConversationalRetrievalChain reads each history tuple as
    ``(human question, AI answer)``, so the flat list of
    ``{"role": ..., "content": ...}`` messages must be paired up by turn.
    A user message that has no assistant reply after it (for example the
    question currently being asked) is left out.

    Example:
      [{"role": "user", "content": "Hello"},
       {"role": "assistant", "content": "Hi there"}]
      becomes [("Hello", "Hi there")].
    """
    pairs = []
    pending_question = None
    for message in messages:
      role = message["role"]
      if role == "user":
        pending_question = message["content"]
      elif role == "assistant" and pending_question is not None:
        pairs.append((pending_question, message["content"]))
        pending_question = None
    return pairs

  def create_prompt(self):
    """Record and display the question, then build the chain input.

    Returns:
      A dict with the keys ``"question"`` (the current question) and
      ``"chat_history"`` (the completed earlier turns as
      ``(question, answer)`` tuples).
    """
    # Store the question so the conversation can be replayed and used as
    # context for subsequent questions.
    st.session_state.messages.append({"role": "user", "content": self.question})
    with st.chat_message("user"):
      st.markdown(self.question)

    # The just-appended question has no answer yet, so the pairing drops it
    # and only completed turns are sent as history.
    chat_history = self._pair_chat_history(st.session_state.messages)
    return {"question": self.question, "chat_history": chat_history}


class DataProcessor:
  """Turns uploaded files into text chunks ready for embedding."""

  def create_document_chuncks(self, uploaded_files):
    """Load every uploaded file and split the extracted text into chunks.

    Args:
      uploaded_files: the objects returned by the Streamlit file uploader;
        each one must provide ``.name`` and ``.getvalue()``.

    Returns:
      The list produced by ``CharacterTextSplitter.split_documents`` with
      ``chunk_size=2000`` and ``chunk_overlap=200``.
    """
    # Local import: tempfile is not among the file-level imports.
    import tempfile

    # Print the number of files to console
    print(f"Number of files uploaded: {len(uploaded_files)}")

    documents = []
    # UnstructuredFileLoader needs a path on disk. The files are written into
    # a throwaway directory rather than the working directory so that an
    # upload can never overwrite an existing file (e.g. one named "app.py")
    # and nothing is left behind once loading is finished.
    with tempfile.TemporaryDirectory() as scratch_dir:
      for uploaded_file in uploaded_files:
        # The name is supplied by the client: keep only its final component
        # so it cannot point outside scratch_dir. The extension is preserved
        # because the name is otherwise unchanged.
        safe_name = os.path.basename(uploaded_file.name) or "upload"
        file_path = os.path.join(scratch_dir, safe_name)

        # Save the uploaded file to disk
        with open(file_path, "wb") as f:
          f.write(uploaded_file.getvalue())

        # Load while the temporary file still exists.
        loaded_documents = UnstructuredFileLoader(file_path).load()
        print(f"Number of files loaded: {len(loaded_documents)}")

        # Extend the main documents list with the loaded documents
        documents.extend(loaded_documents)

    # Chunk the data; embeddings and the vector store are created by the caller.
    text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    return text_splitter.split_documents(documents)

def main():
  """Run one pass of the Streamlit app.

  Flow: draw the title, require an API key, collect uploads, build (or reuse)
  the retrieval chain, replay the stored conversation, then answer a newly
  submitted question if there is one.
  """
  user_interface = UserInterface()
  user_interface.get_ui_title()
  user_interface.get_openai_key_in_sidebar()

  # Initialize chat history
  if "messages" not in st.session_state:
    st.session_state.messages = []

  st.write("Upload Documents")
  uploaded_files = user_interface.update_docs_in_sidebar()
  st.write("Upload Documents Done")

  # Nothing to query until at least one file is uploaded.
  if not uploaded_files:
    st.write("Please upload your files.")
    return

  data_processor = DataProcessor()
  model = OpenAIModel()

  # The script reruns on every interaction. Once the session already holds
  # processed data, reuse the stored chunks instead of saving and parsing
  # every upload again (the chain builder reuses its stored vector store in
  # that case as well).
  if "processed_data" in st.session_state:
    document_chunks = st.session_state.processed_data["document_chunks"]
  else:
    document_chunks = data_processor.create_document_chuncks(uploaded_files)
  retrieval_chain = model.create_openai_retrieval_chain(document_chunks)

  # Replay the conversation so far on every rerun. This happens before the new
  # question is recorded, so the question is drawn once and in order.
  for message in st.session_state.messages:
    with st.chat_message(message["role"]):
      st.markdown(message["content"])

  # Accept and answer user questions
  if question := st.chat_input("Your questions?"):
    user_question = UserQuestion(question)
    # create_prompt() stores and displays the question itself.
    prompt = user_question.create_prompt()
    result = retrieval_chain(prompt)
    user_interface.display_assistant_result(result)


if __name__ == "__main__":
    main()

Overwriting app.py


In [8]:
# Start the Streamlit app in the background and discard its output.
!streamlit run app.py &>  /dev/null&
# Never commit an ngrok authtoken to the notebook: any token that has been
# published must be revoked in the ngrok dashboard. Provide it through the
# NGROK_AUTHTOKEN environment variable instead, e.g. by running
# os.environ["NGROK_AUTHTOKEN"] = "<token>" in a cell that is not saved.
!./ngrok authtoken "$NGROK_AUTHTOKEN"
# Expose port 8501 (the port Streamlit serves on by default) through ngrok.
!./ngrok http 8501 &> /dev/null&

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml
