# In [4]:
import streamlit as st
import os
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)
from pathlib import Path
import tempfile

# Constants
# SECURITY: an OpenAI API key used to be hard-coded here. Credentials must never be
# committed to source control; the previously embedded key should be treated as
# leaked and revoked. Provide the key through the OPENAI_API_KEY environment
# variable before launching the app (e.g. `export OPENAI_API_KEY=...`).
PERSIST_DIR = "data/storage"  # directory the index is persisted to / loaded from
PRE_EXISTING_DOC_DIR = "data/storage/GDPR_EN.docx"  # baseline document indexed on first run

# Load the pre-existing document and create the initial index if necessary
def initialize_index():
    """Return the vector index, building and persisting it on first use.

    If a persisted index is found under ``PERSIST_DIR`` it is loaded from
    storage; otherwise the document at ``PRE_EXISTING_DOC_DIR`` is read,
    indexed, and the resulting index is persisted to ``PERSIST_DIR``.

    Returns:
        The index created by ``VectorStoreIndex.from_documents`` or loaded by
        ``load_index_from_storage``.
    """
    # PRE_EXISTING_DOC_DIR sits inside PERSIST_DIR, so the directory existing
    # does not prove an index was ever persisted. Look for the persisted
    # docstore file instead.
    # NOTE(review): "docstore.json" is assumed to be the default file name
    # written by storage_context.persist() -- confirm for the installed version.
    persisted_docstore = os.path.join(PERSIST_DIR, "docstore.json")

    if os.path.exists(persisted_docstore):
        st.write("Loading existing index from storage...")
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
        index = load_index_from_storage(storage_context)
        st.write("Index loaded.")
        return index

    st.write("Initializing storage and indexing pre-existing document...")
    # The first positional argument of SimpleDirectoryReader is a directory;
    # a single file has to be passed through input_files instead.
    if os.path.isdir(PRE_EXISTING_DOC_DIR):
        reader = SimpleDirectoryReader(input_dir=PRE_EXISTING_DOC_DIR)
    else:
        reader = SimpleDirectoryReader(input_files=[PRE_EXISTING_DOC_DIR])
    documents = reader.load_data()
    index = VectorStoreIndex.from_documents(documents)
    # Store the index for later use
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    st.write("Index initialized and persisted.")
    return index

# Function to add a new document to the index
def add_document_to_index(index, document_path):
    """Load the document(s) at ``document_path``, insert them into ``index``, and persist.

    Args:
        index: The index to extend; it must provide ``insert`` and
            ``storage_context.persist``.
        document_path: Path (``str`` or ``pathlib.Path``) of a single file or
            of a directory of files to add.
    """
    st.write(f"Adding document {document_path} to the index...")

    # The first positional argument of SimpleDirectoryReader is a directory;
    # a single file has to be passed through input_files instead.
    path_str = str(document_path)
    if os.path.isdir(path_str):
        reader = SimpleDirectoryReader(input_dir=path_str)
    else:
        reader = SimpleDirectoryReader(input_files=[path_str])
    documents = reader.load_data()

    # The index has no add_documents() method; documents are added one at a
    # time with insert().
    for document in documents:
        index.insert(document)

    index.storage_context.persist(persist_dir=PERSIST_DIR)
    st.write("Document added and index updated.")

# Function to create a prompt template
def create_prompt_template(question, document1_summary, document2_summary):
    """Build the compliance-check prompt sent to the query engine.

    Args:
        question: The user's question.
        document1_summary: Summary text for Document 1.
        document2_summary: Summary text for Document 2.

    Returns:
        A single string containing both summaries (each followed by a blank
        line), the question, and a closing instruction line.
    """
    prompt_lines = [
        f"Document 1 Summary: {document1_summary}",
        "",  # blank line between sections
        f"Document 2 Summary: {document2_summary}",
        "",
        f"Question: {question}",
        "Suggest if the policy from Document 2 is within compliance of Document 1.",
    ]
    return "\n".join(prompt_lines)

# Streamlit app
def main():
    """Run the Streamlit app.

    Loads (or builds) the index, lets the user upload one document that is
    added to the index, and, once both a question and an upload are present,
    queries the index with a prompt built by ``create_prompt_template`` and
    displays the prompt and the response.
    """
    st.title("Dynamic Document Indexing with Streamlit")

    # Initialize or load the index
    index = initialize_index()

    # File uploader for new document
    uploaded_file = st.file_uploader("Upload a new document", type=["txt", "pdf", "docx"])

    if uploaded_file is not None:
        # Streamlit re-runs this script on every widget interaction while the
        # uploader keeps returning the same file. Remember what this session
        # already indexed so the document is not inserted again on each rerun.
        if "indexed_uploads" not in st.session_state:
            st.session_state["indexed_uploads"] = set()
        upload_key = (uploaded_file.name, uploaded_file.size)

        if upload_key not in st.session_state["indexed_uploads"]:
            # Keep the original extension so the reader can choose a parser
            # based on the file type.
            suffix = Path(uploaded_file.name).suffix
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_file.write(uploaded_file.getvalue())
                temp_file_path = Path(temp_file.name)

            try:
                # Add the uploaded document to the index
                add_document_to_index(index, temp_file_path)
            finally:
                # Clean up temporary file even if indexing fails
                temp_file_path.unlink(missing_ok=True)

            st.session_state["indexed_uploads"].add(upload_key)

    # Create a query engine
    query_engine = index.as_query_engine()

    # Text input for the user's question
    question = st.text_input("Enter your question:")

    if question and uploaded_file is not None:
        # Generate summaries for the documents (for demonstration, we'll use a simple method)
        # In a real application, you would extract or generate summaries dynamically
        document1_summary = "Summary of Document 1 (pre-existing document)"
        document2_summary = "Summary of Document 2 (uploaded document)"

        # Create a prompt using the template
        prompt = create_prompt_template(question, document1_summary, document2_summary)

        # Get the response from the query engine
        response = query_engine.query(prompt)

        # Display the question and response
        st.write(f"Prompt: {prompt}")
        st.write(f"Response: {response}")

# Entry point: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Recorded notebook output (not part of the program):
# 2024-07-01 18:52:29.373 Session state does not function when running a script without `streamlit run`
#
#
# SyntaxError: invalid syntax (4284796380.py, line 1)