In [None]:
# %pip install -r ../requirements-open-llms.txt

In [None]:
# %pip install langchain langchain_openai

In [None]:
# NOTE(review): presumably mounts the raadsinformatie blob storage via blobfuse — confirm.
# The absolute /home/azureuser path looks specific to one Azure compute instance;
# this cell will likely fail anywhere else (e.g. when my_run == "local").
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [None]:
import sys

# Make modules in the parent directory importable (my_secrets, settings, config).
# Guarded so that re-running this cell does not keep appending duplicates.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
# Select where to run the notebook: "azure" or "local".
# The value decides which config module is imported as `cf` and whether the
# Hugging Face cache directory is prepared (azure only).
# my_run = "azure"
my_run = "local"

In [None]:
import my_secrets as sc
import settings as st

# Import the configuration module that matches the selected environment as `cf`.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here with a clear message instead of a NameError on `cf` further down.
    raise ValueError(f'Unknown my_run value {my_run!r}; expected "azure" or "local".')

In [None]:
import os

# On Azure, point the transformers cache at the configured directory.
if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent directories and is a
    # no-op when the directory already exists (no check-then-create race).
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)

    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

In [None]:
import pandas as pd
from pathlib import Path

# Folder for comparison output, nested under the raadsinformatie output folder.
# It is created (including any missing parents) when it does not exist yet.
comparison_folder = "{}/comparison".format(cf.raadsinformatie_out_folder)
Path(comparison_folder).mkdir(exist_ok=True, parents=True)

In [None]:
import os
import glob

# Every sub-folder of each configured source, scanned in this fixed source order.
woo_dirs = [
    f"{cf.woo_sources[source]}/{folder}"
    for source in ("openamsterdam", "raadsinformatie", "amsterdam.nl")
    for folder in os.listdir(cf.woo_sources[source])
]

# All .ocr files inside those folders, as one flat list.
# A nested comprehension avoids the quadratic list copying of sum(..., []).
# glob.escape keeps folder names containing *, ? or [ from being read as patterns.
woo_files = [
    ocr_path
    for folder in woo_dirs
    for ocr_path in glob.glob(f"{glob.escape(folder)}/*.ocr")
]

In [None]:
# Number of .ocr files found across all source folders.
len(woo_files)

### Set up the LLM and embedding model

In [None]:
from langchain.prompts import ChatPromptTemplate
# from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnableMap
from langchain.schema.output_parser import StrOutputParser
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader


# Read every OCR text file into LangChain documents, one TextLoader per file.
docs = [
    document
    for ocr_path in woo_files
    for document in TextLoader(ocr_path).load()
]

# loader = DirectoryLoader(test_folder, glob="*.txt", show_progress=True)
# docs = loader.load()
print(len(docs))

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 1000 (as measured by the
# splitter's default length function) with no overlap between chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = splitter.split_documents(docs)

# Without chunks there is nothing to embed or retrieve; fail with a clear
# message instead of an IndexError on documents[0].
if not documents:
    raise ValueError(
        "No document chunks were produced; check that woo_files is not empty."
    )

# Show the first chunk as a sanity check.
print(documents[0])

In [None]:
# Prompt text for the plain (no retrieval) chain: only the question is filled in.
simple_template = (
    "\n"
    "QUESTION: {question}\n"
    "\n"
    "YOUR ANSWER:"
)

# Wrap the template as a chat prompt consisting of a single system message.
simple_prompt = ChatPromptTemplate.from_messages([("system", simple_template)])

In [None]:
# llm = ChatOpenAI(
#     openai_api_key=AZURE_OPENAI_API_KEY,
#     temperature=0.3,
#     model='gpt-3.5-turbo')

# Azure-hosted chat model; credentials come from my_secrets, the endpoint from settings.
llm = AzureChatOpenAI(
    model="gpt-35-turbo",
    temperature=0.6,
    api_version="2023-05-15",
    azure_endpoint=st.AZURE_OPENAI_ENDPOINT,
    openai_api_key=sc.AZURE_OPENAI_API_KEY,
)

# Pass the incoming "question" through to the prompt unchanged.
inputs = RunnableMap({"question": lambda payload: payload["question"]})

# Baseline chain without retrieval: question -> prompt -> model -> plain string.
simple_chain = inputs | simple_prompt | llm | StrOutputParser()

# Try the baseline on one example question; the answer is shown as the cell output.
simple_chain.invoke(
    {"question": "Geef mij een overzicht van alle meldingen over Café Triple aan Lijnbaansgracht 161."}
)

In [None]:
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS

# Embedding model used to index the document chunks.
embedding_model = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    # deployment_name="text-embedding-ada-002",
    api_key=sc.AZURE_OPENAI_API_KEY,
    azure_endpoint=st.AZURE_OPENAI_ENDPOINT,
    api_version="2023-05-15",
)

# Build the vector store from all chunks.
# db = FAISS.from_documents(
db = Chroma.from_documents(documents, embedding=embedding_model)

# Retriever for the chat model, configured with k=5 results per query.
retriever = db.as_retriever(search_kwargs={"k": 5})

In [None]:
# Prompt text for the retrieval chain: retrieved context first, then the question.
rag_template = (
    "\n"
    "\n"
    "CONTEXT:\n"
    "{context}\n"
    "\n"
    "QUESTION: {question}\n"
    "\n"
    "YOUR ANSWER:"
)

# Wrap the RAG template as a chat prompt consisting of a single system message.
rag_prompt = ChatPromptTemplate.from_messages([("system", rag_template)])

In [None]:
# Define the chain
def _format_context(retrieved):
    """Join the text of the retrieved documents into one context string.

    Without this, the prompt would receive the repr of a list of Document
    objects (including metadata) instead of the document text itself.
    """
    return "\n\n".join(doc.page_content for doc in retrieved)


inputs = RunnableMap({
    # retriever.invoke replaces the deprecated get_relevant_documents.
    "context": lambda x: _format_context(retriever.invoke(x["question"])),
    "question": lambda x: x["question"],
})
rag_chain = inputs | rag_prompt | llm | StrOutputParser()

# Call the chain with every test question, separating the answers visually.
for prompt in st.TEST_PROMPTS:
    print(rag_chain.invoke({"question": prompt}))
    print(20 * "=")