In [None]:
# %pip install -r ../requirements-open-llms.txt

In [None]:
# %pip install llama-index llama-index-llms-azure-openai llama-index-embeddings-azure-openai llama-index-llms-huggingface llama-index-embeddings-huggingface

In [None]:
# Runs the blobfuse shell script (presumably mounts the raadsinformatie blob storage — confirm).
# NOTE(review): the path is absolute and user-specific, and this cell runs before `my_run`
# is defined, so it is also executed for a "local" run where the script is likely absent.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [None]:
import sys
# Make the parent directory importable; the project modules imported in later cells
# (my_secrets, settings, config / config_azure) are presumably located there — confirm.
sys.path.append("..")

In [None]:
# Select where to run notebook: "azure" or "local".
# Later cells read this value to choose the config module and to set up the HuggingFace cache.
my_run = "azure"

In [None]:
import my_secrets as sc
import settings as st

# Import the configuration module that matches the selected run environment.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here with a clear message instead of a NameError on `cf` in a later cell.
    raise ValueError(f'Unknown my_run {my_run!r}. Expected "azure" or "local".')

In [None]:
import os

if my_run == "azure":
    # makedirs(..., exist_ok=True) also creates missing parent directories and
    # does not fail when the directory already exists.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)

    # Must be set before transformers is imported further down in the notebook.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

In [None]:
import pandas as pd
from pathlib import Path

# Folder in which the per-prompt retrieval results are stored as CSV files.
comparison_folder = f"{cf.raadsinformatie_out_folder}/comparison"
# Create the folder (including missing parents); no error if it already exists.
Path(comparison_folder).mkdir(parents=True, exist_ok=True)

In [None]:
import os
import glob

# Keys of cf.woo_sources to scan, in the order their folders should appear.
woo_source_names = ("openamsterdam", "raadsinformatie", "amsterdam.nl")

# Every entry directly below each source root, as "<root>/<entry>".
woo_dirs = [
    f"{cf.woo_sources[source_name]}/{folder}"
    for source_name in woo_source_names
    for folder in os.listdir(cf.woo_sources[source_name])
]

# All .ocr files directly inside those folders. A flat comprehension avoids the
# repeated list copying of sum(list_of_lists, []).
woo_files = [
    ocr_path
    for folder in woo_dirs
    for ocr_path in glob.glob(f"{folder}/*.ocr")
]

In [None]:
# Number of .ocr files found across all scanned folders.
len(woo_files)

### Set up llm and embed model

In [None]:
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.text_splitter import TokenTextSplitter
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext, load_index_from_storage
from llama_index.core import PromptTemplate
import tiktoken
from transformers import AutoModel, AutoTokenizer
import logging
import sys

# Send log records of level WARNING and above to stdout so they show up in the
# cell output. `force=True` replaces handlers already attached to the root
# logger, so re-running this cell does not stack handlers. No additional
# StreamHandler is added: basicConfig already installs one for stdout, and a
# second one would print every record twice.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.WARNING,  # logging.DEBUG for more verbose output
    force=True,
)


# Key and endpoint of the Azure OpenAI resource, taken from the project's secrets/settings modules.
API_KEY = sc.AZURE_OPENAI_API_KEY
RESOURCE_ENDPOINT = st.AZURE_OPENAI_ENDPOINT

# Instruction-style wrapper passed as query_wrapper_prompt to the HuggingFace LLMs in get_llm.
# NOTE(review): the template ends with "</s>" after "[/INST]" — confirm this is intended.
llama_query_wrapper = PromptTemplate("<s>[INST] {query_str} [/INST] </s>\n")

# Sampling settings passed to every LLM constructed in get_llm.
default_llm_params = {
    "do_sample": True,
    "temperature": 0.6,
    "top_k": 25,
    "top_p": 0.65
}

# Generation length limit passed to every LLM constructed in get_llm.
max_new_tokens = 256

def get_llm(model_name):
    """Build the LLM selected by `model_name` and report its context window.

    Args:
        model_name: "gpt" for the Azure OpenAI deployment, or "mistral" /
            "llama" for the HuggingFace models.

    Returns:
        Tuple `(model, context_window)`: the llama-index LLM object and the
        context window size to configure alongside it.

    Raises:
        ValueError: If `model_name` is not one of the known names.
    """
    # HuggingFace hub ids; used for both the model and its tokenizer.
    huggingface_model_ids = {
        "mistral": "mistralai/Mistral-7B-Instruct-v0.1",
        "llama": "meta-llama/Llama-2-13b-chat-hf",
    }

    if model_name == "gpt":
        model = AzureOpenAI(
            model="gpt-35-turbo",
            deployment_name="gpt-35-turbo",
            api_key=API_KEY,
            api_version="2023-05-15",
            azure_endpoint=RESOURCE_ENDPOINT,
            # max_new_tokens / do_sample / top_k / top_p are HuggingFace
            # generate() names. The OpenAI-based class takes the length limit
            # as `max_tokens` and extra request parameters through
            # `additional_kwargs`; do_sample and top_k have no OpenAI
            # counterpart and are therefore not forwarded.
            max_tokens=max_new_tokens,
            temperature=default_llm_params["temperature"],
            additional_kwargs={"top_p": default_llm_params["top_p"]},
        )
        # NOTE(review): confirm 8192 against the deployed gpt-35-turbo version.
        context_window = 8192

    elif model_name in ("mistral", "llama"):
        hub_id = huggingface_model_ids[model_name]
        # Both HuggingFace models share the same wrapper, window and sampling settings.
        context_window = 3900
        model = HuggingFaceLLM(
            model_name=hub_id,
            tokenizer_name=hub_id,
            query_wrapper_prompt=llama_query_wrapper,
            context_window=context_window,
            max_new_tokens=max_new_tokens,
            generate_kwargs=default_llm_params,
            device_map="auto",
        )

    else:
        raise ValueError(f"Unknown model {model_name}. Known models: gpt, mistral, llama")

    return model, context_window

def get_embed_model(model_name):
    """Create the embedding model registered under `model_name`.

    Supported names are "ada" (Azure OpenAI deployment) and the HuggingFace
    models "bert", "robbert", "cohere" and "me5".

    Returns:
        A `(model, chunk_size)` pair: the embedding model object and the
        chunk size to use together with it.

    Raises:
        ValueError: For any other `model_name`.
    """
    if model_name == "ada":
        embedder = AzureOpenAIEmbedding(
            model="text-embedding-ada-002",
            deployment_name="text-embedding-ada-002",
            api_key=API_KEY,
            azure_endpoint=RESOURCE_ENDPOINT,
            api_version="2023-05-15",
        )
        # Chunk sizes tried before settling on 512: 8191, 4096 and 1024 (noted as way too slow).
        return embedder, 512

    if model_name == "me5":
        hub_id = "intfloat/multilingual-e5-large-instruct"
        # Model and tokenizer are loaded explicitly and handed to the wrapper.
        e5_model = AutoModel.from_pretrained(hub_id)
        e5_tokenizer = AutoTokenizer.from_pretrained(hub_id)
        embedder = HuggingFaceEmbedding(
            model_name=hub_id,
            tokenizer_name=hub_id,
            model=e5_model,
            tokenizer=e5_tokenizer,
            max_length=512,
        )
        return embedder, 512

    # HuggingFace models that need no extra arguments: (name, hub id, chunk size).
    hub_presets = (
        ("bert", "jegormeister/bert-base-dutch-cased-snli", 512),
        ("robbert", "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers", 128),
        ("cohere", "Cohere/Cohere-embed-multilingual-v3.0", 512),
    )
    for preset_name, hub_id, preset_chunk_size in hub_presets:
        if model_name == preset_name:
            return HuggingFaceEmbedding(hub_id), preset_chunk_size

    raise ValueError(f"Unknown model {model_name}. Known models: ada, bert, robbert, cohere, me5")

# Names of the LLM and embedding model to use; must be names known to get_llm / get_embed_model.
llm_model = "gpt"
embed_model = "ada"

# Register the chosen models and their sizes on llama-index's global Settings object.
Settings.llm, Settings.context_window = get_llm(llm_model)
# Settings.embed_model, Settings.tokenizer, Settings.chunk_size = get_embed_model(embed_model)
Settings.embed_model, Settings.chunk_size = get_embed_model(embed_model)
# chunk_size and chunk_overlap are also part of the persisted index folder name further down.
Settings.chunk_overlap = 25

# Disabled: explicit token-based text splitter.
# Settings.text_splitter = TokenTextSplitter(
#     chunk_size=Settings.chunk_size,
#     tokenizer=Settings.tokenizer,
# )

In [None]:
# Show which LLM and embedding model are configured. The name is read from
# `model` when the LLM object has that attribute and from `model_name`
# otherwise. Only a missing attribute triggers the fallback; any other error
# is surfaced instead of being hidden.
try:
    print(Settings.llm.model)
except AttributeError:
    print(Settings.llm.model_name)

print(Settings.embed_model)

### Load / Create Index

In [None]:
# Can move with loading, preserving outside to manually inspect documents.
# Reads every file listed in woo_files; this runs regardless of whether the index
# is later created from these documents or loaded from disk.
documents = SimpleDirectoryReader(input_files=woo_files).load_data()

In [None]:
%%time
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)

import pathlib

# Parent folder for all persisted indices.
llama_index_folder = f"{cf.raadsinformatie_out_folder}/llama-indices"
pathlib.Path(llama_index_folder).mkdir(parents=True, exist_ok=True)

# One sub-folder per embedding configuration, derived from the folder created
# above so the two paths cannot drift apart.
index_persist_path = f"{llama_index_folder}/all_docs-{embed_model}-{Settings.chunk_size}-{Settings.chunk_overlap}"

# An existing but empty folder holds nothing to load, so it is treated the
# same as a missing one and the index is rebuilt.
index_is_persisted = os.path.isdir(index_persist_path) and len(os.listdir(index_persist_path)) > 0

if index_is_persisted:
    print("Loading index...")
    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=index_persist_path),
    )
else:
    print("Creating index...")
    index = VectorStoreIndex.from_documents(documents, show_progress=True)
    # Persist right away so the next run can load instead of re-embedding.
    index.storage_context.persist(persist_dir=index_persist_path)

In [None]:
# Query engine and stand-alone retriever over the same index, each limited to the 10 most similar nodes.
query_engine = index.as_query_engine(similarity_top_k=10)
retriever = index.as_retriever(verbose=True, similarity_top_k=10)

### Check and adjust prompts
https://github.com/run-llama/llama_index/blob/main/docs/examples/prompts/prompts_rag.ipynb

In [None]:
from IPython.display import Markdown, display

def display_prompt_dict(prompts_dict):
    """Show every prompt of `prompts_dict` in the notebook output.

    For each `key -> prompt` entry the key is rendered as Markdown, the
    prompt's template text (from `get_template()`) is printed verbatim, and a
    Markdown spacer is rendered afterwards.
    """
    for prompt_key, prompt in prompts_dict.items():
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        template_text = prompt.get_template()
        print(template_text)
        # Visual gap before the next prompt.
        display(Markdown("<br><br>"))

In [None]:
# Show the prompt templates currently used by the query engine.
display_prompt_dict(query_engine.get_prompts())

### Check Prompts

In [None]:
def check_prompt(query_engine, prompt, retriever=None, n_answers=5):
    """Print the sources and several answers for one prompt.

    Args:
        query_engine: Object with a `query(prompt)` method whose result has
            `response` and `source_nodes` attributes.
        prompt: The question to ask.
        retriever: Optional object with a `retrieve(prompt)` method; when
            given, the file paths of the retrieved nodes are printed first.
        n_answers: How many times the prompt is sent to the query engine
            (one answer is printed per call).

    Errors raised while querying are printed in place of the answer, so one
    failing prompt does not abort a loop over many prompts.
    """
    print(prompt)

    if retriever:
        print("===== Retrieved =====")
        retrieved = retriever.retrieve(prompt)
        print(10*"-")
        print("\n".join([node.metadata["file_path"] for node in retrieved]))

    # The source nodes are printed once, taken from the first successful
    # response. That response also counts as an answer, so no extra query is
    # spent just to list the sources.
    sources_printed = False

    for i in range(n_answers):
        try:
            answer = query_engine.query(prompt)
            if not sources_printed:
                source_paths = "\n".join([node.metadata["file_path"] for node in answer.source_nodes])
                print("----- Source Nodes -----")
                print(source_paths)
                sources_printed = True
            outcome = answer.response
        except Exception as e:
            outcome = e

        print(f"{10*'-'} Answer {i+1} {10*'-'}")
        print(outcome)
    


In [None]:
def check_prompt_simple(query_engine, prompt):
    """Ask `prompt` once and print the prompt, the source file paths and the answer.

    `query_engine` must provide `query(prompt)`; its result is expected to
    have `source_nodes` (each with a `metadata["file_path"]`) and `response`.
    Errors are not caught here.
    """
    print(prompt)
    result = query_engine.query(prompt)

    print("----- Source Nodes -----")
    source_paths = "\n".join(hit.metadata["file_path"] for hit in result.source_nodes)
    print(source_paths)

    print(result.response)

In [None]:
# Run every test prompt through the query engine; a line of "=" separates the prompts.
# No retriever is passed, so check_prompt skips its "Retrieved" section.
for prompt in st.TEST_PROMPTS:
    check_prompt(query_engine, prompt)
    # check_prompt_simple(query_engine, prompt)
    print(20*"=")

In [None]:
# Same check for the prompts in st.CONFUSION_PROMPTS (printed without a separator between prompts).
for prompt in st.CONFUSION_PROMPTS:
    check_prompt(query_engine, prompt)

### Streaming

In [None]:
# Build a query engine with streaming enabled, ask the last test prompt and
# print the response through print_response_stream().
streaming_query_engine = index.as_query_engine(streaming=True)
streaming_response = streaming_query_engine.query(st.TEST_PROMPTS[-1])
streaming_response.print_response_stream()

### Citation

In [None]:
from llama_index.core.query_engine import CitationQueryEngine

# Citation query engine over the same index. It uses the same top-k (10) as the
# plain query engine and the configured chunk size as citation chunk size.
citation_query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=10,
    citation_chunk_size=Settings.chunk_size,
)

In [None]:
# Show the prompt templates used by the citation query engine.
display_prompt_dict(citation_query_engine.get_prompts())

In [None]:
# Run every test prompt through the citation query engine; a line of "=" separates the prompts.
for prompt in st.TEST_PROMPTS:
    check_prompt(citation_query_engine, prompt)
    print(20*"=")


### Save retrieved per prompt

In [None]:
# Number of nodes requested per prompt; also used as the row count of a newly created comparison CSV.
similarity_top_k = 20
# NOTE(review): `search_kwargs={"score_threshold": ...}` resembles the LangChain retriever API —
# confirm that llama-index's `as_retriever` applies it; otherwise no score cut-off is in effect.
retriever_20 = index.as_retriever(verbose=True, similarity_top_k=similarity_top_k, search_kwargs={"score_threshold": 0.5})

In [None]:
# Column prefix that identifies this embedding configuration in the comparison CSVs.
experiment_name = f"llama-index-{embed_model}-{Settings.chunk_size}-{Settings.chunk_overlap}"

for ind, prompt in enumerate(st.TEST_PROMPTS):
    # One CSV per prompt; each experiment adds (or overwrites) its own columns.
    retrieval_file = Path(comparison_folder, f"retrieval_{ind}.csv")

    if retrieval_file.exists():
        df = pd.read_csv(retrieval_file, index_col=0)
    else:
        df = pd.DataFrame(index=range(similarity_top_k))

    retrieved = retriever_20.retrieve(prompt)

    # The table needs at least `similarity_top_k` rows. An existing file with
    # fewer rows is extended instead of failing on a length mismatch below.
    n_rows = max(len(df.index), similarity_top_k)
    if len(df.index) < n_rows:
        df = df.reindex(range(n_rows))

    # Pad every column up to the row count when fewer nodes were retrieved.
    missing = [None] * (n_rows - len(retrieved))

    # File paths are stored relative to the raadsinformatie input folder;
    # paths that do not start with that folder are kept unchanged.
    df[f"{experiment_name}-file"] = [node.metadata["file_path"].removeprefix(cf.raadsinformatie_in_folder) for node in retrieved] + missing
    df[f"{experiment_name}-score"] = [node.score for node in retrieved] + missing
    df[f"{experiment_name}-start"] = [node.node.start_char_idx for node in retrieved] + missing
    df[f"{experiment_name}-end"] = [node.node.end_char_idx for node in retrieved] + missing
    df[f"{experiment_name}-text"] = [node.text for node in retrieved] + missing

    df.to_csv(retrieval_file)
    

In [None]:
# Display the comparison table of the last prompt processed in the loop above.
df