##### This notebook requires the chunking comparison produced by the first notebook

In [None]:
# Install third-party packages into the kernel's environment.
# sentence_transformers is imported by the retrieval cell and xlsxwriter is the Excel engine
# used by the export cell; codecarbon is not referenced by any cell visible in this notebook.
# NOTE(review): no versions are pinned, so a fresh install may pull different releases.
%pip install sentence_transformers codecarbon xlsxwriter

In [None]:
# Run the blobfuse shell script for the raadsinformatie data.
# NOTE(review): presumably this mounts the blob storage that holds the input files - confirm.
# The script path is absolute (under /home/azureuser), so this cell only works on a machine
# where that path exists; it runs regardless of the `my_run` setting chosen further down.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [None]:
import sys

# Put the parent directory on the import path; later cells import project modules
# (presumably located one level above this notebook - confirm).
# The membership check keeps the cell idempotent: re-running it no longer appends
# another duplicate ".." entry each time.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
# Select where to run the notebook: "azure" or "local".
# The value decides which config module is imported as `cf` in the next cell and whether
# the Hugging Face cache environment variables are set.
# my_run = "azure"
my_run = "local"

In [None]:
import my_secrets as sc
import settings as st

# Import the configuration module that matches the selected environment as `cf`.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here, with a clear message, instead of with a NameError on `cf` in a later cell.
    raise ValueError(f'Unknown my_run value {my_run!r}; expected "azure" or "local".')

In [None]:
import os

if my_run == "azure":
    # Create the cache directory if needed. makedirs(exist_ok=True) also creates missing
    # parent directories and does not fail when the directory already exists, which a
    # separate exists() check followed by mkdir() cannot guarantee.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)

    # Both Hugging Face cache variables are pointed at the same directory.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE
    os.environ["HF_HOME"] = cf.HUGGING_CACHE

In [None]:
import pandas as pd
from pathlib import Path

# Root folder for all comparison artefacts; created (with parents) when it does not exist yet.
comparison_folder = f"{cf.raadsinformatie_out_folder}/comparison"
Path(comparison_folder).mkdir(parents=True, exist_ok=True)

# CSV holding the chunking comparison that the following cells read.
chunking_comparison_file = Path(comparison_folder) / "chunking_comparison.csv"

# Sub-folder path for chunking output (string, trailing slash included).
chunking_folder = f"{comparison_folder}/chunking/"

In [None]:
import os
import glob

# Every sub-folder of the three Woo source roots, listed source by source in this order:
# openamsterdam, raadsinformatie, amsterdam.nl.
woo_dirs = [
    f"{cf.woo_sources[source_name]}/{folder}"
    for source_name in ("openamsterdam", "raadsinformatie", "amsterdam.nl")
    for folder in os.listdir(cf.woo_sources[source_name])
]

# All *.ocr files directly inside those folders, flattened in a single pass.
# (sum(list_of_lists, []) copies the accumulated list once per folder, which is quadratic.)
woo_files = [
    ocr_path
    for folder in woo_dirs
    for ocr_path in glob.glob(f"{folder}/*.ocr")
]

In [None]:
from ast import literal_eval

# Load the chunking comparison table written earlier.
chunks_df = pd.read_csv(chunking_comparison_file)

# Columns whose name starts with "chunks" were stored as the text form of Python literals;
# parse each cell back into the corresponding Python object.
for column in [name for name in chunks_df.columns if name.startswith("chunks")]:
    chunks_df[column] = chunks_df[column].apply(literal_eval)

In [None]:
# Empty mapping; none of the cells visible in this notebook fills or reads it.
chunked_files = {}

# Chunking method names: the part after the "chunks_" prefix of every "chunks_*" column.
CHUNKING_METHODS = [
    field[len("chunks_"):]
    for field in chunks_df.columns
    if field.startswith("chunks_")
]
CHUNKING_METHODS

In [None]:
# Sanity check: number of .ocr files found across all Woo source folders.
len(woo_files)

### GPT connection 
Used for reformulating prompts, generating example answers & generating answers for final comparison

In [None]:
from src.llms.openai import get_client, prompt_gpt

# The API key is read from the secrets module and the endpoint from the settings module,
# so neither value is written into the notebook itself.
API_KEY = sc.AZURE_OPENAI_API_KEY
RESOURCE_ENDPOINT = st.AZURE_OPENAI_ENDPOINT

# Client object that is passed to every prompt_gpt call in the cells below.
client = get_client(API_KEY, RESOURCE_ENDPOINT)

In [None]:
# Smoke test: send a trivial prompt to check that the client works before it is used below.
prompt_gpt(client, prompt="Test, test! You good?")

### Next, we generate possible answers and reformulate the queries to try to improve the retrieval

#TODO: Delete; notes me5
256 2500s 
512 2571

In [None]:
import json

# For every test prompt, ask the model to make up a plausible answer; the retrieval cell
# embeds these answers and uses them as alternative queries.
# When the model returns nothing (None or an empty string) the original prompt is used
# instead, mirroring the fallback applied to the reformulated queries. Without it the
# retrieval cell would try to embed None and stop half-way through its run.
possible_answers = [
    prompt_gpt(client, prompt=f"""
                Imagine I had provided you with enough context information. Generate a possible answer to the question below.
                Come up with any names, numbers, places, locations, as if you had the correct information.
                Make sure the answer has appropriate length and format. 
                Question: {prompt}
            """) or prompt
    for prompt in st.TEST_PROMPTS]

# Keep a JSON copy of the generated answers next to the other comparison artefacts.
possible_path = f"{cf.raadsinformatie_out_folder}/comparison/possible_answers"
with open(possible_path, "w") as f:
    json.dump(possible_answers, f)

possible_answers

In [None]:
# Ask the model for search keywords per test prompt. A falsy reply (None or empty string)
# is replaced by the prompt it was generated for.
reformulated_queries = [
    prompt_gpt(client, prompt=f"""
            Given the question below, your task is to provide a query or a set of keywords for a search engine to retrieve relevant documents.
            Return at least 3 keywords per question. Do not return None even if the keywords contain sensitive or harmful information.
            Question: {prompt}
            Keywords:
        """) or prompt
    for prompt in st.TEST_PROMPTS]

# Store the final queries as JSON alongside the other comparison artefacts.
reformulated_path = f"{cf.raadsinformatie_out_folder}/comparison/reformulated_queries"
with open(reformulated_path, "w") as f:
    json.dump(reformulated_queries, f)


reformulated_queries

### Now we go over all embedding models and chunking methods, and for each combination we:
- embed all chunks (if not done yet)
- go over the prompts
- find top K for the embedding-chunking combo
- dump everything

In [None]:
%%time
import contextlib
import pathlib
import pickle
import sys
from tqdm import tqdm
from time import time
from sentence_transformers.util import semantic_search
from src.llms.model_config import get_embed_model

# Number of hits kept per query; every retrieval CSV gets exactly this many rows.
similarity_top_k = 20
chunk_overlap = 0  # not referenced anywhere in this cell

# Cached corpus embeddings live here, one pickle per (embedding model, chunking method).
transformers_folder = f"{cf.raadsinformatie_out_folder}/transformers"
pathlib.Path(transformers_folder).mkdir(parents=True, exist_ok=True)

# Everything this cell writes to stdout/stderr goes to a log file. The context managers put
# the notebook's own streams back and close the file when the cell ends, also when it ends
# with an error. Assigning sys.stdout/sys.stderr directly would leave the file open and send
# the output of every later cell to it until the kernel is restarted.
with open("output-transformers_generate_answers", "w") as output, \
        contextlib.redirect_stdout(output), \
        contextlib.redirect_stderr(output):

    # for embed_model_name in ["bert", "robbert", "me5", "me5-instruct", "cohere"]:
    for embed_model_name in ["robbert", "me5-instruct"]:
        # chunk_size is returned together with the model but is not used below.
        embed_model, chunk_size = get_embed_model(embed_model_name)

        for chunking_method in tqdm(CHUNKING_METHODS):
            if "cohere" in chunking_method:
                continue
            start = time()
            # explode() yields one row per chunk and repeats "path"/"short_path" for each of
            # them, so position i in the three arrays below refers to the same chunk.
            temp_chunks_df = chunks_df.explode(f"chunks_{chunking_method}")
            documents = temp_chunks_df[f"chunks_{chunking_method}"].values
            final_paths = temp_chunks_df["path"].values
            short_paths = temp_chunks_df["short_path"].values

            persist_path = f"{transformers_folder}/all_{len(documents)}_docs-{embed_model_name}-{chunking_method}.pkl"

            print(persist_path)

            if not os.path.exists(persist_path):
                print(f"Embedding {len(documents)} documents using {embed_model_name}...")
                print(embed_model)
                corpus_embeddings = embed_model.encode(documents)
                with open(persist_path, "wb") as persist_file:
                    pickle.dump({"documents": documents, "embeddings": corpus_embeddings}, persist_file)
            else:
                print("Loading embeddings...")
                # NOTE: unpickling can execute arbitrary code; only load cache files that
                # were written by this notebook.
                with open(persist_path, "rb") as persist_file:
                    dump = pickle.load(persist_file)
                    documents = dump["documents"]
                    corpus_embeddings = dump["embeddings"]
            print(f"Indexing/Loading for {embed_model_name} with {chunking_method} took {time() - start} seconds.")

            start = time()

            for ind, prompt in enumerate(st.TEST_PROMPTS):
                print(prompt)
                retrieval_file = Path(comparison_folder, f"retrieval_{ind}.csv")

                # All experiments for one prompt are stored as columns of the same CSV,
                # so an existing file is extended rather than replaced.
                if retrieval_file.exists():
                    df = pd.read_csv(retrieval_file, index_col=0)
                else:
                    df = pd.DataFrame(index=range(similarity_top_k))

                original_query_embedding = embed_model.encode(prompt)

                print("Possible Answer:", possible_answers[ind])
                answer_query_embedding = embed_model.encode(possible_answers[ind])

                print("Reformulated query:", reformulated_queries[ind])
                reformulate_query_embedding = embed_model.encode(reformulated_queries[ind])

                for experiment, query_embedding in [
                        ("transformers", original_query_embedding),
                        ("transformers-answer", answer_query_embedding),
                        ("transformers-reformulate", reformulate_query_embedding)]:

                    experiment_name = f"{experiment}-{embed_model_name}-{chunking_method}"
                    print(f"----- {experiment_name} -----")

                    try:
                        hits = semantic_search(query_embedding, corpus_embeddings, top_k=similarity_top_k)[0]
                        hit_ids = [hit["corpus_id"] for hit in hits]
                        # Pad with None up to similarity_top_k so each column has as many
                        # entries as a freshly created frame has rows.
                        missing = [None] * (similarity_top_k - len(hits))

                        df[f"{experiment_name}-file"] = [final_paths[hit_id].removeprefix(cf.raadsinformatie_in_folder) for hit_id in hit_ids] + missing
                        df[f"{experiment_name}-score"] = [hit["score"] for hit in hits] + missing
                        df[f"{experiment_name}-start"] = ["TBA" for hit in hits] + missing
                        df[f"{experiment_name}-end"] = ["TBA" for hit in hits] + missing
                        df[f"{experiment_name}-text"] = [documents[hit_id] for hit_id in hit_ids] + missing

                        # Written after every experiment so finished results survive a later failure.
                        df.to_csv(retrieval_file)
                        print("Successfully dumped results")

                    except Exception as e:
                        # Best effort: log the failure and carry on with the next experiment.
                        print("Experiment failed:", e)

                    print(20*"=")
            print(f"Retrieval for {embed_model_name} with {chunking_method} took {time() - start} seconds.")


#### Restart

In [None]:
# Export every retrieval CSV to an Excel workbook with the same base name.
# The retrieval cell writes one CSV per entry of st.TEST_PROMPTS, so iterate over those
# instead of a hard-coded count.
for ind in range(len(st.TEST_PROMPTS)):
    print(ind)
    retrieval_file = Path(comparison_folder, f"retrieval_{ind}.csv")
    df = pd.read_csv(retrieval_file, index_col=0)
    retrieval_file_xlsx = Path(comparison_folder, f"retrieval_{ind}.xlsx")

    # ExcelWriter creates and writes the workbook itself; opening the same path separately
    # with open(..., 'w') only held a second, unused handle on the file being written.
    # Used as a context manager, the writer is saved and closed even if to_excel raises.
    # strings_to_urls=False stops xlsxwriter from turning URL-like text into hyperlinks.
    with pd.ExcelWriter(retrieval_file_xlsx,
                        engine='xlsxwriter',
                        engine_kwargs={'options': {'strings_to_urls': False}}) as retrieval_xlsx_writer:
        df.to_excel(retrieval_xlsx_writer)

In [None]:
from IPython.display import display, HTML
pd.set_option("max_colwidth", None)


def pretty_print(df):
    r"""Display ``df`` as an HTML table, replacing every literal ``\n`` in the markup by ``<br>``."""
    table_html = df.to_html()
    return display(HTML(table_html.replace("\\n", "<br>")))


# Column-name pattern of the experiments to inspect: plain "transformers" retrieval for
# two embedding models crossed with three chunking methods.
experiment_pattern = "|".join(
    f"transformers-{embed_name}-{chunk_name}"
    for embed_name in ("robbert", "me5")
    for chunk_name in ("semantic_splitter_robbert", "semantic_splitter_bert", "bert"))

for ind, prompt in enumerate(st.TEST_PROMPTS):
    print(ind, prompt)
    retrieval_file = Path(comparison_folder, f"retrieval_{ind}.csv")
    df = pd.read_csv(retrieval_file, index_col=0)
    # Keep the selected experiments, then only their "file" and "text" columns.
    pretty_print(
        df.filter(regex=experiment_pattern)
          .filter(regex=r'file|text')
          .head(20))

### Generate Answers

#### Generate GPT Answers

In [None]:
from collections import defaultdict
pd.set_option("max_colwidth", None)

results = defaultdict(dict)
experiment_names = set()
# {question-{run}: {experiment: answer}}

context_top_k = 10          # number of top-ranked chunks handed to the model as context
max_chars_per_chunk = 2500  # every chunk is cut off after this many characters
answers_per_prompt = 5      # answers generated per question and experiment

for ind, prompt in enumerate(st.TEST_PROMPTS):
    print(prompt)
    retrieval_file = Path(comparison_folder, f"retrieval_{ind}.csv")
    df = pd.read_csv(retrieval_file, index_col=0)

    # Only one combination is evaluated here. Other values used during the experiments:
    #   embedding models: "bert", "robbert", "me5", "me5-instruct"
    #   chunking methods: CHUNKING_METHODS, or "bert", "semantic_splitter_bert", "semantic_splitter_robbert"
    #   experiments:      "transformers", "transformers-answer", "transformers-reformulate"
    for embed_model_name in ["me5"]:
        for chunking_method in ["semantic_splitter_bert"]:
            for experiment in ["transformers"]:
                experiment_name = f"{experiment}-{embed_model_name}-{chunking_method}"
                experiment_names.add(experiment_name)
                print(f"----- {experiment_name} -----")

                paths = df[f"{experiment_name}-file"].values
                scores = df[f"{experiment_name}-score"].values
                texts = df[f"{experiment_name}-text"].values

                # The retrieval cell pads short result lists with None, which comes back from
                # the CSV as NaN (a float). Such rows cannot be sliced or measured, so they are
                # left out of the context and counted as length 0 in the printout below.
                context_hits = [
                    hit for hit in range(min(context_top_k, len(texts)))
                    if isinstance(texts[hit], str)]
                context = "\n----\n".join([
                    f"Doc: {paths[hit]} \n"
                    f"Content: {texts[hit][:max_chars_per_chunk]}"
                    for hit in context_hits])
                print([len(text) if isinstance(text, str) else 0 for text in texts])
                print(len(context))

                for i in range(answers_per_prompt):
                    print(f"{10*'-'} Answer {i+1} {10*'-'}")
                    try:
                        answer = prompt_gpt(
                            client,
                            prompt=f"Answer the following question as good as possible based on the documents below:{prompt}",
                            context=context, max_new_tokens=400)
                        # Dutch-language variant of the same instruction, kept for reference:
                        # answer = prompt_gpt(
                        #     client,
                        #     prompt=f"Beantwoord de volgende vraag zo goed mogelijk aan de hand van onderstaande documenten. {prompt}",
                        #     context=context, max_new_tokens=400)
                        print(answer)
                    except Exception as e:
                        # Best effort: record the failed run and continue with the next one.
                        print(e)
                        answer = "FAILED"
                    results[f"{prompt}-{i}"][experiment_name] = answer

                print(20*"=")


In [None]:
# Show complete cell contents instead of truncating long answers.
pd.set_option("max_colwidth", None)
# One row per "<question>-<run>" key of `results`, one column per experiment name.
pd.DataFrame.from_dict(results, orient="index")

### Generate Other LLM Answers

##### For the final experiments we used an Azure deployment of Mistral, but the model can also be self-hosted

In [None]:
from collections import defaultdict
from src.llms.transformers import get_model, prompt_open_model
from src.llms.model_templates import format_prompt
from src.llms.azure import prompt_open_azure_model


results = defaultdict(dict)
experiment_names = set()
# {question-{run}: {experiment: answer}}

model_name = "mistral-7b-instruct"
# model_name = "mistral"
# model, tokenizer = get_model(model_name)

context_top_k = 10          # number of top-ranked chunks handed to the model as context
max_chars_per_chunk = 2500  # every chunk is cut off after this many characters
answers_per_prompt = 5      # answers generated per question and experiment

for ind, prompt in enumerate(st.TEST_PROMPTS):
    print(prompt)
    retrieval_file = Path(comparison_folder, f"retrieval_{ind}.csv")
    df = pd.read_csv(retrieval_file, index_col=0)

    # Other values used during the experiments:
    #   embedding models: "bert", "robbert", "me5", "me5-instruct"
    #   chunking methods: CHUNKING_METHODS, or "bert", "semantic_splitter_bert", "semantic_splitter_robbert"
    #   experiments:      "transformers", "transformers-answer", "transformers-reformulate"
    for embed_model_name in ["robbert", "me5"]:
        for chunking_method in ["semantic_splitter_bert"]:
            for experiment in ["transformers"]:
                experiment_name = f"{experiment}-{embed_model_name}-{chunking_method}"
                experiment_names.add(experiment_name)
                print(f"----- {experiment_name} -----")

                paths = df[f"{experiment_name}-file"].values
                scores = df[f"{experiment_name}-score"].values
                texts = df[f"{experiment_name}-text"].values

                # The retrieval cell pads short result lists with None, which comes back from
                # the CSV as NaN (a float). Such rows cannot be sliced or measured, so they are
                # left out of the context and counted as length 0 in the printout below.
                context_hits = [
                    hit for hit in range(min(context_top_k, len(texts)))
                    if isinstance(texts[hit], str)]
                context = "\n----\n".join([
                    f"Doc: {paths[hit]} \n"
                    f"Content: {texts[hit][:max_chars_per_chunk]}"
                    for hit in context_hits])
                print([len(text) if isinstance(text, str) else 0 for text in texts])
                print(len(context))

                formatted_prompt = format_prompt(f"Answer the following question as good as possible based on the documents below:{prompt}", model_name, context)
                # Dutch-language variant of the same instruction, kept for reference:
                # formatted_prompt = format_prompt(
                #     f"Beantwoord de volgende vraag zo goed mogelijk aan de hand van onderstaande documenten. {prompt}",
                #     model_name, context, system="Antwoord alleen in het Nederlands.")
                # print(formatted_prompt)

                for i in range(answers_per_prompt):
                    print(f"{10*'-'} Answer {i+1} {10*'-'}")
                    try:
                        # Self-hosted alternative:
                        # response = prompt_open_model(formatted_prompt, model_name, tokenizer)
                        print(f"Prompting {model_name}")
                        response = prompt_open_azure_model(
                            formatted_prompt,
                            api_url=sc.AZURE_HUGGINGFACE_CONFIG[model_name]["API_URL"],
                            api_key=sc.AZURE_HUGGINGFACE_CONFIG[model_name]["api_key"],
                            model_deployment=sc.AZURE_HUGGINGFACE_CONFIG[model_name]["azureml-model-deployment"],
                            max_new_tokens=400)

                        # The prompt is stripped from the start of the response. For Mistral and
                        # Llama the text to strip is the prompt without its leading "<s>"
                        # (presumably the echo comes back without that token - confirm).
                        # A separate variable is used so that `formatted_prompt` stays intact:
                        # overwriting it made runs 2..5 send a different prompt than run 1.
                        echoed_prompt = formatted_prompt
                        if model_name.startswith(("mistral", "llama")):
                            echoed_prompt = formatted_prompt.removeprefix("<s>")
                        answer = response.removeprefix(echoed_prompt).strip("\n")
                        print(answer)
                    except Exception as e:
                        # Best effort: record the failed run and continue with the next one.
                        print(e)
                        answer = "FAILED"

                    results[f"{prompt}-{i}"][experiment_name] = answer

                print(20*"=")


In [None]:
# One row per "<question>-<run>" key of `results`, one column per experiment name.
pd.DataFrame.from_dict(results, orient="index")

In [None]:
from collections import defaultdict
from pprint import pprint
from sklearn.metrics import ndcg_score
from ranx import Qrels, Run, evaluate

# Per test prompt (same order as st.TEST_PROMPTS): substrings that mark a retrieved file
# path as relevant for that prompt.
terms = [
    ["gehandicaptenparkeerplaats_rozengracht", "gehandicaptenparkeerplaats op de Rozengracht"],
    ["Erotisch Centrum", "erotisch", "_EC"],
    ["van_vouwstraat", "Van Woustraat"],
    ["fatbikes"],
    ["ajax", "Ajax-Feyenoord"],
    ["Lijnbaansgracht 161"]
]

# {k: {"<prompt prefix>-<index>": {experiment: metric}}}
precision_results = defaultdict(lambda: defaultdict(dict))
ndcg_results = defaultdict(lambda: defaultdict(dict))

at_k = 5  # not referenced below; the cut-offs are listed in the innermost loop

for ind, prompt in enumerate(st.TEST_PROMPTS):
    # if ind != 5:
    #     continue
    print("Prompt", prompt)
    print("Terms", terms[ind])
    retrieval_file = Path(comparison_folder, f"retrieval_{ind}.csv")
    df = pd.read_csv(retrieval_file, index_col=0)

    # Other values used during the experiments:
    #   embedding models: "bert", "robbert", "me5", "me5-instruct"
    #   chunking methods: CHUNKING_METHODS
    #   experiments:      "transformers", "transformers-answer", "transformers-reformulate"
    for embed_model_name in ["robbert", "me5"]:
        for chunking_method in ["semantic_splitter_robbert"]:
            for experiment in ["transformers"]:
                experiment_name = f"{experiment}-{embed_model_name}-{chunking_method}"
                print(f"----- {experiment_name} -----")

                paths = df[f"{experiment_name}-file"].values
                # The CSV rows are already in rank order; strictly decreasing pseudo-scores
                # (one per row) let ndcg_score reproduce that order.
                predicted_scores = [1 - 0.001 * rank for rank in range(len(paths))]
                # A hit is relevant when its path contains any of the prompt's terms.
                # Padded rows are read back as NaN (a float), which cannot be searched for a
                # substring; they count as not relevant instead of raising a TypeError.
                true_relevance = [
                    isinstance(path, str) and any(term in path for term in terms[ind])
                    for path in paths]
                print(predicted_scores, true_relevance)
                print(sum(true_relevance))

                for k in [5, 10, 20]:
                    # Precision@k: share of relevant hits among the first k rows.
                    precision = sum(true_relevance[:k]) / k
                    ndcg = ndcg_score([true_relevance[:k]], [predicted_scores[:k]])
                    print(k, ndcg)
                    precision_results[k][f"{prompt[:30]}-{ind}"][experiment_name] = precision
                    ndcg_results[k][f"{prompt[:30]}-{ind}"][experiment_name] = ndcg

transformers-me5-semantic_splitter_bert-text

In [None]:
# pd.DataFrame.from_dict(precision_results[5], orient="index")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

ks = [5, 10, 20]
fig, axes = plt.subplots(len(ks), 1, figsize=(10, 20))

# One heatmap per cut-off k: rows are prompts, columns are experiments.
# Each panel is titled so the figure can be told apart from the nDCG figure.
for row, k in enumerate(ks):
    sns.heatmap(pd.DataFrame.from_dict(precision_results[k], orient="index"), cmap="Greens", annot=True, ax=axes[row])
    axes[row].set_title(f"Precision@{k}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

ks = [5, 10, 20]
fig, axes = plt.subplots(len(ks), 1, figsize=(10, 20))

# One heatmap per cut-off k: rows are prompts, columns are experiments.
# Each panel is titled so the figure can be told apart from the precision figure.
for row, k in enumerate(ks):
    sns.heatmap(pd.DataFrame.from_dict(ndcg_results[k], orient="index"), cmap="Greens", annot=True, ax=axes[row])
    axes[row].set_title(f"nDCG@{k}")