In [None]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.retrievers.bm25 import BM25Retriever

import pandas as pd
from tqdm import tqdm

In [None]:
# Read both PDFs with SimpleDirectoryReader; the loaded result is kept in
# `docs` and chunked into nodes in the next cell.
docs = SimpleDirectoryReader(input_files=["carotis.pdf", "schlaganfall.pdf"]).load_data()

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Split the loaded documents into nodes with a chunk size of 1024.
# NOTE(review): chunk_size is presumably measured in tokens — confirm against
# the SentenceSplitter documentation.
splitter = SentenceSplitter(chunk_size=1024)

# `nodes` is the shared input for every retriever/index built below.
nodes = splitter.get_nodes_from_documents(docs)

print(f"Created {len(nodes)} nodes from the documents.")

In [None]:
# Load one evaluation set per guideline and tag every row with the guideline
# it belongs to.
# NOTE(review): both reads use the same path 'xxx.csv' — looks like a
# placeholder; confirm the real file names.
carotis_df = pd.read_csv('xxx.csv').assign(leitlinie="carotis")
stroke_df = pd.read_csv('xxx.csv').assign(leitlinie="schlaganfall")

# Prefix ids and pages ("c_" / "s_") so values from the two sets stay
# distinguishable after concatenation.
for frame, prefix in ((carotis_df, "c_"), (stroke_df, "s_")):
    for column in ("id", "page"):
        frame[column] = prefix + frame[column].astype(str)

# Combined evaluation set; the original row labels of each frame are kept.
total_eval_df = pd.concat([carotis_df, stroke_df])
total_eval_df.head()

In [None]:
# Number of evaluation rows across both concatenated sets.
len(total_eval_df)

In [None]:
def test_retrieval(retriever, eval_df=None, file_suffix="_word.pdf"):
    """Measure how often a retriever returns the expected guideline and page.

    Each row's ``example_questions`` value is passed to ``retriever.retrieve``.
    A *page hit* is counted when any returned node has a ``page_label``
    metadata value equal to ``row["page"]`` with its prefix (everything up to
    the first underscore) removed. A *Leitlinien hit* is counted when any
    returned node has a ``file_name`` metadata value equal to
    ``f"{row['leitlinie']}{file_suffix}"``.

    Args:
        retriever: Object exposing ``retrieve(question)`` that returns an
            iterable of nodes with a ``metadata`` mapping.
        eval_df: Evaluation frame with the columns ``example_questions``,
            ``page`` and ``leitlinie``. Defaults to the notebook-level
            ``total_eval_df``.
        file_suffix: Suffix appended to the ``leitlinie`` value to form the
            expected ``file_name``.

    Returns:
        dict with ``page_hits``, ``page_hit_rate``, ``leitlinien_hits`` and
        ``leitlinien_hit_rate`` (rates are percentages of ``len(eval_df)``).

    If a question raises, the row and the error are printed and the
    evaluation stops; the rates are then still computed against the full
    number of rows and a warning says so.
    """
    if eval_df is None:
        eval_df = total_eval_df

    leitlinien_hits = 0
    page_hits = 0
    processed = 0
    total = len(eval_df)

    for _, row in tqdm(eval_df.iterrows(), total=total, desc="Processing questions"):
        question = row['example_questions']

        try:
            # Strip only the "c_" / "s_" style prefix; maxsplit keeps page
            # values that themselves contain an underscore intact.
            expected_page = row["page"].split("_", 1)[1]
            expected_file = f"{row['leitlinie']}{file_suffix}"

            retrieved_nodes = retriever.retrieve(question)

            # NOTE(review): the page check does not look at which file the
            # node came from, so an equal page label in the other guideline
            # also counts as a page hit — confirm this is intended.
            page_found = any(
                node.metadata.get('page_label') == expected_page
                for node in retrieved_nodes
            )
            # NOTE(review): the notebook loads "carotis.pdf" and
            # "schlaganfall.pdf", while the default suffix expects
            # "<leitlinie>_word.pdf" — confirm the real file names.
            leitlinie_found = any(
                node.metadata.get('file_name') == expected_file
                for node in retrieved_nodes
            )
        except Exception as exc:
            # Report what failed instead of discarding the error, then stop.
            print(row)
            print(f"Retrieval failed after {processed} of {total} questions: "
                  f"{type(exc).__name__}: {exc}")
            break
        else:
            processed += 1
            if leitlinie_found:
                leitlinien_hits += 1
            if page_found:
                page_hits += 1

    if processed < total:
        print(f"WARNING: only {processed} of {total} questions were evaluated; "
              "hit rates below still use the full total as denominator.")

    # Guard the empty evaluation set instead of raising ZeroDivisionError.
    leitlinien_hit_rate = (leitlinien_hits / total) * 100 if total else 0.0
    page_hit_rate = (page_hits / total) * 100 if total else 0.0

    print(f"Total Page hits: {page_hits}")
    print(f"Page Hit rate: {page_hit_rate:.2f}%")
    print(f"Total Leitlinien hits: {leitlinien_hits}")
    print(f"Leitlinien Hit rate: {leitlinien_hit_rate:.2f}%")

    return {
        "page_hits": page_hits,
        "page_hit_rate": page_hit_rate,
        "leitlinien_hits": leitlinien_hits,
        "leitlinien_hit_rate": leitlinien_hit_rate
    }


## Vanilla BM25

In [None]:
# BM25 retriever over the chunked nodes, configured with similarity_top_k=5
# like every other retriever in this notebook.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,
)

In [None]:
# Hit counts and hit rates for BM25 (dict returned by test_retrieval).
bm25_results = test_retrieval(bm25_retriever)

## OpenAI Embeddings (text-embedding-3-small)

In [None]:
import os 

from dotenv import load_dotenv

# Load variables from a .env file into the environment.
# NOTE(review): presumably this provides the OpenAI API key used by
# OpenAIEmbedding below — confirm.
load_dotenv()

from llama_index.embeddings.openai import OpenAIEmbedding

In [None]:
# Switch the global embedding model to text-embedding-3-small.
# NOTE(review): the index in the next cell presumably picks this up from
# Settings, so the cells must be run in order — confirm.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [None]:
# Build a vector index over the shared nodes, with a progress display.
openai_small_index = VectorStoreIndex(nodes=nodes, show_progress=True)

In [None]:
# Retriever over the small-model index, configured with similarity_top_k=5.
openai_retriever = openai_small_index.as_retriever(similarity_top_k=5)

In [None]:
# Hit counts and hit rates for text-embedding-3-small.
openai_small_results = test_retrieval(openai_retriever)

In [None]:
# Persist the index's storage context to disk so it can be reused later.
openai_small_index.storage_context.persist(persist_dir="data/internal/openai_small_index")

## OpenAI Embeddings (text-embedding-3-large)

In [None]:
# Switch the global embedding model to text-embedding-3-large.
# NOTE(review): the index in the next cell presumably picks this up from
# Settings, so the cells must be run in order — confirm.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")

In [None]:
# Build a vector index over the same shared nodes, with a progress display.
openai_large_index = VectorStoreIndex(nodes=nodes, show_progress=True)

In [None]:
# Retriever over the large-model index, configured with similarity_top_k=5.
openai_large_retriever = openai_large_index.as_retriever(similarity_top_k=5)

In [None]:
# Hit counts and hit rates for text-embedding-3-large.
# The name `open_ai_large_results` is what the summary cell reads later.
open_ai_large_results = test_retrieval(openai_large_retriever)

In [None]:
# Persist the index's storage context to disk so it can be reused later.
openai_large_index.storage_context.persist(persist_dir="data/internal/openai_large_index")

## Sentence Transformers (sentence-transformers/distiluse-base-multilingual-cased-v1)

In [None]:
# HuggingFaceEmbedding is already imported in the first cell; repeated here.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Switch the global embedding model to the multilingual distiluse model.
Settings.embed_model = HuggingFaceEmbedding("sentence-transformers/distiluse-base-multilingual-cased-v1")

In [None]:
# Build a vector index over the same shared nodes, with a progress display.
hf1_index = VectorStoreIndex(nodes=nodes, show_progress=True)

In [None]:
# Retriever over the HF1 index, configured with similarity_top_k=5.
hf1_retriever = hf1_index.as_retriever(similarity_top_k=5)

In [None]:
# Hit counts and hit rates for the distiluse model.
hf1_results = test_retrieval(hf1_retriever)

In [None]:
# Persist the index's storage context to disk so it can be reused later.
hf1_index.storage_context.persist('data/internal/hf1')

## Sentence Transformers (BAAI/bge-m3)

In [None]:
# Switch the global embedding model to BAAI/bge-m3.
Settings.embed_model = HuggingFaceEmbedding("BAAI/bge-m3")

In [None]:
# Build a vector index over the same shared nodes, with a progress display.
hf2_index = VectorStoreIndex(nodes=nodes, show_progress=True)

In [None]:
# Retriever over the HF2 index, configured with similarity_top_k=5.
hf2_retriever = hf2_index.as_retriever(similarity_top_k=5)

In [None]:
# Hit counts and hit rates for BAAI/bge-m3.
# The name `h2_results` (not `hf2_results`) is what the summary cell reads.
h2_results = test_retrieval(hf2_retriever)

In [None]:
# Persist the index's storage context to disk so it can be reused later.
hf2_index.storage_context.persist('data/internal/hf2')

In [None]:
# Collect the per-retriever result dicts into one frame: one column per
# retriever, one row per metric key returned by test_retrieval.
retrieval_results = pd.DataFrame({
    "BM25": bm25_results,
    "OpenAI Small": openai_small_results,
    "OpenAI Large": open_ai_large_results,
    "HF1": hf1_results,
    "HF2": h2_results
})

In [None]:
# Reorient to one row per retriever and one column per metric.
# Guarded so that re-running this cell does not flip the frame back: once the
# metric names are columns, the transpose is skipped.
if "page_hits" not in retrieval_results.columns:
    retrieval_results = retrieval_results.transpose()

retrieval_results

In [None]:
import pickle

In [None]:
# Cache the results frame on disk so the statistics below can be re-run
# without repeating the retrieval evaluation.
with open('retrieval_resuts.pickle', 'wb') as handle:
    pickle.dump(retrieval_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cached results frame written by the previous cell.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open('retrieval_resuts.pickle', 'rb') as handle:
    retrieval_results = pickle.load(handle)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

## Confidence intervals for proportions (Wilson method)

In [None]:
from statsmodels.stats.proportion import proportion_confint

# Number of observations behind every hit count.
# NOTE(review): hard-coded; presumably equals len(total_eval_df) — confirm.
n = 384

# 95% Wilson intervals (alpha=0.05) for the page-hit proportion of each model.
wilson_bounds = proportion_confint(
    count=retrieval_results.page_hits.astype(int),
    nobs=[n] * len(retrieval_results),
    alpha=0.05,
    method='wilson',
)

# Convert both bounds from proportions to percentages to match page_hit_rate.
ci_lower, ci_upper = (bound * 100 for bound in wilson_bounds)

ci_upper, ci_lower

In [None]:
# Copy the index (retriever names) into a regular column so it can be used
# as the x variable in the bar plot and as lookup keys further down.
retrieval_results["model"] = retrieval_results.index.values
retrieval_results

In [None]:
from matplotlib.container import ErrorbarContainer

# Asymmetric error-bar lengths: distance from each bar's hit rate down to the
# lower CI bound (first row) and up to the upper CI bound (second row).
errors = pd.DataFrame({
    'lower': retrieval_results['page_hit_rate'] - ci_lower,
    'upper': ci_upper - retrieval_results['page_hit_rate']
}).T.values

with sns.axes_style("white"):
    plt.figure(figsize=(6, 4))
    plt.xticks(rotation=45, ha='right')
    
    # One bar per model showing its page hit rate.
    ax = sns.barplot(data=retrieval_results, x="model", y="page_hit_rate", capsize=1, 
                    err_kws={'linewidth': 1}, color="darkcyan")
    
    # Draw the Wilson intervals manually on top of the bars.
    ax.errorbar(x=range(len(retrieval_results)), y=retrieval_results['page_hit_rate'],
               yerr=errors, fmt='none', color='black', 
               capsize=2, linewidth=0.8)

    # Label only the bar containers; the error-bar container is skipped.
    for container in ax.containers:
        if not isinstance(container, ErrorbarContainer):
            ax.bar_label(container, fmt=lambda x: f'{x:.1f}%', padding=15)

    ax.set_ylim(0,100)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    
    ax.set_xlabel("Model")
    ax.set_ylabel("Page Hit Rate (%)")


In [None]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
from statsmodels.stats.multitest import multipletests

# Number of observations per model.
# NOTE(review): hard-coded; presumably equals len(total_eval_df) — confirm.
total_samples = 384

# Page-hit count per model, keyed by the model name.
models = {
    model: retrieval_results.loc[model, "page_hits"].astype(int)
    for model in retrieval_results.model.values
}
model_names = list(models.keys())

print("Model Performance Statistics:")
for model in model_names:
    hits = models[model]
    accuracy = (hits / total_samples) * 100
    print(f"{model}: {hits}/{total_samples} ({accuracy:.2f}%)")

print("\nPairwise Statistical Comparisons:")
p_values = []
comparisons = []
z_scores = []

# Two-proportion z-test for every unordered pair of models.
# NOTE(review): this test treats the two samples as independent, but every
# model is scored on the same evaluation questions — a paired test may be
# more appropriate; confirm.
for i, model1 in enumerate(model_names):
    for model2 in model_names[i + 1:]:
        count = np.array([models[model1], models[model2]])
        nobs = np.array([total_samples, total_samples])

        z_stat, p_value = proportions_ztest(count, nobs)

        comparisons.append(f"{model1} vs {model2}")
        z_scores.append(z_stat)
        p_values.append(p_value)

# Bonferroni correction across all pairwise comparisons.
rejected, p_values_corrected, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

for k, comp in enumerate(comparisons):
    print(f"\n{comp}:")
    print(f"Z-score: {z_scores[k]:.3f}")
    print(f"Uncorrected P-value: {p_values[k]:.4f}")
    print(f"Bonferroni-corrected P-value: {p_values_corrected[k]:.4f}")
    print(f"Significant at α=0.05 (after Bonferroni correction): {'YES' if rejected[k] else 'NO'}")

# 95% Wilson intervals, computed once per model and reused for both the
# printed report and the summary frame. Bounds are stored as percentages.
print("\nConfidence Intervals (95%):")
ci_lower = []
ci_upper = []
for model, hits in models.items():
    lower, upper = proportion_confint(
        count=hits,
        nobs=total_samples,
        alpha=0.05,
        method='wilson'
    )
    accuracy = (hits / total_samples) * 100
    ci_lower.append(lower * 100)
    ci_upper.append(upper * 100)
    print(f"{model}: {accuracy:.2f}% [{ci_lower[-1]:.2f}%, {ci_upper[-1]:.2f}%]")

results_df = pd.DataFrame({
    'Hits': models.values(),
    'Total': total_samples,
    'Accuracy': [hits/total_samples * 100 for hits in models.values()],
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper,
}, index=models.keys())

print("\nSummary DataFrame:")
print(results_df)

In [None]:
import numpy as np
from scipy import stats


def _outcome_vector(hits, total=384):
    """Return a 0/1 float array: `hits` ones followed by `total - hits` zeros."""
    return np.concatenate([np.ones(hits), np.zeros(total - hits)])


# Binary outcome vectors rebuilt from each model's hard-coded hit count
# out of 384 observations.
# NOTE(review): the counts are literals; presumably they mirror the page_hits
# column of retrieval_results — confirm they are still current.
bm25 = _outcome_vector(315)
openai_small = _outcome_vector(269)
openai_large = _outcome_vector(287)
hf1 = _outcome_vector(131)
hf2 = _outcome_vector(301)

# Kruskal-Wallis H-test across the five outcome vectors.
groups = (bm25, openai_small, openai_large, hf1, hf2)
h_stat, p_value = stats.kruskal(*groups)

print(f"H-statistic: {h_stat}\nP-value: {p_value}")