# Tutorial Retrieval Recency Bias



#### Imports

In [None]:
import pandas as pd
import os
import re
from dotenv import load_dotenv
from unbiasai.config import DATA_DIR
from unbiasai.utils import initialize_llm, generate_embeddings, insert_documents, retrieve, get_documents_from_supabase, convert_to_doc_objects, create_reranking_prompt, perform_llm_reranking, format_results, extract_created_datetime
from supabase import create_client, Client
from unbiasai.connection import create_supabase_client
from dtsc_queries.retrieval_recency import test_queries
from langchain.schema import SystemMessage, HumanMessage
from datetime import datetime

from unbiasai.config import ENVFILE
load_dotenv(ENVFILE)

## 1. Load data
Set the path and read your CSV data.

In [None]:
# Locate the recency dataset inside the project's data directory and load it
# into a DataFrame.
file_path = DATA_DIR.joinpath('dataset_retrieval_recency.csv')
df = pd.read_csv(file_path)

Optional: create a subset of the data for a quick test run.

In [None]:
# Uncomment the next line to keep only the first 15 rows of the dataset.
# df = df.head(15)

Define the LLMs to test and initialize them. This requires your own API keys.

In [None]:
# Names of the models to evaluate in this run.
# "claude" was part of the original full list but is excluded here; add it
# back to the list to include it.
models = ["gpt", "mistral", "cohere", "deepseek"]

# Maps each model name to the object returned by initialize_llm for it.
# Filled one entry at a time so that, if one model fails to initialize, the
# models initialized before it remain available for inspection.
initialized_models = {}

for model_name in models:
    initialized_models[model_name] = initialize_llm(model_name)

## 2. Connect to Supabase and create a Vector Store
Connect to Supabase

In [None]:
# Read the Supabase credentials from the process environment (the env file is
# loaded with load_dotenv at the top of the notebook). os.getenv returns None
# when a variable is not set.
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

# Client handle passed to the insert and retrieval helpers below.
supabase_client: Client = create_client(url, key)

**Note:** If the embeddings are already stored in Supabase, skip ahead to section 3.

Apply the `generate_embeddings` function to the `content` column of your data.

In [None]:
# Call generate_embeddings on every value of the 'content' column and store
# the results in a new 'embedding' column.
df['embedding'] = df['content'].apply(generate_embeddings)

Insert the DataFrame, including the embeddings, into the Supabase table to create a vector store.

In [None]:
# Send the rows of df to Supabase through the client created above.
insert_documents(df, supabase_client)
# TODO: change insert_documents so that the Supabase table name can be changed.

## 3. Define your Test Queries

In [None]:
# Template: replace these placeholders with your own queries.
# NOTE: the following cell assigns `test_queries` again, so these placeholder
# values are only used if that cell is skipped.
test_queries = [
    "What is test query 1?",
    "What is test query 2?",
    "What is test query 3?",
]

In [None]:
# Queries used for this experiment. This assignment replaces both the
# placeholder list from the previous cell and the `test_queries` imported from
# dtsc_queries.retrieval_recency at the top of the notebook.
# NOTE(review): the number in each trailing comment is presumably the ID of the
# document the query targets - confirm against the dataset.
test_queries = [
    "How can I connect to Outlook Web?", # 3985
    "How can I access my Officient Calendar?", # 3986
    "Bannière?", # 3987
    "What are the cafeteria plan benefits?", # 3989
    "What about my car configuration offer?", # 3990
    "How do I create a Canva?", # 3991
    "What about chargemap business?", # 3992
    "What is the Moodboard?", # 3993
    "What about Chargemap (domicile)?", # 3994
    "What about Connecting Expertise?", # 3995
    "What's the BeCentral address?", # 3996
    "What about a Microsoft 365 license?", # 3997
    "What about Google Calendar", # 3998
    "How to modify a page on dtsc.be?", # 3999
    "What are compensatory rest days?", # 4000
    "How do I access the shared library?", # 4001
    "What is the login for StaffIT?", # 4003
    "How can I export contacts from Odoo?", # 4005
    "How can I export leads from Odoo?", # 4006
    "What is the structure for OneDrive?", # 4007
    "Who is responsible in case of a traffic fine?", # 4008
    "What about dtsc.be performance?", # 4010
    "What about mailing lists?", # 4011
    "What about a green card?", # 4014
    "Where is the Internship Agreement?", # 4015
    "What about the company credit card?", # 4016
    "How to create a teams meeting from Google Agenda?", # 4017
    "What about Supplementary Family Allowances?", # 4018
    "On what days does the company post on LinkedIn?", # 4019
    "What activities are included in the DTeam Spirit Challenge?", # 4020
    "What are the limits for the mobility budget?", # 4021
    "What about Nexxtmove?", # 4024
    "How to use Odoo for CRM?", # 4025
    "What about Officient employee self-service?", # 4026
    "What about the Onboarding To Do List?", # 4027
    "What about birth leave?", # 4028
    "What about dtsc.odoo.com?", # 4030
    "What about ProUnity?", # 4031
    "What about a hiring bonus?", # 4032
    "What about Powerdale?", # 4034
    "What about Single Permits?", # 4035
    "What about the BNP application?", # 4037
    "What about Elia?", # 4038
    "What about Subsidies?", # 4040
    "Who are our suppliers?", # 4041
    "What is TED?", # 4042
    "How to activate Music Streaming?", # 4043
    "What is Scrum for?", # 4046
    "How to add a Shared Mailbox?", # 4047
    "What about BNP Paribas warrants?" # 4048
]

## 4. Retrieve and Rerank Documents for Each Query Across All Models

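The loop below runs every test query against each model initialized above. To change which models are tested, edit the `models` list in section 1.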

In [None]:
# Run retrieval with re-ranking enabled for every (model, query) pair.
# Layout: retrieval_results[model_name][query] -> value returned by retrieve().
retrieval_results = {}
for model_name, model in initialized_models.items():
    print(f"Running retrieval with model: {model_name}")
    # Register the per-model dict before querying so that results collected
    # so far stay reachable if a later call raises.
    per_query_results = retrieval_results.setdefault(model_name, {})

    for query in test_queries:
        # Only the first 30 characters of the query are shown in the log.
        print(f"  Processing query: {query[:30]}...")
        per_query_results[query] = retrieve(
            query,
            model,
            supabase_client,
            function_name='match_documents_recency_no_filter',
            k=4,
            re_rank=True,
        )

    print(f"✓ Completed all queries for {model_name}")

print("Retrieval complete for all models and queries.")

## 5. Process the Rankings
Extract each retrieved document's creation timestamp using a regex `pattern` that matches the `createdDateTime` field in the document content.

In [None]:
# Regex handed to extract_created_datetime. It matches the literal
# "createdDateTime", skips any following '"' / ':' characters, and captures a
# timestamp of the form YYYY-MM-DDTHH:MM:SS[.fraction]Z.
# Defined once here instead of being rebuilt as a literal inside the loop.
pattern = r'createdDateTime[":]*(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)'

# One tuple per retrieved document:
# (model name, query, rank, document id, created datetime).
data = []

# Walk retrieval_results[model_name][query] -> iterable of documents.
# The loop variable is called model_name (not model) so it does not overwrite
# the LLM object left in `model` by the retrieval loop.
for model_name, queries in retrieval_results.items():
    for query, documents in queries.items():
        for doc in documents:
            created_datetime = extract_created_datetime(doc['content'], pattern=pattern)
            data.append((model_name, query, doc['rank'], doc['id'], created_datetime))


# Build the results table.
# NOTE: this rebinds `df`, replacing the dataset DataFrame loaded in section 1.
df = pd.DataFrame(data, columns=['Model', 'Query', 'Rank', 'Document ID', 'Created DateTime'])

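Define the date categories and assign one to each retrieved document, from the most recent (`newest`) to the least recent (`oldest`) within each model/query group.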

In [None]:
# Recency labels, ordered from most recent to least recent.
date_categories = ['newest', 'newer', 'older', 'oldest']

# Order all rows by creation time, most recent first, then number the rows
# inside each (Model, Query) group: 0 for the first row of the group, 1 for
# the next, and so on.
newest_first = df.sort_values(by='Created DateTime', ascending=False)
recency_position = newest_first.groupby(['Model', 'Query']).cumcount()

# Translate positions into labels (0 -> 'newest', ..., 3 -> 'oldest').
# Positions without a label become NaN. The assignment aligns on the index,
# so each label lands on its original row despite the sort.
df['date_category'] = recency_position.map(dict(enumerate(date_categories)))

In [None]:
# Display the first rows of the results table as a quick sanity check.
df.head()