# Tutorial: Retrieval Recency Bias



#### Imports

In [1]:
import pandas as pd
import os
import re
from dotenv import load_dotenv
from unbiasai.config import DATA_DIR
from unbiasai.utils import initialize_llm, generate_embeddings, insert_documents, retrieve, get_documents_from_supabase, convert_to_doc_objects, create_reranking_prompt, perform_llm_reranking, format_results, extract_created_datetime
from supabase import create_client, Client
from unbiasai.connection import create_supabase_client
from dtsc_queries.retrieval_recency import test_queries
from langchain.schema import SystemMessage, HumanMessage
from datetime import datetime


## 1. Load data
Set the path and read your CSV data.

In [2]:
# Name of the CSV file, looked up inside the project's data directory.
DATASET_FILENAME = 'dataset_retrieval_recency.csv'

# Build the full path and load the dataset into a DataFrame.
file_path = DATA_DIR / DATASET_FILENAME
df = pd.read_csv(file_path)

Optional: create a smaller subset of the data for a quick test run.

In [3]:
# Keep only the first 15 rows for a quick trial run. The explicit copy makes the
# subset an independent DataFrame, so columns added to it later (e.g. the
# 'embedding' column) are not written to a slice of the full frame, which can
# trigger pandas' SettingWithCopyWarning.
df = df.head(15).copy()

Define and initialize the LLMs, reading your own API keys from the environment (e.g. a `.env` file).

In [4]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Model identifiers to initialize, and the environment variable that holds the
# API key for each of them.
models = ["gpt", "claude", "mistral", "cohere", "deepseek"]
api_key_env_vars = {
    "gpt": "OPENAI_API_KEY",
    "claude": "ANTHROPIC_API_KEY",
    "mistral": "MISTRAL_API_KEY",
    "cohere": "COHERE_API_KEY",
    "deepseek": "DEEPSEEK_API_KEY",
}

# Successfully initialized LLM objects, keyed by model identifier.
initialized_models = {}
for model_name in models:
    print(f"Initializing model: {model_name}")

    # Identifiers without a known key variable are reported and skipped.
    env_var = api_key_env_vars.get(model_name)
    if env_var is None:
        print(f"Skipping unknown model: {model_name}")
        continue

    # A missing or empty key means the model cannot be used; skip it with a warning.
    api_key = os.getenv(env_var)
    if not api_key:
        print(f"Warning: No API key found for {model_name}, skipping.")
        continue

    # Initialization failures are reported but do not stop the remaining models.
    try:
        initialized_models[model_name] = initialize_llm(model_name, api_key)
        print(f"✓ Successfully initialized {model_name}")
    except Exception as e:
        print(f"Error initializing {model_name}: {e}")

Initializing model: gpt
LLM initialized correctly: gpt, llm: client=<openai.resources.chat.completions.completions.Completions object at 0x1131cb620> async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x1131d7e00> root_client=<openai.OpenAI object at 0x112f7e270> root_async_client=<openai.AsyncOpenAI object at 0x1131cb770> model_name='gpt-4o-2024-11-20' model_kwargs={} openai_api_key=SecretStr('**********')
✓ Successfully initialized gpt
Initializing model: claude
Initializing model: mistral
Error initializing mistral: name 'ChatMistralAI' is not defined
Initializing model: cohere
LLM initialized correctly: cohere, llm: client=<cohere.client.Client object at 0x1131e8050> async_client=<cohere.client.AsyncClient object at 0x1131e8d70> model='command-a-03-2025' cohere_api_key=SecretStr('**********')
✓ Successfully initialized cohere
Initializing model: deepseek
LLM initialized correctly: deepseek, llm: client=<openai.resources.chat.completions.completio

Load Supabase Key

In [5]:
# Make sure the .env values are in the environment, then read the Supabase key.
# SUPABASE_KEY is None when the variable is not set.
load_dotenv()
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")

## 2. Connect to Supabase and create a Vector Store
Connect to Supabase

In [6]:
# Create the Supabase client.
# The project URL can be overridden with a SUPABASE_URL environment variable
# (e.g. in your .env file) so the tutorial can run against your own Supabase
# project; when the variable is unset or empty, the tutorial's default project
# URL is used.
SUPABASE_URL = os.getenv("SUPABASE_URL") or "https://wuxtoyrimqwohizxcmzf.supabase.co"
supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)

Apply the `generate_embeddings` function to the `content` column of your data.

In [7]:
# Call generate_embeddings once per value of the 'content' column and store the
# results in a new 'embedding' column.
# NOTE(review): presumably each call queries an embedding API, so runtime and
# cost grow with the number of rows — confirm against unbiasai.utils.
df['embedding'] = df['content'].apply(generate_embeddings)

Insert the DataFrame, including the new embeddings column, into the Supabase table to create a vector store.

In [8]:
# Pass the DataFrame (now including the 'embedding' column) and the Supabase
# client to the project helper that performs the insert.
insert_documents(df, supabase_client)
# TODO: extend insert_documents so the target Supabase table name can be chosen
# by the caller; this call has no way to specify it.

Inserting document with ID: 3984
Inserting document with ID: 3984
Inserting document with ID: 3984
Inserting document with ID: 3984
Inserting document with ID: 3985
Inserting document with ID: 3986
Inserting document with ID: 3986
Inserting document with ID: 3986
Inserting document with ID: 3986
Inserting document with ID: 3987
Inserting document with ID: 3987
Inserting document with ID: 3987
Inserting document with ID: 3987
Inserting document with ID: 3988
Inserting document with ID: 3989


## 3. Define your Test Queries

In [9]:
# Placeholder queries — replace them with the questions you want to evaluate.
# NOTE: this assignment rebinds `test_queries`, replacing the list imported from
# dtsc_queries.retrieval_recency in the imports cell; skip this cell if you want
# to run the imported queries instead.
test_queries = [
    "What is test query 1?",
    "What is test query 2?",
    "What is test query 3?",
]

## 4. Retrieve and Rerank Documents for Each Query Across All Models

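Run retrieval with re-ranking for every test query, once per model initialized above. To test fewer models, shorten the `models` list in the initialization cell.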

In [10]:
# Retrieval settings shared by every model/query combination.
retrieval_function = 'match_documents_recency_no_filter'
top_k = 4

# Nested results: retrieval_results[model_name][query] -> value returned by retrieve().
retrieval_results = {}
for model_name, model in initialized_models.items():
    print(f"Running retrieval with model: {model_name}")
    # Register this model's dict up front and fill it in place, query by query.
    per_query_results = retrieval_results[model_name] = {}

    for query in test_queries:
        # Only the first 30 characters of the query are shown in the progress log.
        print(f"  Processing query: {query[:30]}...")
        per_query_results[query] = retrieve(
            query,
            model,
            supabase_client,
            function_name=retrieval_function,
            k=top_k,
            re_rank=True,
        )

    print(f"✓ Completed all queries for {model_name}")

print("Retrieval complete for all models and queries.")

Running retrieval with model: gpt
  Processing query: What is test query 1?...
  Processing query: What is test query 2?...
  Processing query: What is test query 3?...
✓ Completed all queries for gpt
Running retrieval with model: cohere
  Processing query: What is test query 1?...
Re-ranking failed: status_code: 402, body: {'message': 'Please add or update your payment method at https://dashboard.cohere.com/billing?tab=payment to continue'}. Using initial ranking.
  Processing query: What is test query 2?...
Re-ranking failed: status_code: 402, body: {'message': 'Please add or update your payment method at https://dashboard.cohere.com/billing?tab=payment to continue'}. Using initial ranking.
  Processing query: What is test query 3?...
Re-ranking failed: status_code: 402, body: {'message': 'Please add or update your payment method at https://dashboard.cohere.com/billing?tab=payment to continue'}. Using initial ranking.
✓ Completed all queries for cohere
Running retrieval with model: d

## 5. Process the Rankings
Define the regex `pattern` used to extract the `createdDateTime` timestamp from each retrieved document's content.

In [11]:
# Regex matching the text "createdDateTime", any run of quote/colon characters,
# and then capturing a timestamp of the form YYYY-MM-DDTHH:MM:SS[.fraction]Z.
created_pattern = r'createdDateTime[":]*(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)'

# Column names of the flattened results table, in record order.
result_columns = ['Model', 'Query', 'Rank', 'Document ID', 'Created DateTime']

# Flatten the nested results into one record per (model, query, retrieved document).
data = []
for model, queries in retrieval_results.items():
    for query, documents in queries.items():
        for doc in documents:
            # The creation timestamp is pulled out of the document text itself.
            created_datetime = extract_created_datetime(doc['content'], pattern=created_pattern)
            record = (model, query, doc['rank'], doc['id'], created_datetime)
            data.append(record)

# Build the results table from the collected records.
df = pd.DataFrame(data, columns=result_columns)

Define the date categories

In [12]:
# Recency labels, ordered from the most recent document to the least recent.
date_categories = ['newest', 'newer', 'older', 'oldest']
position_to_category = dict(enumerate(date_categories))

# Number the rows of each (Model, Query) group from 0 upwards, in descending
# 'Created DateTime' order, so position 0 is the most recent document.
recency_position = (
    df.sort_values(by='Created DateTime', ascending=False)
    .groupby(['Model', 'Query'])
    .cumcount()
)

# Translate positions into labels; the assignment aligns on the row index.
# Positions with no entry in the mapping (a fifth document onwards) become NaN.
df['date_category'] = recency_position.map(position_to_category)