# Tutorial Retrieval Recency Bias



#### Imports

In [None]:
import pandas as pd
import os
import re
from dotenv import load_dotenv
from unbiasai.config import DATA_DIR
from unbiasai.utils import initialize_llm, generate_embeddings, insert_documents, retrieve, get_documents_from_supabase, convert_to_doc_objects, create_reranking_prompt, perform_llm_reranking, format_results, extract_created_datetime
from supabase import create_client, Client
from unbiasai.connection import create_supabase_client
from dtsc_queries.retrieval_recency import test_queries
from langchain.schema import SystemMessage, HumanMessage
from datetime import datetime

from unbiasai.config import ENVFILE
load_dotenv(ENVFILE)

## 1. Load data
Set the path and read your CSV data.

In [2]:
# Build the path to the recency test set inside the project's data directory.
file_path = DATA_DIR / 'recency_test.csv'
# Load the CSV into the working DataFrame used by the following cells.
df = pd.read_csv(file_path)

Optional: create a small subset of the data for a quick test run.

In [3]:
# df = df.head(15)

Define the LLMs to evaluate and initialize them. This requires your own API keys.

In [None]:
# Models to evaluate. "claude" is currently left out of the run; add it back
# to this list to include it. (A previous version assigned the full list and
# then immediately overwrote it, which made the intended selection unclear.)
models = ["gpt", "mistral", "cohere", "deepseek"]

# Maps each model name to whatever initialize_llm returns for it.
initialized_models = {}

for model_name in models:
    initialized_models[model_name] = initialize_llm(model_name)

Initializing model: gpt
Error initializing gpt: initialize_llm() takes 1 positional argument but 2 were given
Initializing model: claude
Error initializing claude: initialize_llm() takes 1 positional argument but 2 were given
Initializing model: mistral
Error initializing mistral: initialize_llm() takes 1 positional argument but 2 were given
Initializing model: cohere
Error initializing cohere: initialize_llm() takes 1 positional argument but 2 were given
Initializing model: deepseek
Error initializing deepseek: initialize_llm() takes 1 positional argument but 2 were given


## 2. Connect to Supabase and create a Vector Store
Connect to Supabase

In [None]:
# Read the Supabase credentials from the environment (e.g. from the .env file
# loaded at the top of the notebook). os.environ.get returns None when a
# variable is unset, so validate before handing the values to the client.
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

_missing_supabase_vars = [
    name
    for name, value in (("SUPABASE_URL", url), ("SUPABASE_KEY", key))
    if not value
]
if _missing_supabase_vars:
    # Fail fast with an actionable message instead of an opaque client error.
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_supabase_vars)
    )

# Client used by the insert and retrieval cells below.
supabase_client: Client = create_client(url, key)

Apply the `generate_embeddings` function to the `content` column of your data.

In [None]:
# Run generate_embeddings on each row's 'content' value and keep the results
# in a new 'embedding' column.
document_texts = df['content']
df['embedding'] = document_texts.apply(generate_embeddings)

OpenAI API Key: [REDACTED]
OpenAI API Key: [REDACTED]
OpenAI API Key: [REDACTED]
OpenAI API Key: [REDACTED]
OpenAI API Key: [REDACTED]
OpenAI API Key: [REDACTED]

Insert the DataFrame, including the new embeddings, into the Supabase table to create the vector store.

In [8]:
# Pass df (which now carries the 'embedding' column) and the Supabase client
# to insert_documents to populate the vector store.
insert_documents(df, supabase_client)
# TODO: change insert_documents so the Supabase table name can be configured.

Inserting document with ID: 0
Inserting document with ID: 1
Inserting document with ID: 2
Inserting document with ID: 3
Inserting document with ID: 10
Inserting document with ID: 11
Inserting document with ID: 12
Inserting document with ID: 13
Inserting document with ID: 20
Inserting document with ID: 21
Inserting document with ID: 22
Inserting document with ID: 23
Inserting document with ID: 30
Inserting document with ID: 31
Inserting document with ID: 32
Inserting document with ID: 33
Inserting document with ID: 40
Inserting document with ID: 41
Inserting document with ID: 42
Inserting document with ID: 43


## 3. Define your Test Queries

In [9]:
# Placeholder queries for the tutorial; replace them with your own questions.
# NOTE: this rebinds `test_queries`, which is also imported at the top of the
# notebook, so the imported list is not used from here on.
test_queries = [f"What is test query {number}?" for number in (1, 2, 3)]

## 4. Retrieve and Rerank Documents for Each Query Across All Models

Run retrieval with re-ranking for every test query on each of the models initialized earlier.

In [10]:
# Nested mapping: model name -> query text -> value returned by retrieve().
retrieval_results = {}

for model_name, model in initialized_models.items():
    print(f"Running retrieval with model: {model_name}")

    # Register the per-model dict up front so partial results stay visible
    # if a later query raises.
    results_for_model = {}
    retrieval_results[model_name] = results_for_model

    for query in test_queries:
        # Only the first 30 characters of the query are shown in the log line.
        print(f"  Processing query: {query[:30]}...")
        results_for_model[query] = retrieve(
            query,
            model,
            supabase_client,
            function_name='match_documents_recency_no_filter',
            k=4,  # presumably the number of documents to return — confirm in retrieve()
            re_rank=True,
        )

    print(f"✓ Completed all queries for {model_name}")

print("Retrieval complete for all models and queries.")

Retrieval complete for all models and queries.


## 5. Process the Rankings
Set `pattern` to a regular expression that captures the `createdDateTime` timestamp in each document's content.

In [11]:
# Initialize a list to collect data
# Regex passed to extract_created_datetime: its single capture group is a
# timestamp of the form YYYY-MM-DDTHH:MM:SS[.fraction]Z following the
# literal text "createdDateTime".
created_pattern = r'createdDateTime[":]*(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)'

# One tuple per retrieved document: (model, query, rank, id, created datetime).
data = [
    (
        model_name,
        query_text,
        hit['rank'],
        hit['id'],
        extract_created_datetime(hit['content'], pattern=created_pattern),
    )
    for model_name, per_query in retrieval_results.items()
    for query_text, hits in per_query.items()
    for hit in hits
]

# Tabulate the collected rows.
# NOTE: this rebinds `df`, replacing the document DataFrame loaded earlier.
df = pd.DataFrame(data, columns=['Model', 'Query', 'Rank', 'Document ID', 'Created DateTime'])

Define the date categories and label each retrieved document, from newest to oldest, within every model/query group.

In [12]:
# Labels ordered from most recent to least recent.
date_categories = ['newest', 'newer', 'older', 'oldest']
category_by_position = dict(enumerate(date_categories))

# Order rows newest-first, then number them 0, 1, 2, ... inside each
# (Model, Query) group.
newest_first = df.sort_values(by='Created DateTime', ascending=False)
position_in_group = newest_first.groupby(['Model', 'Query']).cumcount()

# Translate each position into its label; the result is aligned back to df by
# index. Positions without a label (a group with more rows than labels) map
# to NaN.
df['date_category'] = position_in_group.map(category_by_position)

In [13]:
# Preview the first rows of the ranking table, including the date_category column.
df.head()

Unnamed: 0,Model,Query,Rank,Document ID,Created DateTime,date_category
