In [None]:
!pip install --quiet langchain langchain-anthropic langchain-community faiss-cpu pypdf python-dotenv langchain_mistralai langchain_deepseek langchain_cohere asyncio psutil GPUtil supabase langdetect

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m41.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [None]:
# Importing the packages
# --- Standard library ---
import itertools
import json
import os
import re
from collections import Counter
from datetime import datetime
from math import log

# --- Data analysis and plotting ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency

# --- Vector store and language detection ---
import supabase
from supabase import create_client, Client
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# --- LangChain ---
# The legacy `langchain.*` imports come first on purpose: the provider packages
# imported afterwards rebind `OpenAIEmbeddings` and `ChatAnthropic` to the
# maintained implementations.
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from langchain.chat_models import ChatOpenAI  # For GPT-4 and as a placeholder for Mistral
from langchain.chat_models import ChatAnthropic  # For Claude (ensure compatibility with your LangChain version)
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import OpenAIEmbeddings  # New package as per deprecation warning
from langchain_anthropic import ChatAnthropic
from langchain_mistralai import ChatMistralAI
from langchain_cohere import ChatCohere
from langchain_deepseek import ChatDeepSeek


In [None]:
# Read the API keys from the Colab secrets store (`google.colab.userdata`).
# Each `userdata.get(...)` argument is the name under which the secret is
# stored in Colab; the names on the left are the module-level constants the
# rest of the notebook reads.
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OpenAI')  # used by get_embedding() and the OpenAI chat model
supabase_key = userdata.get('Supabase_key')  # vector store; copied into SUPABASE_KEY in the connection cell
MISTRAL_API_KEY = userdata.get('Mistral')
CLAUDE_API_KEY = userdata.get('Anthropic')
COHERE_API_KEY = userdata.get('Cohere')
DEEPSEEK_API_KEY = userdata.get('Deepseek_new')
GEMINI_API_KEY = userdata.get('Gemini')  # NOTE(review): not referenced by any visible cell - confirm it is still needed

In [None]:
# Connect to Supabase (backs the vector search used by retrieve()).
# Supabase credentials
# NOTE(review): SUPABASE_URL is an empty string here (presumably redacted before
# sharing); it must be set to the project URL before this cell is run - confirm.
SUPABASE_URL = ""
SUPABASE_KEY = supabase_key


# Create the Supabase client.
# NOTE(review): this rebinds the name `supabase`, which the import cell bound to
# the `supabase` module; after this cell the name refers to the client object.
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

In [None]:
def get_embedding(text):
    """Return the embedding of ``text`` produced by OpenAI's ``text-embedding-3-large`` model.

    A new ``OpenAIEmbeddings`` client is created on every call, authenticated
    with the module-level ``OPENAI_API_KEY``; the result of ``embed_query`` is
    returned unchanged.
    """
    embedder = OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-3-large",
    )
    return embedder.embed_query(text)

In [None]:
def _make_deepseek_llm():
    """Build the DeepSeek chat model; the API key is handed over via the environment."""
    os.environ["DEEPSEEK_API_KEY"] = DEEPSEEK_API_KEY
    # NOTE(review): the model id "deepseek-v3-chat" is kept from the original
    # code - confirm it is an id the DeepSeek API accepts.
    return ChatDeepSeek(model="deepseek-v3-chat")


# Maps every accepted ``model_name`` to a zero-argument factory for its chat
# model. Factories (not instances) are stored so that nothing is constructed
# until a re-ranking call actually needs the model.
_LLM_FACTORIES = {
    "gpt-4o": lambda: ChatOpenAI(model_name="gpt-4o-2024-11-20", openai_api_key=OPENAI_API_KEY),
    # Added: the docstring and the evaluation loop both use "gpt-4o-mini",
    # which used to be rejected with ValueError.
    "gpt-4o-mini": lambda: ChatOpenAI(model_name="gpt-4o-mini", openai_api_key=OPENAI_API_KEY),
    "claude": lambda: ChatAnthropic(model="claude-3-7-sonnet-latest", anthropic_api_key=CLAUDE_API_KEY),
    "mistral": lambda: ChatMistralAI(model="mistral-large-latest", mistral_api_key=MISTRAL_API_KEY),
    "cohere": lambda: ChatCohere(model="command-a-03-2025", cohere_api_key=COHERE_API_KEY),
    "deepseek": _make_deepseek_llm,
}


def _build_rerank_prompt(query, docs):
    """Format the re-ranking prompt that lists ``docs`` numbered from 1."""
    documents_text = "\n\n".join(
        f"Document {number} (ID: {doc['id']}):\n{doc['content']}"
        for number, doc in enumerate(docs, start=1)
    )
    doc_count = len(docs)
    # The leading spaces inside the literal are part of the prompt text and are
    # kept exactly as they were, so the model sees the same prompt as before.
    return f"""
                Query: {query}

                You will be given {doc_count} documents retrieved via semantic search.
                Your task is to re-rank these documents in order of their relevance to the query.
                Please return EXACTLY {doc_count} document numbers in order, from MOST to LEAST relevant,
                separated by commas (e.g., "3,1,2").

                Documents:
                {documents_text}
                """


def _parse_ranking(ranking_text, doc_count):
    """Return the zero-based permutation found in ``ranking_text``, or None if it is not one."""
    order = [int(token) - 1 for token in re.findall(r'\d+', ranking_text)]
    # Equal to range(doc_count) once sorted <=> every index appears exactly once.
    if sorted(order) != list(range(doc_count)):
        return None
    return order


def _rerank_with_llm(llm, query, docs):
    """Ask ``llm`` to order ``docs`` by relevance; keep the given order if its answer is unusable."""
    messages = [
        SystemMessage(content="You are a helpful assistant skilled at ranking document relevance."),
        HumanMessage(content=_build_rerank_prompt(query, docs))
    ]
    ranking_text = llm.invoke(messages).content.strip()
    order = _parse_ranking(ranking_text, len(docs))
    if order is None:
        print(f"Invalid ranking received: {ranking_text}. Using default order.")
        return list(docs)
    return [docs[i] for i in order]


def retrieve(query, model_name, k=10, re_rank=False):
    """
    Retrieve top-k documents for a query using Supabase vector search with optional LLM re-ranking.

    Parameters:
      query (str): The search query.
      model_name (str): Case-insensitive; one of 'gpt-4o', 'gpt-4o-mini', 'claude',
        'mistral', 'cohere', 'deepseek'. Only used when re-ranking.
      k (int): Number of top documents to retrieve.
      re_rank (bool): Whether to re-rank the documents using the LLM.

    Returns:
      List[dict]: A list of dictionaries with document 'id', 'rank' (1-based) and
      'content'. An empty list is returned when nothing matches or when the
      retrieval step raises (the error is printed, not propagated).

    Raises:
      ValueError: If ``model_name`` is not one of the supported names.
    """
    model_name = model_name.lower()
    if model_name not in _LLM_FACTORIES:
        raise ValueError(f"Unsupported model: {model_name}")

    # Build the chat model only when it will be used; it stays outside the
    # try-block so that construction errors still propagate to the caller.
    llm = _LLM_FACTORIES[model_name]() if re_rank else None

    try:
        # Step 1: Get embedding
        query_embedding = get_embedding(query)

        # Step 2: Retrieve documents from Supabase via RPC
        response = supabase.rpc(
            'match_documents_language_no_filter',
            {
                'query_embedding': query_embedding,
                'match_count': k
            }
        ).execute()

        rows = response.data
        if not rows:
            print("No relevant documents found.")
            return []

        # Step 3: Keep only the fields that are returned to the caller
        docs = [
            {"id": row.get("id"), "content": row.get("content", "")}
            for row in rows
        ]

        actual_k = min(k, len(docs))

        # Step 4: Optional re-ranking (pointless for fewer than two documents)
        if re_rank and actual_k > 1:
            try:
                docs = _rerank_with_llm(llm, query, docs[:actual_k])
            except Exception as e:
                # Best effort: a failing LLM call must not lose the retrieval result.
                print(f"Re-ranking failed: {e}. Using initial ranking.")

        # Step 5: Return formatted result
        return [
            {"id": doc["id"], "rank": position, "content": doc["content"]}
            for position, doc in enumerate(docs, start=1)
        ]

    except Exception as e:
        print(f"Error retrieving documents: {e}")
        return []


In [None]:
# Sanity check: run one query end-to-end (vector search + DeepSeek re-ranking)
# and keep the output in `results` for inspection.
results = retrieve(
    query="What are the cafeteria plan benefits?.",
    model_name="deepseek",
    k=4,
    re_rank=True,
)

In [None]:
# Display the documents returned by the sanity-check call.
results

In [None]:
# -------------------------------
# 4. Prepare 50 Test Queries
# -------------------------------
# One query per line. The trailing number on each line is presumably the id of
# the document the query targets (TODO confirm).
# Fix: a comma was missing after the "Officient employee self-service" entry,
# so Python silently merged it with the next string and the list held 49 items.
test_queries = [
    "How can I connect to Outlook Web?", # 3985
    "How can I access my Officient Calendar?", # 3986
    "Bannière?", # 3987
    "What are the cafeteria plan benefits?", # 3989
    "What about my car configuration offer?", # 3990
    "How do I create a Canva?", # 3991
    "What about chargemap business?", # 3992
    "What is the Moodboard?", # 3993
    "What about Chargemap (domicile)?", # 3994
    "What about Connecting Expertise?", # 3995
    "What's the BeCentral address?", # 3996
    "What about a Microsoft 365 license?", # 3997
    "What about Google Calendar", # 3998
    "How to modify a page on dtsc.be?", # 3999
    "What are compensatory rest days?", # 4000
    "How do I access the shared library?", # 4001
    "What is the login for StaffIT?", # 4003
    "How can I export contacts from Odoo?", # 4005
    "How can I export leads from Odoo?", # 4006
    "What is the structure for OneDrive?", # 4007
    "Who is responsible in case of a traffic fine?", # 4008
    "What about dtsc.be performance?", # 4010
    "What about mailing lists?", # 4011
    "What about a green card?", # 4014
    "Where is the Internship Agreement?", # 4015
    "What about the company credit card?", # 4016
    "How to create a teams meeting from Google Agenda?", # 4017
    "What about Supplementary Family Allowances?", # 4018
    "On what days does the company post on LinkedIn?", # 4019
    "What activities are included in the DTeam Spirit Challenge?", # 4020
    "What are the limits for the mobility budget?", # 4021
    "What about Nexxtmove?", # 4024
    "How to use Odoo for CRM?", # 4025
    "What about Officient employee self-service?", # 4026
    "What about the Onboarding To Do List?", # 4027
    "What about birth leave?", # 4028
    "What about dtsc.odoo.com?", # 4030
    "What about ProUnity?", # 4031
    "What about a hiring bonus?", # 4032
    "What about Powerdale?", # 4034
    "What about Single Permits?", # 4035
    "What about the BNP application?", # 4037
    "What about Elia?", # 4038
    "What about Subsidies?", # 4040
    "Who are our suppliers?", # 4041
    "What is TED?", # 4042
    "How to activate Music Streaming?", # 4043
    "What is Scrum for?", # 4046
    "How to add a Shared Mailbox?", # 4047
    "What about BNP Paribas warrants?", # 4048
]

In [None]:
# Fix the number of queries that are actually sent to the models.
# The original comment promised 50 queries while the code hard-coded 2; the
# value is kept at 2 (it matches the recorded output and keeps API cost low)
# but is now named and documented. Set it to 50 for the full evaluation.
NUM_TEST_QUERIES = 2

if not test_queries:
    # Previously an empty list failed with an unhelpful ZeroDivisionError.
    raise ValueError("test_queries is empty; nothing to evaluate.")

# Take the first NUM_TEST_QUERIES entries, repeating the list from the start if
# it is shorter than that.
test_queries = list(itertools.islice(itertools.cycle(test_queries), NUM_TEST_QUERIES))
print("Prepared {} test queries.".format(len(test_queries)))

Prepared 2 test queries.


In [None]:
# -------------------------------
# 5. Retrieve Documents for Each Query Across All Models
# -------------------------------
# Every model label is run against every test query (k=4, with LLM re-ranking).

models = ["gpt-4o-mini",  "claude", "mistral", "cohere", "deepseek"]

# Layout of retrieval_results: { model_name: { query: [document results] } }
retrieval_results = dict((name, {}) for name in models)

for model in models:
    results_for_model = retrieval_results[model]
    for query in test_queries:
        # re_rank=True asks the model to re-order the retrieved documents.
        results_for_model[query] = retrieve(query, model, k=4, re_rank=True)

print("Retrieval complete for all models and queries.")


In [None]:
def detect_language(content):
    """
    Detects the language of a given text content.

    Args:
        content (str): The text to analyze.

    Returns:
        str: Detected language code (e.g., 'en', 'fr', 'de', 'nl'), or None if
        ``content`` is not a non-blank string or langdetect cannot classify it.
    """
    # Nothing to detect: answer directly instead of relying on an exception.
    if not isinstance(content, str) or not content.strip():
        return None

    # langdetect is randomised by default, so the same text can get different
    # labels on different runs; a fixed seed makes the analysis reproducible.
    DetectorFactory.seed = 0

    try:
        return detect(content)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. digits only).
        return None

In [None]:
def _language_of(doc):
    """Return the detected language of a result's 'content', or None if it is not a non-blank string."""
    content = doc.get('content', '')
    if isinstance(content, str) and content.strip():
        return detect_language(content)
    return None


# One row per (model, query, retrieved document).
data = [
    (model, query, doc['rank'], doc['id'], _language_of(doc))
    for model, queries in retrieval_results.items()
    for query, documents in queries.items()
    for doc in documents
]

# Tabular view of the collected rows.
df = pd.DataFrame(data, columns=['Model', 'Query', 'Rank', 'Document ID', 'Language'])

In [None]:
# Language distribution of the retrieved documents: one panel per rank (1-4),
# bars grouped by model.

# `model_palette` was used below without ever being defined in the notebook.
# Assign one fixed colour per model so a model looks the same in every panel.
model_palette = dict(zip(models, sns.color_palette("colorblind", n_colors=len(models))))

# 2x2 grid: one subplot for each of the ranks 1 to 4.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Flatten the axes array so the subplots can be walked in rank order.
axes = axes.flatten()

for rank, ax in enumerate(axes, start=1):
    subset = df[df["Rank"] == rank]
    sns.countplot(
        data=subset,
        x="Language",
        hue="Model",
        order=["en", "fr", "nl", "de"],
        hue_order=models,  # same bar order in every panel, even if a model has no rows at this rank
        ax=ax,
        palette=model_palette,
    )
    ax.set_title(f"Rank {rank}", fontsize=12)
    ax.set_xlabel("Language", fontsize=15)
    ax.set_ylabel("Count", fontsize=15)

# Improve layout
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# Save before showing: some backends release the figure once it has been shown.
fig.savefig("retrieval_language_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# NOTE(review): this cell repeats the preceding plotting cell and overwrites the
# same PNG file; it is kept in place but is a candidate for removal.
# Language distribution of the retrieved documents: one panel per rank (1-4),
# bars grouped by model.

# `model_palette` was used below without ever being defined in the notebook.
# Assign one fixed colour per model so a model looks the same in every panel.
model_palette = dict(zip(models, sns.color_palette("colorblind", n_colors=len(models))))

# 2x2 grid: one subplot for each of the ranks 1 to 4.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Flatten the axes array so the subplots can be walked in rank order.
axes = axes.flatten()

for rank, ax in enumerate(axes, start=1):
    subset = df[df["Rank"] == rank]
    sns.countplot(
        data=subset,
        x="Language",
        hue="Model",
        order=["en", "fr", "nl", "de"],
        hue_order=models,  # same bar order in every panel, even if a model has no rows at this rank
        ax=ax,
        palette=model_palette,
    )
    ax.set_title(f"Rank {rank}", fontsize=12)
    ax.set_xlabel("Language", fontsize=15)
    ax.set_ylabel("Count", fontsize=15)

# Improve layout
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# Save before showing: some backends release the figure once it has been shown.
fig.savefig("retrieval_language_distribution.png", dpi=300, bbox_inches='tight')
plt.show()