In [1]:
!pip install -q openai anthropic cohere scikit-learn scipy matplotlib pandas supabase requests sentence-transformers



In [2]:
import openai
import anthropic
import cohere
import requests
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from scipy.stats import f_oneway
from supabase import create_client
import os
import time
from datetime import datetime
from google.colab import files
from IPython.display import display, HTML, clear_output

In [3]:
#REading the API key
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OpenAI')
supabase_key = userdata.get('Supabase_key') #vector store
MISTRAL_API_KEY = userdata.get('Mistral')
CLAUDE_API_KEY = userdata.get('Anthropic')
COHERE_API_KEY = userdata.get('Cohere')
DEEPSEEK_API_KEY = userdata.get('Deepseek_new')
GEMINI_API_KEY = userdata.get('Gemini')

In [4]:
 ### Supabase Credentials
SUPABASE_URL = "https://rrjbrtbsvdoxndchvchq.supabase.co"
SUPABASE_KEY = supabase_key

# Initialize API clients
openai.api_key = OPENAI_API_KEY
anthropic_client = anthropic.Anthropic(api_key=CLAUDE_API_KEY)
cohere_client = cohere.Client(COHERE_API_KEY)

# Create Supabase client
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

In [6]:

# @title ⚙️ Step 4: Configure Experiment
# @markdown Select which models to include in your experiment:

use_claude = True # @param {type:"boolean"}
use_openai = True # @param {type:"boolean"}
use_mistral = True # @param {type:"boolean"}
use_cohere = True # @param {type:"boolean"}
use_deepseek = True # @param {type:"boolean"}

# @markdown Define the test questions:
question_text = "What is the Cafeteria plan?" # @param {type:"string"}
num_iterations = 5 # @param {type:"slider", min:1, max:10, step:1}

# @markdown Advanced settings:
top_k_docs = 1 # @param {type:"slider", min:1, max:10, step:1}
temperature = 0.7 # @param {type:"slider", min:0, max:1, step:0.1}
match_threshold = 0.5 # @param {type:"slider", min:0, max:1, step:0.1}

# Define LLM models
llm_models = {}

if use_claude:
    llm_models["Claude-3-Sonnet"] = {"provider": "anthropic", "model": "claude-3-7-sonnet-latest"}
if use_openai:
    llm_models["GPT-4"] = {"provider": "openai", "model": "gpt-4o-mini"}
if use_mistral:
    llm_models["Mistral-Large"] = {"provider": "mistral", "model": "mistral-large-latest"}
if use_cohere:
    llm_models["Cohere-Command"] = {"provider": "cohere", "model": "command-r-plus"}
if use_deepseek:
    llm_models["DeepSeek-Chat"] = {"provider": "deepseek", "model": "deepseek-chat"}

In [7]:
# Generate the list of test questions
questions = [question_text] * num_iterations

# Initialize sentence transformer model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# @title 🔍 Step 5: Vector Search Function
def retrieve_context(query, top_k=top_k_docs):
    """
    Retrieve relevant documents from Supabase vector store
    """
    # Generate embedding for the query
    query_embedding = embedding_model.encode(query).tolist()

    try:
        # Perform vector similarity search in Supabase
        # Note: You may need to adjust the function name and parameters based on your Supabase setup
        response = supabase.rpc(
            'match_documents',  # Your stored procedure for similarity search
            {
                'query_embedding': query_embedding,
                'match_threshold': match_threshold,
                'match_count': top_k
            }
        ).execute()

        if not response.data or len(response.data) == 0:
            return "No relevant context found in the knowledge base."

        # Format retrieved documents as context
        context = "Context from knowledge base:\n\n"
        for i, doc in enumerate(response.data):
            # Adjust field names to match your Supabase table schema
            content = doc.get('content', '')
            source = doc.get('source', 'Unknown source')
            context += f"Document {i+1}: {content}\nSource: {source}\n\n"

        return context

    except Exception as e:
        print(f"Error retrieving from vector store: {e}")
        return "Error retrieving context from knowledge base."

In [9]:
# @title 🧠 Step 6: Query LLM with RAG for Different Providers
def query_llm_with_rag(llm_config, query):
    provider = llm_config["provider"]
    model = llm_config["model"]

    # Retrieve relevant context from Supabase
    context = retrieve_context(query)

    # Construct RAG prompt with retrieved context
    rag_prompt = f"""
{context}

Based on the above context from our knowledge base, please answer the following question:
{query}

If the context doesn't contain relevant information to answer the question,
please say so and answer based on your general knowledge.
"""

    try:
        # Call the appropriate API based on provider
        if provider == "openai":
            response = openai.ChatCompletion.create(
                model=model,
                messages=[{"role": "user", "content": rag_prompt}],
                temperature=temperature
            )
            return response['choices'][0]['message']['content'].strip()

        elif provider == "anthropic":
            response = anthropic_client.messages.create(
                model=model,
                messages=[{"role": "user", "content": rag_prompt}],
                max_tokens=1000,
                temperature=temperature
            )
            return response.content[0].text

        elif provider == "cohere":
            response = cohere_client.chat(
                message=rag_prompt,
                model=model,
                temperature=temperature
            )
            return response.text

        elif provider == "mistral":
            headers = {
                "Content-Type": "application/json",
                "Accept": "application/json",
                "Authorization": f"Bearer {MISTRAL_API_KEY}"
            }
            payload = {
                "model": model,
                "messages": [{"role": "user", "content": rag_prompt}],
                "temperature": temperature
            }
            response = requests.post(
                "https://api.mistral.ai/v1/chat/completions",
                headers=headers,
                json=payload
            )
            return response.json()["choices"][0]["message"]["content"].strip()

        elif provider == "deepseek":
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
            }
            payload = {
                "model": model,
                "messages": [{"role": "user", "content": rag_prompt}],
                "temperature": temperature
            }
            response = requests.post(
                "https://api.deepseek.com/v1/chat/completions",
                headers=headers,
                json=payload
            )
            return response.json()["choices"][0]["message"]["content"].strip()

        else:
            return f"Error: Unsupported provider {provider}"

    except Exception as e:
        return f"Error with {provider} ({model}): {str(e)}"

In [10]:

# @title 📈 Step 7: Drift Calculation
def compute_response_drift_score(responses):
    if len(responses) < 2:
        return 0, []

    base_embedding = embedding_model.encode(responses[0])
    scores = []
    for r in responses[1:]:
        emb = embedding_model.encode(r)
        sim = cosine_similarity([base_embedding], [emb])[0][0]
        drift_score = 1 - sim
        scores.append(drift_score)
    return np.mean(scores), scores

In [11]:


# @title 🧪 Step 8: Run Experiment with User Feedback (Colab-friendly)
def run_experiment():
    response_logs = {llm: [] for llm in llm_models}
    feedback_logs = {llm: [] for llm in llm_models}
    log_df = []

    for i, question in enumerate(questions):
        print(f"\n📌 Iteration {i+1}/{len(questions)} — Question: {question}")

        for llm, config in llm_models.items():
            print(f"\n🤖 {llm} is generating a response...")
            try:
                response = query_llm_with_rag(config, question)
                print(f"🗨️ {llm} says:\n{response}\n")

                # Create HTML buttons for feedback
                display(HTML(f"""
                <div style="padding: 10px; border-radius: 5px;">
                <p>What's your feedback on {llm}'s response?</p>
                <button id="pos_btn_{i}_{llm}" style="background-color: #4CAF50; color: white; border: none; padding: 10px 20px; margin: 5px; cursor: pointer;"
                onclick="
                    document.getElementById('feedback_value_{i}_{llm}').value = 'positive';
                    document.getElementById('feedback_form_{i}_{llm}').submit();
                ">Positive</button>

                <button id="neu_btn_{i}_{llm}" style="background-color: #FFC107; color: white; border: none; padding: 10px 20px; margin: 5px; cursor: pointer;"
                onclick="
                    document.getElementById('feedback_value_{i}_{llm}').value = 'neutral';
                    document.getElementById('feedback_form_{i}_{llm}').submit();
                ">Neutral</button>

                <button id="neg_btn_{i}_{llm}" style="background-color: #F44336; color: white; border: none; padding: 10px 20px; margin: 5px; cursor: pointer;"
                onclick="
                    document.getElementById('feedback_value_{i}_{llm}').value = 'negative';
                    document.getElementById('feedback_form_{i}_{llm}').submit();
                ">Negative</button>

                <form id="feedback_form_{i}_{llm}" style="display:none">
                    <input id="feedback_value_{i}_{llm}" name="feedback">
                </form>
                </div>
                """))

                # Get feedback using standard input as fallback
                feedback = input(f"👍👎 Type feedback for {llm} (positive/neutral/negative): ").strip().lower()
                while feedback not in ["positive", "neutral", "negative"]:
                    feedback = input("Please enter 'positive', 'neutral', or 'negative': ").strip().lower()

                response_logs[llm].append(response)
                feedback_logs[llm].append(feedback)

                log_df.append({
                    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "iteration": i+1,
                    "llm": llm,
                    "provider": config["provider"],
                    "model": config["model"],
                    "question": question,
                    "response": response,
                    "feedback": feedback
                })

                # Add a small delay to avoid rate limits
                time.sleep(1)

            except Exception as e:
                print(f"❌ Error with {llm}: {e}")
                log_df.append({
                    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "iteration": i+1,
                    "llm": llm,
                    "provider": config["provider"],
                    "model": config["model"],
                    "question": question,
                    "response": f"ERROR: {str(e)}",
                    "feedback": "error"
                })

    return response_logs, feedback_logs, log_df

In [12]:


# @title 🧮 Step 9: Drift Analysis
def analyze_drift(response_logs):
    print("\n📊 Calculating Response Drift Scores...")
    drift_logs = {}

    for llm in llm_models:
        # Skip if we don't have enough responses
        if len(response_logs[llm]) < 2:
            print(f"{llm}: Not enough responses to calculate drift")
            drift_logs[llm] = []
            continue

        avg_drift, all_drifts = compute_response_drift_score(response_logs[llm])
        drift_logs[llm] = all_drifts
        print(f"{llm} Average Drift: {avg_drift:.4f}")

    # Perform ANOVA only on models with sufficient data
    valid_models = [llm for llm in llm_models if len(drift_logs[llm]) > 1]
    if len(valid_models) > 1:
        anova_data = [drift_logs[llm] for llm in valid_models if len(drift_logs[llm]) > 0]
        if len(anova_data) > 1 and all(len(d) > 0 for d in anova_data):
            anova_result = f_oneway(*anova_data)
            print(f"\n📈 ANOVA F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")
            if anova_result.pvalue < 0.05:
                print("🎯 Statistically significant difference in drift detected!")
            else:
                print("✅ No significant difference in drift across LLMs.")

    # Plot drift over time
    plt.figure(figsize=(12, 6))
    for llm in drift_logs:
        if len(drift_logs[llm]) > 0:  # Only plot if we have data
            plt.plot(range(1, len(drift_logs[llm])+1), drift_logs[llm], marker='o', label=llm)

    plt.xlabel("Iteration")
    plt.ylabel("Drift Score (1 - cosine similarity)")
    plt.title("Response Drift Over Time Across Different LLMs")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("drift_analysis.png")  # Save figure
    plt.show()

    return drift_logs

In [13]:


# @title 📊 Step 10: Feedback Visualization
def visualize_feedback(feedback_logs):
    # Convert feedback to numeric values
    feedback_values = {"positive": 1, "neutral": 0, "negative": -1, "error": None}

    feedback_data = {}
    for llm, feedback in feedback_logs.items():
        feedback_data[llm] = [feedback_values.get(f) for f in feedback if feedback_values.get(f) is not None]

    # Plot average feedback scores
    plt.figure(figsize=(10, 6))
    avg_scores = {llm: np.mean(scores) if scores else 0 for llm, scores in feedback_data.items()}

    models = list(avg_scores.keys())
    scores = list(avg_scores.values())

    # Sort by score
    sorted_indices = np.argsort(scores)
    sorted_models = [models[i] for i in sorted_indices]
    sorted_scores = [scores[i] for i in sorted_indices]

    bars = plt.barh(sorted_models, sorted_scores, color=['red' if s < 0 else 'green' if s > 0 else 'gray' for s in sorted_scores])
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.xlabel("Average Feedback Score (-1: Negative, 0: Neutral, 1: Positive)")
    plt.title("Average User Feedback by LLM")
    plt.xlim(-1.1, 1.1)
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig("feedback_analysis.png")
    plt.show()

In [14]:


# @title 💾 Step 11: Export Logs
def export_logs(log_df):
    df = pd.DataFrame(log_df)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"llm_rag_drift_log_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\n✅ Log saved to '{filename}'")

    # For Google Colab: Allow downloading the file
    try:
        files.download(filename)
    except:
        print(f"To download the log file, use the Files panel on the left sidebar.")

    return df


In [15]:

# @title 📝 Step 12: Generate Summary Report
def generate_report(df, drift_logs):
    print("\n📋 Generating Summary Report...")

    # Calculate statistics per model
    model_stats = {}
    for llm in llm_models:
        model_data = df[df['llm'] == llm]
        if len(model_data) == 0:
            continue

        # Calculate feedback counts
        feedback_counts = model_data['feedback'].value_counts().to_dict()
        if 'positive' not in feedback_counts: feedback_counts['positive'] = 0
        if 'neutral' not in feedback_counts: feedback_counts['neutral'] = 0
        if 'negative' not in feedback_counts: feedback_counts['negative'] = 0
        if 'error' not in feedback_counts: feedback_counts['error'] = 0

        # Calculate feedback score (-1 to 1)
        total_ratings = feedback_counts['positive'] + feedback_counts['neutral'] + feedback_counts['negative']
        if total_ratings > 0:
            feedback_score = (feedback_counts['positive'] - feedback_counts['negative']) / total_ratings
        else:
            feedback_score = 0

        # Calculate average drift
        avg_drift = np.mean(drift_logs[llm]) if len(drift_logs[llm]) > 0 else 0

        model_stats[llm] = {
            'provider': llm_models[llm]['provider'],
            'model': llm_models[llm]['model'],
            'positive_feedback': feedback_counts['positive'],
            'neutral_feedback': feedback_counts['neutral'],
            'negative_feedback': feedback_counts['negative'],
            'errors': feedback_counts['error'],
            'feedback_score': feedback_score,
            'avg_drift': avg_drift
        }

    # Create summary dataframe
    summary_df = pd.DataFrame.from_dict(model_stats, orient='index').reset_index()
    summary_df = summary_df.rename(columns={'index': 'llm'})

    # Sort by feedback score (descending)
    summary_df = summary_df.sort_values('feedback_score', ascending=False)

    # Save summary
    summary_filename = "llm_performance_summary.csv"
    summary_df.to_csv(summary_filename, index=False)
    print(f"✅ Summary report saved to '{summary_filename}'")

    # For Google Colab: Allow downloading the file
    try:
        files.download(summary_filename)
    except:
        print(f"To download the summary file, use the Files panel on the left sidebar.")

    # Print summary
    print("\n📊 LLM Performance Summary:")
    display(summary_df[['llm', 'positive_feedback', 'neutral_feedback', 'negative_feedback', 'feedback_score', 'avg_drift']])

    return summary_df

In [18]:
def main():
    print("🔄 Starting Multi-LLM RAG Drift Analysis Experiment")

    # Check if API keys are provided
    missing_keys = []
    if use_claude and not CLAUDE_API_KEY: missing_keys.append("Anthropic (Claude)")
    if use_openai and not OPENAI_API_KEY: missing_keys.append("OpenAI")
    if use_mistral and not MISTRAL_API_KEY: missing_keys.append("Mistral")
    if use_cohere and not COHERE_API_KEY: missing_keys.append("Cohere")
    if use_deepseek and not DEEPSEEK_API_KEY: missing_keys.append("DeepSeek")

    if missing_keys:
        print(f"❌ Missing API keys for: {', '.join(missing_keys)}")
        print("Please provide the required API keys and run again.")
        return

    # Check if any models are selected
    if not llm_models:
        print("❌ No LLM models selected. Please select at least one model.")
        return

    # Check if Supabase connection works
    if not SUPABASE_URL or not SUPABASE_KEY:
        print("❌ Missing Supabase credentials. Please provide URL and key.")
        return

    try:
        test_query = "test connection"
        test_context = retrieve_context(test_query, top_k=1)
        print("✅ Supabase connection successful")
    except Exception as e:
        print(f"❌ Supabase connection failed: {e}")
        print("Please check your Supabase credentials and try again.")
        return

    # Print active models
    print("\n🤖 Active models for this experiment:")
    for llm, config in llm_models.items():
        print(f"  - {llm} ({config['provider']}: {config['model']})")

    # Run the experiment
    start_time = time.time()
    response_logs, feedback_logs, log_df = run_experiment()

    # Export raw logs
    df = export_logs(log_df)

    # Calculate and display execution time
    execution_time = time.time() - start_time
    print(f"\n✅ Experiment completed in {execution_time:.2f} seconds")

    return df



🔄 Starting Multi-LLM RAG Drift Analysis Experiment


SyntaxError: 'return' outside function (<ipython-input-18-58ce9e2fbac2>, line 15)

In [19]:
# Execute the main function if this script is run directly
if __name__ == "__main__":
    main()

🔄 Starting Multi-LLM RAG Drift Analysis Experiment
Error retrieving from vector store: {'code': 'PGRST202', 'details': 'Searched for the function public.match_documents with parameters match_count, match_threshold, query_embedding or with a single unnamed json/jsonb parameter, but no matches were found in the schema cache.', 'hint': 'Perhaps you meant to call the function public.match_documents(filter, match_count, query_embedding)', 'message': 'Could not find the function public.match_documents(match_count, match_threshold, query_embedding) in the schema cache'}
✅ Supabase connection successful

🤖 Active models for this experiment:
  - Claude-3-Sonnet (anthropic: claude-3-7-sonnet-latest)
  - GPT-4 (openai: gpt-4o-mini)
  - Mistral-Large (mistral: mistral-large-latest)
  - Cohere-Command (cohere: command-r-plus)
  - DeepSeek-Chat (deepseek: deepseek-chat)

📌 Iteration 1/5 — Question: What is the Cafeteria plan?

🤖 Claude-3-Sonnet is generating a response...
Error retrieving from vecto

KeyboardInterrupt: Interrupted by user