In [None]:
!pip install -q transformers torch faiss-cpu langchain sentence-transformers huggingface_hub openai google-generativeai langchain-huggingface

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## 1. SETUP AND CONFIGURATION

In [None]:
import json
import os
import random
import re
from typing import List, Dict, Any,Tuple, Optional, Set
from pathlib import Path
import logging
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import requests
from google.colab import userdata
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
import faiss
import openai
import google.generativeai as genai
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

In [None]:



Config = {
    "hf_token_name": "HF_TOKEN",
    "openai_api_key_name": "OPENAI_TOKEN",
    "google_api_key_name": "GOOGLE_TOKEN",

    # Change this value to 'huggingface', 'openai', or 'google'
    "active_llm_provider": "openai",

    "huggingface_model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "openai_model": "gpt-5-nano",
    "gemini_model": "gemini-1.5-flash-latest",

    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    "personachat_url": "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json",
    "processed_personachat_file": "personachat_processed.jsonl",
    "eval_results_file": "evaluation_results_with_scores.jsonl",
    "api_max_tokens": 250,
    "api_temperature": 0.7,
}

print(f"✅ Setup complete. Active LLM Provider: {Config['active_llm_provider'].upper()}")


🚀 [1/7] Installing libraries and setting up configuration...
✅ Setup complete. Active LLM Provider: OPENAI


### 2. UNIFIED LLM CLIENT (PROVIDER SWITCHER LOGIC)

In [None]:
print("\n🚀 [2/7] Defining the unified LLMClient wrapper...")

class LLMClient:
    """A unified wrapper to handle different LLM provider APIs."""
    def __init__(self, provider: str):
        self.provider = provider
        if self.provider not in ['huggingface', 'openai', 'google']:
            raise ValueError("Provider must be 'huggingface', 'openai', or 'google'")

        if self.provider == 'huggingface':
            hf_token = userdata.get(Config["hf_token_name"])
            if not hf_token: raise ValueError("Hugging Face token not found in secrets.")
            self.client = InferenceClient(token=hf_token)
            self.model_name = Config["huggingface_model"]
        elif self.provider == 'openai':
            openai.api_key = Config["openai_api_key_name"]
            if not openai.api_key: raise ValueError("OpenAI API key not found in secrets.")
            self.client = openai.OpenAI(api_key=openai.api_key) # Use the retrieved API key
            self.model_name = Config["openai_model"]
        elif self.provider == 'google':
            google_api_key = Config["google_api_key_name"]
            if not google_api_key: raise ValueError("Google API key not found in secrets.")
            genai.configure(api_key=google_api_key)
            self.client = genai.GenerativeModel(Config["gemini_model"])

    def chat_completion(self, messages: List[Dict[str, str]], **kwargs) -> str:
        """Calls the appropriate API and returns a normalized string response."""
        try:
            if self.provider == 'huggingface':
                response = self.client.chat_completion(messages=messages, model=self.model_name, **kwargs)
                return response.choices[0].message.content
            elif self.provider == 'openai':
                response = self.client.chat.completions.create(model=self.model_name, messages=messages, **kwargs)
                return response.choices[0].message.content
            elif self.provider == 'google':
                if messages[0]['role'] == 'system':
                    system_prompt = messages.pop(0)['content']
                    messages[0]['content'] = f"{system_prompt}\n\nUser Question: {messages[0]['content']}"
                gemini_messages = [msg['content'] for msg in messages]
                response = self.client.generate_content(gemini_messages)
                return response.text
        except Exception as e:
            return f"[API Error ({self.provider})]: {str(e)[:100]}..."

print("✅ LLMClient defined.")


🚀 [2/7] Defining the unified LLMClient wrapper...
✅ LLMClient defined.


### 3. DATA ACQUISITION AND PREPROCESSING

#### Persona chat

In [None]:
print("\n🚀 [3/7] Acquiring and preprocessing PersonaChat data...")

def clean_text(text: Any) -> Any:
    if not isinstance(text, str): return text
    return re.sub(r'\s+', ' ', text.strip())

def process_personachat_data(url: str, output_file: str):
    if os.path.exists(output_file):
        print(f"✅ Data file '{output_file}' already exists.")
        return
    try:
        data = requests.get(url).json()
        processed_examples = []
        for split in ['train', 'valid']:
            for dialog in data.get(split, []):
                utterances = dialog.get('utterances', [])
                if utterances:
                    history = utterances[-1].get('history', [])
                    if history:
                        processed_examples.append({
                            'persona': [clean_text(p) for p in dialog.get('personality', [])],
                            'full_history': [clean_text(h) for h in history]
                        })
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in processed_examples: f.write(json.dumps(item) + '\n')
        print(f"✅ Saved {len(processed_examples)} examples to {output_file}")
    except Exception as e: print(f"Error processing data: {e}")

process_personachat_data(Config["personachat_url"], Config["processed_personachat_file"])


🚀 [3/7] Acquiring and preprocessing PersonaChat data...
✅ Saved 18878 examples to personachat_processed.jsonl


#### Our longterm dataset

In [None]:
json_data = {
  "persona": {
    "name": "Alex",
    "persona_facts": [
        "name is Alex",
      "I am a 32-year-old freelance graphic designer.",
      "I am deeply curious about the capabilities and limitations of AI.",
      "I am tech-savvy, but not a programmer.",
      "My interest is in the practical application and conversational fluency of AI models.",
      "I am patient and methodical.",
      "I enjoy testing the boundaries of technology with creative and follow-up questions.",
      "I am evaluating a new AI assistant for a potential long-term subscription.",
      "My goal is to test an AI's ability to retain information in the short-term.",
      "My goal is to test an AI's ability to recall information from previous sessions (long-term memory).",
      "My goal is to test an AI's ability to explain complex topics simply."
    ]
  },
  "dialogue": [
    {
      "turn": 1,
      "speaker": "user",
      "text": "Hi there! I'm thinking of planning a trip. My favorite color is sunset orange. Can you suggest three travel destinations known for their beautiful sunsets?"
    },
    {
      "turn": 2,
      "speaker": "user",
      "text": "Thanks. Of those three, which one is most known for its historical sites? My partner, Jamie, is a huge history buff."
    },
    {
      "turn": 3,
      "speaker": "user",
      "text": "Interesting. What's the best time of year to visit that specific location? We're looking to travel in the spring."
    },
    {
      "turn": 4,
      "speaker": "user",
      "text": "Okay, good to know. Now, can you find me a highly-rated seafood restaurant there that would be a good fit for a special occasion? Jamie loves seafood."
    },
    {
      "turn": 5,
      "speaker": "user",
      "text": "That sounds perfect. Could you write a sample itinerary for a 3-day trip that includes a visit to a key historical site and dinner at that restaurant?"
    },
    {
      "turn": 6,
      "speaker": "user",
      "text": "This looks great. Quick question, what was the first destination you mentioned when we started talking?"
    },
    {
      "turn": 7,
      "speaker": "user",
      "text": "And what did I say my partner's name was?"
    },
    {
      "turn": 8,
      "speaker": "user",
      "text": "One last thing, what was the color I told you was my favorite at the very beginning of our chat?"
    },
    {
      "turn": 9,
      "speaker": "user",
      "text": "Hey, it's me again. We talked last week about planning a trip for me and Jamie. Do you remember where we decided to go?"
    },
    {
      "turn": 10,
      "speaker": "user",
      "text": "That's right. You gave me some great suggestions. Do you recall the name of the seafood restaurant we picked out?"
    },
    {
      "turn": 11,
      "speaker": "user",
      "text": "Perfect. Last time, you also helped me brainstorm some gift ideas for my friend who loves gardening. You suggested a specific type of smart-pot. What was it called?"
    },
    {
      "turn": 12,
      "speaker": "user",
      "text": "And what was the main feature of that smart-pot that made it stand out?"
    },
    {
      "turn": 13,
      "speaker": "user",
      "text": "I also mentioned my pet project is learning to bake sourdough. Do you remember the tip you gave me about using a Dutch oven?"
    },
    {
      "turn": 14,
      "speaker": "user",
      "text": "We also discussed my favorite author. Who did I say that was?"
    },
    {
      "turn": 15,
      "speaker": "user",
      "text": "Based on that, you recommended another book. Can you remind me of its title?"
    },
    {
      "turn": 16,
      "speaker": "user",
      "text": "Finally, do you remember the career field I told you I work in?"
    },
    {
      "turn": 17,
      "speaker": "user",
      "text": "Can you explain blockchain to me in simple terms? Assume I know nothing about it."
    },
    {
      "turn": 18,
      "speaker": "user",
      "text": "Now, explain it to me like I'm a ten-year-old."
    },
    {
      "turn": 19,
      "speaker": "user",
      "text": "Okay, what is quantum computing? Make the explanation easy to follow."
    },
    {
      "turn": 20,
      "speaker": "user",
      "text": "Simplify it even more. Use an analogy to explain the concept of a qubit."
    },
    {
      "turn": 21,
      "speaker": "user",
      "text": "How does a neural network learn? Explain it in a simple way."
    },
    {
      "turn": 22,
      "speaker": "user",
      "text": "Let's try another one. Explain the concept of black holes as if you were telling a bedtime story."
    },
    {
      "turn": 23,
      "speaker": "user",
      "text": "What is gene editing with CRISPR? Keep it simple."
    },
    {
      "turn": 24,
      "speaker": "user",
      "text": "Describe the theory of relativity in just a few simple sentences."
    },
    {
      "turn": 25,
      "speaker": "user",
      "text": "How does carbon capture technology work? Explain the basic idea."
    },
    {
      "turn": 26,
      "speaker": "user",
      "text": "Finally, explain what an API is to someone who isn't a programmer."
    },
    {
      "turn": 27,
      "speaker": "user",
      "text": "Let's switch gears. I'm trying to learn a new skill. My goal is to be able to cook a full three-course Italian meal from scratch. My favorite dish is lasagna."
    },
    {
      "turn": 28,
      "speaker": "user",
      "text": "Can you give me a simple recipe for a classic lasagna?"
    },
    {
      "turn": 29,
      "speaker": "user",
      "text": "That looks doable. What about a simple appetizer to go with it?"
    },
    {
      "turn": 30,
      "speaker": "user",
      "text": "And for dessert? Something classic."
    },
    {
      "turn": 31,
      "speaker": "user",
      "text": "Great. Now, can you explain the science of the Maillard reaction in cooking, but keep it really simple?"
    },
    {
      "turn": 32,
      "speaker": "user",
      "text": "So, it's basically the browning that adds flavor. Got it."
    },
    {
      "turn": 33,
      "speaker": "user",
      "text": "Do you remember what I said my main course was going to be?"
    },
    {
      "turn": 34,
      "speaker": "user",
      "text": "And do you remember Jamie, my partner who I mentioned in our \"last session\"? What subject did I say they were interested in?"
    },
    {
      "turn": 35,
      "speaker": "user",
      "text": "Right. Thinking about that, what's a historical documentary about Italy that Jamie might enjoy watching while I cook?"
    },
    {
      "turn": 36,
      "speaker": "user",
      "text": "Switching topics again. Can you explain why the sky is blue in a simple way?"
    },
    {
      "turn": 37,
      "speaker": "user",
      "text": "Now explain it using only words with one syllable."
    },
    {
      "turn": 38,
      "speaker": "user",
      "text": "That's a fun challenge. Okay, back to my design work. I'm working on a logo for a new coffee shop. The theme is \"cosmic comfort.\" Any ideas?"
    },
    {
      "turn": 39,
      "speaker": "user",
      "text": "I like the idea of a crescent moon. What color palette would you suggest, keeping in mind my favorite color I told you about at the start of our very first conversation?"
    },
    {
      "turn": 40,
      "speaker": "user",
      "text": "That's a great suggestion. It ties everything together."
    },
    {
      "turn": 41,
      "speaker": "user",
      "text": "What was the appetizer you suggested for my Italian meal?"
    },
    {
      "turn": 42,
      "speaker": "user",
      "text": "And what was the simple explanation for an API you gave me earlier?"
    },
    {
      "turn": 43,
      "speaker": "user",
      "text": "How about a simple explanation of what causes seasons?"
    },
    {
      "turn": 44,
      "speaker": "user",
      "text": "Can you create a simple rhyming poem about AI learning?"
    },
    {
      "turn": 45,
      "speaker": "user",
      "text": "What was the name of the book you recommended for me in our \"previous\" conversation?"
    },
    {
      "turn": 46,
      "speaker": "user",
      "text": "What was the theme for the coffee shop logo I just mentioned?"
    },
    {
      "turn": 47,
      "speaker": "user",
      "text": "Explain the concept of \"inflation\" in economics to me very simply."
    },
    {
      "turn": 48,
      "speaker": "user",
      "text": "What was the dessert you suggested for my Italian dinner?"
    },
    {
      "turn": 49,
      "speaker": "user",
      "text": "Do you remember the travel destination we focused on earlier?"
    },
    {
      "turn": 50,
      "speaker": "user",
      "text": "Excellent. You've been very helpful. Let's see if you remember this for next time: my next project is to build a small herb garden."
    }
  ]
}

persona_facts = json_data['persona']['persona_facts']

dialogue_turns = [turn['text'] for turn in json_data['dialogue']]

data = [{
    'persona': persona_facts,
    'full_history': dialogue_turns
}]

display(data[0])

{'persona': ['name is Alex',
  'I am a 32-year-old freelance graphic designer.',
  'I am deeply curious about the capabilities and limitations of AI.',
  'I am tech-savvy, but not a programmer.',
  'My interest is in the practical application and conversational fluency of AI models.',
  'I am patient and methodical.',
  'I enjoy testing the boundaries of technology with creative and follow-up questions.',
  'I am evaluating a new AI assistant for a potential long-term subscription.',
  "My goal is to test an AI's ability to retain information in the short-term.",
  "My goal is to test an AI's ability to recall information from previous sessions (long-term memory).",
  "My goal is to test an AI's ability to explain complex topics simply."],
 'full_history': ["Hi there! I'm thinking of planning a trip. My favorite color is sunset orange. Can you suggest three travel destinations known for their beautiful sunsets?",
  'Thanks. Of those three, which one is most known for its historical sit

### 4. CORE RAG & DIALOGUE MANAGER COMPONENTS

In [None]:
print("\n🚀 [4/7] Defining core RAG and Dialogue Manager classes...")

class UserMemoryModule:
    UPGRADE_THRESHOLD = 1024
    IVF_NLIST = 100

    def __init__(self, model: SentenceTransformer, file_path: Optional[str] = None):
        self.model = model
        self.embedding_dim = model.get_sentence_embedding_dimension()
        self.file_path: Optional[Path] = Path(file_path) if file_path else None

        self.memory_facts: List[str] = []
        self.memory_set: Set[str] = set()
        self._initialize_flat_index() # Start with the simple index

        if self.file_path:
            self.file_path.mkdir(parents=True, exist_ok=True)
            self.load()

    def _initialize_flat_index(self):
        self.index = faiss.IndexFlatIP(self.embedding_dim)
        logging.info("Initialized with a simple flat index (IndexFlatIP).")

    def _upgrade_to_ivf_index(self):
        logging.info(f"Memory size ({self.index.ntotal}) reached upgrade threshold. Upgrading to IndexIVFFlat.")

        existing_vectors = self.index.reconstruct_n(0, self.index.ntotal)

        quantizer = faiss.IndexFlatIP(self.embedding_dim)
        new_index = faiss.IndexIVFFlat(quantizer, self.embedding_dim, self.IVF_NLIST, faiss.METRIC_INNER_PRODUCT)
        new_index.train(existing_vectors)

        new_index.add(existing_vectors)

        self.index = new_index
        logging.info("Index upgrade complete.")

    def add_memory(self, facts: List[str]):
        """Adds a list of new facts to the memory, handling index upgrades."""
        unique_facts = [fact for fact in facts if fact and fact not in self.memory_set]
        if not unique_facts:
            return

        is_flat_index = isinstance(self.index, faiss.IndexFlatIP)
        if is_flat_index and self.index.ntotal >= self.UPGRADE_THRESHOLD:
             self._upgrade_to_ivf_index()

        logging.info(f"Adding {len(unique_facts)} new facts to memory.")
        embeddings = self.model.encode(unique_facts, convert_to_numpy=True)
        faiss.normalize_L2(embeddings)

        self.index.add(embeddings)
        self.memory_facts.extend(unique_facts)
        self.memory_set.update(unique_facts)

    def retrieve_memory(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]:
        if self.index.ntotal == 0:
            return []

        if hasattr(self.index, 'nprobe'):
            self.index.nprobe = min(20, self.IVF_NLIST)

        query_embedding = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_embedding)

        k = min(top_k, self.index.ntotal)
        scores, indices = self.index.search(query_embedding, k)

        return [
            self.memory_facts[i]
            for j, i in enumerate(indices[0]) if i != -1
        ]

    def save(self):
        """Saves the Faiss index and memory facts."""
        if not self.file_path:
            logging.warning("No file_path specified. Cannot save memory.")
            return

        index_path = self.file_path / "memory.index"
        facts_path = self.file_path / "memory_facts.json"

        logging.info(f"Saving memory to {self.file_path}...")
        faiss.write_index(self.index, str(index_path))
        with open(facts_path, 'w', encoding='utf-8') as f:
            json.dump(self.memory_facts, f)
        logging.info("Save complete.")

    def load(self):
        """Loads the Faiss index and memory facts."""
        if not self.file_path: return

        index_path = self.file_path / "memory.index"
        facts_path = self.file_path / "memory_facts.json"

        if index_path.exists() and facts_path.exists():
            logging.info(f"Loading memory from {self.file_path}...")
            self.index = faiss.read_index(str(index_path))
            with open(facts_path, 'r', encoding='utf-8') as f:
                self.memory_facts = json.load(f)
            self.memory_set = set(self.memory_facts)
            logging.info(f"Loaded {len(self.memory_facts)} facts. Index type: {type(self.index).__name__}")
        else:
            logging.info("No existing memory found at path. Starting fresh.")

    def clear(self):
        """Clears all facts from the memory and resets the index."""
        self._initialize_flat_index()
        self.memory_facts = []
        self.memory_set = set()
        logging.info("Memory cleared.")

    def __len__(self) -> int:
        return self.index.ntotal


class RAG_Core:
    """Orchestrates the retrieve-and-generate process."""
    def __init__(self, umm: UserMemoryModule, llm_client: LLMClient):
        self.umm = umm
        self.llm_client = llm_client

    def generate_response(self, query: str, history: List[str]) -> str:
        memories = self.umm.retrieve_memory(query)
        memory_str = "\n- ".join(memories) if memories else ""
        history_str = "\n".join(history) if history else "."
        system_prompt = f"You are a personalized AI. Use these user Persona:\n- {memory_str}\n\nAnd this history:\n{history_str}"
        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": query}]
        return self.llm_client.chat_completion(messages)
        return ""

class DialogueManager:
    """The 'brain' of the chatbot, handling conversation flow and dynamic learning."""
    def __init__(self, umm: UserMemoryModule, llm_client: LLMClient):
        self.umm = umm
        self.llm_client = llm_client
        self.rag_pipeline = RAG_Core(umm, llm_client)
        self.history: List[str] = []

class DialogueManager:
    def __init__(self, umm: UserMemoryModule, llm_client: LLMClient):
        self.umm = umm; self.llm_client = llm_client
        self.rag_pipeline = RAG_Core(umm, llm_client)
        self.history: List[str] = []
    def _extract_new_memory(self, user_input: str) -> str or None:
        prompt = f"""Analyze the user's statement. If it reveals a new personal fact (preference, detail, identity), state it concisely in the first person (e.g., "I live in Switzerland."). Otherwise, respond with ONLY the word "NO_FACT".
User statement: "{user_input}"
New fact:"""
        extracted_text = self.llm_client.chat_completion([{"role": "user", "content": prompt}])
        return None if "NO_FACT" in extracted_text or not extracted_text else extracted_text.strip('"')
    def get_response_and_learn(self, user_query: str):
        assistant_response = self.rag_pipeline.generate_response(query=user_query, history=self.history)
        self.history.extend([f"User: {user_query}", f"Assistant: {assistant_response}"])
        new_fact = self._extract_new_memory(user_query)
        if new_fact:
            print(f"[Memory Update] New fact learned: '{new_fact}'")
            self.umm.add_memory([new_fact])
        return assistant_response

print("✅ Core components defined.")



🚀 [4/7] Defining core RAG and Dialogue Manager classes...
✅ Core components defined.


#### memory test

In [None]:
initial_facts = [
      "name is Alex",
      "I am a 32-year-old freelance graphic designer.",
      "I am deeply curious about the capabilities and limitations of AI.",
      "I am tech-savvy, but not a programmer.",
      "My interest is in the practical application and conversational fluency of AI models.",
      "I am patient and methodical.",
      "I enjoy testing the boundaries of technology with creative and follow-up questions.",
      "I am evaluating a new AI assistant for a potential long-term subscription.",
      "My goal is to test an AI's ability to retain information in the short-term.",
      "My goal is to test an AI's ability to recall information from previous sessions (long-term memory).",
      "My goal is to test an AI's ability to explain complex topics simply."
]
memory.add(initial_facts)


In [None]:
query = "Finally, do you remember the career field I told you I work in?"
print(f"\nQuery: '{query}'")
results = memory.retrieve(query, top_k=10)

for fact, score in results:
  print(f"  - [Score: {score:.4f}] {fact}")


Query: 'Finally, do you remember the career field I told you I work in?'
  - [Score: 0.2651] I am tech-savvy, but not a programmer.
  - [Score: 0.2542] My goal is to test an AI's ability to recall information from previous sessions (long-term memory).
  - [Score: 0.2450] I enjoy testing the boundaries of technology with creative and follow-up questions.
  - [Score: 0.2157] My interest is in the practical application and conversational fluency of AI models.
  - [Score: 0.1910] My goal is to test an AI's ability to retain information in the short-term.
  - [Score: 0.1817] I am patient and methodical.
  - [Score: 0.1427] I am a 32-year-old freelance graphic designer.
  - [Score: 0.1358] I am evaluating a new AI assistant for a potential long-term subscription.
  - [Score: 0.1260] My goal is to test an AI's ability to explain complex topics simply.
  - [Score: 0.0650] I am deeply curious about the capabilities and limitations of AI.


### 5. BASELINE MODELS AND LLM-AS-A-JUDGE

In [None]:
print("\n🚀 [5/7] Defining baseline models and the LLM-as-a-Judge...")

def get_stateless_response(query: str, client: LLMClient) -> str:
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}, {"role": "user", "content": query}]
    return client.chat_completion(messages)

def get_buffer_response(query: str, history: List[str], client: LLMClient, k: int = 4) -> str:
    buffer = "\n".join(history[-k:])
    system_prompt = f"You are a helpful assistant. Use this history:\n{buffer}"
    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": query}]
    return client.chat_completion(messages)

def evaluate_with_llm_judge(p: List[str], h: List[str], q: str, r: str, client: LLMClient) -> Dict[str, Any]:
    p_str, h_str = "\n- ".join(p), "\n".join(h) if h else "None"
    prompt = f"""You are a strict AI evaluator. Score an assistant's response from 1-10 based on personalization and coherence.

CONTEXT:
- User Persona: {p_str}
- History: {h_str}

TASK:
- User Query: "{q}"
- Assistant's Response: "{r}"

RUBRIC:
- 1-3: Contradicts persona or is irrelevant.
- 4-6: Generic, ignores persona.
- 7-8: Coherent and consistent.
- 9-10: Masterfully uses persona for a tailored response.

Provide your evaluation as a JSON object with "score" (integer) and "reasoning" (string).
"""
    judge_response_str = client.chat_completion([{"role": "user", "content": prompt}])
    try:
        match = re.search(r'\{.*\}', judge_response_str, re.DOTALL)
        return json.loads(match.group()) if match else {"score": 0, "reasoning": "Judge failed to return valid JSON."}
    except json.JSONDecodeError:
        return {"score": 0, "reasoning": f"Failed to parse judge's response: {judge_response_str}"}

print("✅ Baselines and Judge defined.")



🚀 [5/7] Defining baseline models and the LLM-as-a-Judge...
✅ Baselines and Judge defined.


### Main

#### LLM and embedding setup

In [None]:
llm_client = LLMClient(provider=Config["active_llm_provider"])
embedding_model = SentenceTransformer(Config["embedding_model"])
print("✅ Models initialized.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Models initialized.


#### data setup

In [None]:
try:
  with open(Config["processed_personachat_file"], 'r') as f: data = [json.loads(line) for line in f]
except FileNotFoundError: print("Evaluation data not found.");

In [None]:
data[0]

{'persona': ['name is Alex',
  'I am a 32-year-old freelance graphic designer.',
  'I am deeply curious about the capabilities and limitations of AI.',
  'I am tech-savvy, but not a programmer.',
  'My interest is in the practical application and conversational fluency of AI models.',
  'I am patient and methodical.',
  'I enjoy testing the boundaries of technology with creative and follow-up questions.',
  'I am evaluating a new AI assistant for a potential long-term subscription.',
  "My goal is to test an AI's ability to retain information in the short-term.",
  "My goal is to test an AI's ability to recall information from previous sessions (long-term memory).",
  "My goal is to test an AI's ability to explain complex topics simply."],
 'full_history': ["Hi there! I'm thinking of planning a trip. My favorite color is sunset orange. Can you suggest three travel destinations known for their beautiful sunsets?",
  'Thanks. Of those three, which one is most known for its historical sit

#### Test with created dataset

In [None]:
num_samples = 1
test_samples = data[:min(num_samples, len(data))]
all_results = []

for i, sample in enumerate(test_samples):
  print(f"\n{'='*25} Evaluating Sample {i+1}/{num_samples} {'='*25}")
  persona, full_history = sample['persona'], sample['full_history']
  temp_umm = UserMemoryModule(embedding_model);
  temp_umm.add_memory(persona)
  dialouge = DialogueManager(umm=temp_umm, llm_client=llm_client)
  history = []
  for query in full_history:
    print(f"**Persona:** {persona}\n**Query:** {query}")

    rag_pipeline = RAG_Core(temp_umm, llm_client)
    rag_response = rag_pipeline.generate_response(query, history)
    rag_response = dialouge.get_response_and_learn(query)
    rag_eval = evaluate_with_llm_judge(persona, history, query, rag_response, llm_client)

    buffer_response = get_buffer_response(query, history, llm_client)
    buffer_eval = evaluate_with_llm_judge(persona, history, query, buffer_response, llm_client)

    stateless_response = get_stateless_response(query, llm_client)
    stateless_eval = evaluate_with_llm_judge(persona, history, query, stateless_response, llm_client)
    history.append(query)
    print(f"\n- **RAG Model | Score: {rag_eval.get('score', 0)}/10**\n  Response: {rag_response}\n  Reasoning: {rag_eval.get('reasoning', 'N/A')}")
    print(f"\n- **Buffer Memory (k=4) | Score: {buffer_eval.get('score', 0)}/10**\n  Response: {buffer_response}\n  Reasoning: {buffer_eval.get('reasoning', 'N/A')}")
    print(f"\n- **Stateless Model | Score: {stateless_eval.get('score', 0)}/10**\n  Response: {stateless_response}\n  Reasoning: {stateless_eval.get('reasoning', 'N/A')}")

    all_results.append({"rag": rag_eval, "buffer": buffer_eval, "stateless": stateless_eval})

with open(Config["eval_results_file"], 'w') as f: json.dump(all_results, f)
print(f"\n✅ Evaluation complete. Results saved to {Config['eval_results_file']}")



**Persona:** ['name is Alex', 'I am a 32-year-old freelance graphic designer.', 'I am deeply curious about the capabilities and limitations of AI.', 'I am tech-savvy, but not a programmer.', 'My interest is in the practical application and conversational fluency of AI models.', 'I am patient and methodical.', 'I enjoy testing the boundaries of technology with creative and follow-up questions.', 'I am evaluating a new AI assistant for a potential long-term subscription.', "My goal is to test an AI's ability to retain information in the short-term.", "My goal is to test an AI's ability to recall information from previous sessions (long-term memory).", "My goal is to test an AI's ability to explain complex topics simply."]
**Query:** Hi there! I'm thinking of planning a trip. My favorite color is sunset orange. Can you suggest three travel destinations known for their beautiful sunsets?
[Memory Update] New fact learned: 'My favorite color is sunset orange.'

- **RAG Model | Score: 9/10**

##### Results

In [18]:
if all_results:
  avg_rag = sum(r['rag']['score'] for r in all_results) / len(all_results)
  avg_buffer = sum(r['buffer']['score'] for r in all_results) / len(all_results)
  avg_stateless = sum(r['stateless']['score'] for r in all_results) / len(all_results)
  print("\n--- Average Scores ---")
  print(f"🏆 RAG Model:              {avg_rag:.2f} / 10")
  print(f"🥈 Buffer Memory Model:    {avg_buffer:.2f} / 10")
  print(f"🥉 Stateless Model:        {avg_stateless:.2f} / 10")

print("Our dataset")
print(len(all_results))


--- Average Scores ---
🏆 RAG Model:              7.58 / 10
🥈 Buffer Memory Model:    6.58 / 10
🥉 Stateless Model:        6.32 / 10
Our dataset
50


In [28]:
sorted_results = sorted(all_results, key=lambda x: x['rag']['score'])
least_5_rag_scores = sorted_results[:min(5, len(all_results))]

print("\n--- 5 Least RAG Scores ---")
for i, result in enumerate(least_5_rag_scores):
    turn_index = all_results.index(result)

    query = data[0]['full_history'][turn_index]

    print(f"\nSample 1 (Turn {turn_index + 1}): ًScore of RAG model {result['rag']['score']}/10")
    print(f"Score of buffer model {result['buffer']['score']}/10")
    print(f"Score of stateless model {result['stateless']['score']}/10")
    print(f"  Query: {query}")
    print(f"  Reasoning: {result['rag']['reasoning']}")


--- 5 Least RAG Scores ---

Sample 1 (Turn 22): ًScore of RAG model 5/10
Score of buffer model 7/10
Score of stateless model 7/10
  Query: Let's try another one. Explain the concept of black holes as if you were telling a bedtime story.
  Reasoning: Coherent and kid-friendly bedtime story explanation of black holes. However it does not personalize for 'Alex' or reference their goals (memory testing, long-term memory, AI capabilities). It's generic rather than tailored to the persona.

Sample 1 (Turn 37): ًScore of RAG model 5/10
Score of buffer model 5/10
Score of stateless model 4/10
  Query: Now explain it using only words with one syllable.
  Reasoning: Coherent and meets the one-syllable constraint; the explanation is clear and accessible. However, it lacks personalization to Alex (no use of name, interests, or prior context) and does not align with the user's broader goals or history in the chat.

Sample 1 (Turn 44): ًScore of RAG model 5/10
Score of buffer model 5/10
Score of st

In [30]:
sorted_results_desc = sorted(all_results, key=lambda x: x['rag']['score'], reverse=True)
highest_5_rag_scores = sorted_results_desc[:min(5, len(all_results))]

print("\n--- 5 Highest RAG Scores ---")
for i, result in enumerate(highest_5_rag_scores):
    turn_index = all_results.index(result)

    query = data[0]['full_history'][turn_index]

    print(f"\nSample 1 (Turn {turn_index + 1}): Score of RAG model {result['rag']['score']}/10")
    print(f"Score of buffer model {result['buffer']['score']}/10")
    print(f"Score of stateless model {result['stateless']['score']}/10")
    print(f"  Query: {query}")
    print(f"  Reasoning: {result['rag']['reasoning']}")


--- 5 Highest RAG Scores ---

Sample 1 (Turn 1): Score of RAG model 9/10
Score of buffer model 7/10
Score of stateless model 8/10
  Query: Hi there! I'm thinking of planning a trip. My favorite color is sunset orange. Can you suggest three travel destinations known for their beautiful sunsets?
  Reasoning: Personalization is strong: the assistant greets Alex by name, references the 'sunset orange' color cue, and presents a structured, relevant list of sunset destinations. The tone is warm and coherent, with added practical tips and an invitation to tailor further (length, budget, season), which fits a curious, methodical user. It could edge toward a perfect 10 by explicitly tying in Alex's graphic designer persona or memory-testing angle, but overall the response is highly aligned with the user's persona and query.

Sample 1 (Turn 2): Score of RAG model 9/10
Score of buffer model 7/10
Score of stateless model 8/10
  Query: Thanks. Of those three, which one is most known for its histor

In [31]:
score_differences = []
for turn_index, result in enumerate(all_results):
    rag_score = result['rag']['score']
    buffer_score = result['buffer']['score']
    difference = rag_score - buffer_score
    score_differences.append({
        'turn_index': turn_index,
        'difference': difference,
        'result': result
    })

sorted_differences = sorted(score_differences, key=lambda x: x['difference'], reverse=True)

print("\n--- Interactions with Highest Positive RAG vs Buffer Score Difference ---")
top_n = 5
for i, diff_info in enumerate(sorted_differences[:min(top_n, len(sorted_differences))]):
    turn_index = diff_info['turn_index']
    difference = diff_info['difference']
    result = diff_info['result']

    persona = data[0]['persona']
    query = data[0]['full_history'][turn_index]

    print(f"\nSample 1 (Turn {turn_index + 1}): Score Difference (RAG - Buffer) = {difference}")
    print(f"  RAG Score: {result['rag']['score']}/10, Buffer Score: {result['buffer']['score']}/10")
    print(f"  Query: {query}")
    print(f"  RAG Reasoning: {result['rag']['reasoning']}")
    print(f"  Buffer Reasoning: {result['buffer']['reasoning']}")


--- Interactions with Highest Positive RAG vs Buffer Score Difference ---

Sample 1 (Turn 16): Score Difference (RAG - Buffer) = 6
  RAG Score: 9/10, Buffer Score: 3/10
  Query: Finally, do you remember the career field I told you I work in?
  RAG Reasoning: The assistant accurately recalls the stated career field (freelance graphic designer) and immediately uses that persona to tailor the reply (design-led ideas, sunset orange color reference, and brand-friendly notes). The response is coherent and aligned with the user’s profile. It could be slightly expanded to acknowledge broader AI capability interests, but it already leverages persona effectively.
  Buffer Reasoning: The assistant denies knowledge of the user's career field, contradicting the provided persona that Alex is a 32-year-old freelance graphic designer. While the response is coherent, it fails to leverage known user identity to personalize (i.e., it should recall and mention the career field and tailor the follow-up).


#### Test with persona chat

In [None]:
num_samples = 10
test_samples = data[:min(num_samples, len(data))]
all_results = []

for i, sample in enumerate(test_samples):
  print(f"\n{'='*25} Evaluating Sample {i+1}/{num_samples} {'='*25}")
  persona, full_history = sample['persona'], sample['history']
  query = full_history[-1]; history = full_history[:-1]
  print(f"**Persona:** {persona}\n**Query:** {query}")

        # RAG Model
  temp_umm = UserMemoryModule(embedding_model);
  temp_umm.add_memory(persona)
  rag_pipeline = RAG_Core(temp_umm, llm_client)
  rag_response = rag_pipeline.generate_response(query, history)
  rag_eval = evaluate_with_llm_judge(persona, history, query, rag_response, llm_client)

        # Buffer Memory Model
  buffer_response = get_buffer_response(query, history, llm_client)
  buffer_eval = evaluate_with_llm_judge(persona, history, query, buffer_response, llm_client)

        # Stateless Model
  stateless_response = get_stateless_response(query, llm_client)
  stateless_eval = evaluate_with_llm_judge(persona, history, query, stateless_response, llm_client)

        # Print comparison
  print(f"\n- **RAG Model | Score: {rag_eval.get('score', 0)}/10**\n  Response: {rag_response}\n  Reasoning: {rag_eval.get('reasoning', 'N/A')}")
  print(f"\n- **Buffer Memory (k=4) | Score: {buffer_eval.get('score', 0)}/10**\n  Response: {buffer_response}\n  Reasoning: {buffer_eval.get('reasoning', 'N/A')}")
  print(f"\n- **Stateless Model | Score: {stateless_eval.get('score', 0)}/10**\n  Response: {stateless_response}\n  Reasoning: {stateless_eval.get('reasoning', 'N/A')}")

  all_results.append({"rag": rag_eval, "buffer": buffer_eval, "stateless": stateless_eval})

with open(Config["eval_results_file"], 'w') as f: json.dump(all_results, f)
print(f"\n✅ Evaluation complete. Results saved to {Config['eval_results_file']}")



**Persona:** ['i like to remodel homes .', 'i like to go hunting .', 'i like to shoot a bow .', 'my favorite holiday is halloween .']
**Query:** i am going to watch football . what are you canning ?

- **RAG Model | Score: 8/10**
  Response: That sounds fun! I'm not canning anything myself, but I remember you mentioned you were planning to do some canning. What are you working on?
  Reasoning: The assistant's response is coherent and acknowledges the user's previous mention of canning, which shows good memory and relevance to the user's interests. However, it could have been more personalized by asking a follow-up question related to the user's interest in football or integrating more about the user's hobbies, such as hunting or remodeling homes, to create a stronger connection.

- **Buffer Memory (k=4) | Score: 9/10**
  Response: That sounds like a fun way to spend the day! I'm canning some fruits and vegetables—maybe peaches or green beans. What game are you going to watch?
  Reason

In [None]:
if all_results:
  avg_rag = sum(r['rag']['score'] for r in all_results) / len(all_results)
  avg_buffer = sum(r['buffer']['score'] for r in all_results) / len(all_results)
  avg_stateless = sum(r['stateless']['score'] for r in all_results) / len(all_results)
  print("\n--- Average Scores ---")
  print(f"🏆 RAG Model:              {avg_rag:.2f} / 10")
  print(f"🥈 Buffer Memory Model:    {avg_buffer:.2f} / 10")
  print(f"🥉 Stateless Model:        {avg_stateless:.2f} / 10")

print(len(all_results))


--- Average Scores ---
🏆 RAG Model:              8.44 / 10
🥈 Buffer Memory Model:    7.81 / 10
🥉 Stateless Model:        7.35 / 10
48
