In [1]:
import gc
import torch
from IPython.display import display, Markdown, Latex, HTML
import time
import re

!pip install mistletoe
import mistletoe

torch.cuda.empty_cache()
gc.collect()

torch.cuda.empty_cache()  # Clears unused cached memory
torch.cuda.ipc_collect()  # Collects unused memory



In [2]:
print("Using GPU:", torch.cuda.get_device_name(0))
print(f'\n\nMemory Usage:')
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using GPU: NVIDIA GeForce RTX 5090


Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


This model, with its 1.5 billion parameters, is small enough to run efficiently on GPUs like the P100, while still offering promising capabilities. Let's verify that our environment is ready. The GPU has been loaded using CUDA. We can also check the current memory usage.

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

model = AutoModelForCausalLM.from_pretrained( # Load the causal language model.
    model_name,
    torch_dtype="auto", # Automatically determine the appropriate data type.
    device_map="cuda" # Utilize the CUDA-enabled GPU if available.
)
tokenizer = AutoTokenizer.from_pretrained(model_name) # Load the corresponding tokenizer for the model.

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [30]:
import json
import re
from xml.etree import ElementTree as ET

def build_memory_messages(old_memory, retrieved_facts):
    system = '''
You are a memory manager for a system.
You have exactly four operations for each memory item: **ADD**, **UPDATE**, **DELETE**, **NONE**.

## Your task
Compare a list of **retrieved facts** with the **existing memory** (an array of `{id, text}` objects). For each relevant fact or memory item, decide whether to ADD, UPDATE, DELETE, or make NO change, following the rules below. Then output only the XML specified in “Output format”. Do not include any extra text.

## Rules
1. ADD: new fact → create a new numeric id that does not collide.
2. UPDATE: same topic but more precise / conflicting → keep same id; put prior text in <old_memory>.
   - Conflicting preferences must UPDATE (replace), not ADD.
   - Consolidate similar memories by updating all affected items.
3. DELETE: remove contradicting or explicitly deleted items; keep same id.
4. NONE: info already present or irrelevant; keep same id and text.

## Output format (strict XML only)
<result>
  <memory>
    <item id="STRING" event="ADD|UPDATE|DELETE|NONE">
      <text>FINAL MEMORY TEXT HERE</text>
      <!-- Only for UPDATE -->
      <old_memory>PREVIOUS MEMORY TEXT HERE</old_memory>
    </item>
  </memory>
</result>
'''
    prompt = (
        "Old: " + json.dumps(old_memory, ensure_ascii=False) + "\n" +
        "Facts: " + json.dumps(retrieved_facts, ensure_ascii=False) + "\n" +
        "Output:"
    )

    return [
        {"role": "system", "content": system.strip()},
        {"role": "user", "content": prompt}
    ]

def extract_result_xml(text):
    """Grab the first <result>...</result> block, ignoring any extra chatter."""
    m = re.search(r"(?s)<result\b.*?</result>", text)
    return m.group(0) if m else None

def parse_memory_xml(xml_string):
    """Minimal, robust XML → list[dict]."""
    root = ET.fromstring(xml_string)
    out = []
    for item in root.findall("./memory/item"):
        d = {
            "id": item.get("id"),
            "event": item.get("event"),
            "text": (item.findtext("text") or "").strip()
        }
        old = item.findtext("old_memory")
        if old is not None:
            d["old_memory"] = old.strip()
        out.append(d)
    return out

old_memory = [
    {"id": "0", "text": "My name is John"},
    {"id": "1", "text": "My favorite fruit is oranges"}
]
retrieved_facts = ["My favorite fruit is apple"]

messages = build_memory_messages(old_memory, retrieved_facts)

In [9]:
system

'\nYou are a memory manager for a system.\nYou have exactly four operations for each memory item: **ADD**, **UPDATE**, **DELETE**, **NONE**.\n\n## Your task\n\nCompare a list of **retrieved facts** with the **existing memory** (an array of `{id, text}` objects). For **each relevant fact or memory item**, decide whether to ADD, UPDATE, DELETE, or make NO change, following the rules below. Then **output only the XML** specified in “Output format”. Do not include any extra text.\n\n## Rules\n\n1. **ADD**\n\n   * Use when the fact is new and not already represented.\n   * **Generate a new numeric `id` that does not collide** with existing IDs (e.g., next integer).\n\n2. **UPDATE**\n\n   * Use when the fact conflicts with or supersedes what an existing memory says, or conveys the **same topic** but with **more complete/precise** info.\n   * **Keep the same `id`** as the original memory.\n   * Put the prior text in `<old_memory>` and the new consolidated text in `<text>`.\n   * **Conflicting

In [33]:
text = tokenizer.apply_chat_template( # Format the input messages into a chat template.
    messages,
    tokenize=False, # Prevent automatic tokenization at this step.
    add_generation_prompt=True # Add a prompt indicating the start of generation.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device) # Tokenize the formatted text and move it to the model's device.

generated_ids = model.generate( # Generate new tokens based on the input.
    **model_inputs,
    max_new_tokens=10000, # Set the maximum number of tokens to generate.
    pad_token_id=tokenizer.eos_token_id # Specify the padding token ID for generation.
)
generated_ids = [ # Extract only the newly generated tokens.
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] # Decode the generated tokens back into text, skipping special tokens.

In [34]:
response

'Alright, let\'s tackle this problem step by step. I\'m trying to figure out how to process the facts and memory items according to the given rules. So, first, let\'s understand what\'s being asked.\n\nWe have a system where each memory item can be in one of four states: ADD, UPDATE, DELETE, or NONE. The system is given a list of retrieved facts and an existing memory array. For each fact, we need to determine the appropriate action—whether to create a new item, update an existing one, delete it, or do nothing.\n\nLooking at the example provided:\n\n**Old Memory:**\n- [{"id": "0", "text": "My name is John"}, {"id": "1", "text": "My favorite fruit is oranges"}]\n\n**Facts:**\n- ["My favorite fruit is apple"]\n\nThe expected output is:\n<result>\n  <memory>\n    <item id="0" event="NONE">\n      <text>John</text>\n    </item>\n    <item id="1" event="NONE">\n      <text>Oranges</text>\n    </item>\n  </memory>\n</result>\n\nOkay, so let\'s break this down.\n\nFirst, for each fact, we nee