In [1]:
import chromadb
import polars as pl
from chromadb.config import Settings, DEFAULT_TENANT, DEFAULT_DATABASE
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import torch
import re

In [2]:
# --- Setup ChromaDB client ---
# Open the on-disk Chroma store at ../chroma_db. The collections queried in the
# retrieval cells are all obtained through this client.
chroma_settings = Settings()
client = chromadb.PersistentClient(
    database=DEFAULT_DATABASE,
    tenant=DEFAULT_TENANT,
    settings=chroma_settings,
    path="../chroma_db",
)

In [3]:
# --- Load models ---
# All models are read from local checkpoints under ../models.

# Text encoder used to embed the free-text user query.
desc_encoder = SentenceTransformer("../models/all-MiniLM-L6-v2")

# Cross-encoder that scores (query, candidate text) pairs for reranking.
# NOTE(review): loading this checkpoint logs that the classifier head weights
# are "newly initialized", and the reranked listings show the same rounded
# score (0.51) for different candidates. Verify that the checkpoint on disk
# contains the trained classification head before trusting the ranking.
reranker = CrossEncoder("../models/bge-reranker-base")

# Pin the slow image processor explicitly. Without `use_fast`, transformers
# warns that the default will switch to the fast processor in v4.52, which
# yields slightly different pixel values and therefore different embeddings.
# Pinning keeps query embeddings stable across library upgrades.
processor = AutoImageProcessor.from_pretrained("../models/dinov2-base", use_fast=False)

# Vision encoder used to embed the query image.
viz_encoder = AutoModel.from_pretrained("../models/dinov2-base")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../models/bge-reranker-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# --- Load metadata and critics ---
# Game index: the candidate-building cells filter this frame on "id" and read
# its "title" and "description" columns.
metadata = pl.read_ndjson('../data/mobygames_index.jsonl')
# Critic review records: get_all_critic_quotes() filters this frame on
# "review_id" and reads its "review" column when that column is present.
critics_emb = pl.read_ndjson('../embeddings/critics_embeddings.jsonl')

In [5]:
# --- Utility Functions ---
def clean_html(text):
    """Strip HTML-like tags from ``text`` and return the remaining plain text.

    Args:
        text: A string, any object convertible with ``str()``, or a
            single-value container exposing ``.item()`` (for example a
            one-element Series), which is unwrapped first.

    Returns:
        The text with every ``<...>`` span removed. A missing value
        (``None``, either passed directly or obtained from ``.item()``)
        yields ``""`` rather than the literal string ``"None"``.
    """
    # Unwrap single-value containers to the scalar they hold.
    if hasattr(text, 'item'):
        text = text.item()
    # str(None) would inject the word "None" into downstream candidate text.
    if text is None:
        return ""
    return re.sub('<[^<]+?>', '', str(text))

def get_all_critic_quotes(game_id, critics_collection):
    """Collect the cleaned critic review texts for one game.

    The ids of the records in ``critics_collection`` whose ``game_id``
    metadata equals ``str(game_id)`` are fetched (ids only). Each id is then
    looked up, as an integer ``review_id``, in the module-level
    ``critics_emb`` frame.

    Returns:
        A list with one entry per id that has a matching row: the
        tag-stripped ``review`` text of the first matching row, or ``""``
        when the frame has no ``review`` column. Ids without a matching row
        are skipped.
    """
    matching = critics_collection.get(where={"game_id": str(game_id)}, include=[])

    collected = []
    for rid in matching["ids"]:
        hits = critics_emb.filter(pl.col("review_id") == int(rid))
        if hits.height == 0:
            continue
        if "review" in hits.columns:
            collected.append(clean_html(hits[0]["review"]))
        else:
            collected.append("")
    return collected

In [11]:
# --- TEXT QUERY RETRIEVAL ---
# Embed the free-text query with the description encoder.
user_query = "I want a game about furry animals."
query_embedding = desc_encoder.encode(user_query)

# Collection handles: descriptions are searched directly below; the critics
# collection is passed to get_all_critic_quotes() when candidates are built.
desc_collection = client.get_or_create_collection("desc_embeddings")
critics_collection = client.get_or_create_collection("critics_embeddings")

# Ask for up to 100 description matches, returning metadata and distances.
desc_query_args = {
    "query_embeddings": [query_embedding],
    "n_results": 100,
    "include": ["metadatas", "distances"],
}
desc_results = desc_collection.query(**desc_query_args)

In [12]:
# Build candidates list with all critic quotes
candidates = []
for hit_meta in desc_results["metadatas"][0]:
    game_id = int(hit_meta["game_id"])
    matches = metadata.filter(pl.col("id") == game_id)
    if matches.height == 0:
        # No entry in the game index for this id: leave it out.
        continue
    game = matches[0]
    quotes = get_all_critic_quotes(game_id, critics_collection)
    candidates.append(
        dict(
            title=clean_html(game["title"]),
            description=clean_html(game["description"]),
            # Joining an empty list already yields "".
            critic=" | ".join(quotes),
        )
    )

In [13]:
# Prepare (query, candidate) pairs for reranking
# Each candidate is flattened to "<title>. <description> Critics: <quotes>";
# description and critic text are passed through clean_html once more.
cleaned_candidates = (
    (c, clean_html(c['description']), clean_html(c['critic']))
    for c in candidates
)
pairs = [
    [user_query, f"{c['title']}. {desc} Critics: {critic}"]
    for c, desc, critic in cleaned_candidates
]

In [14]:
# Compute reranker scores
# The result is zipped positionally with `candidates` in the next cell, so it
# is expected to hold one score per entry of `pairs`, in the same order.
# NOTE(review): the model-loading output warns that the reranker's classifier
# weights were newly initialized, and the printed results show the same rounded
# score (0.51) for different candidates -- confirm the scores are meaningful.
scores = reranker.predict(pairs, batch_size=16)

In [15]:
# Rerank and print
# Pair every candidate with its score, then order the pairs by score,
# highest first (the sort is stable, so ties keep retrieval order).
reranked = list(zip(candidates, scores))
reranked.sort(key=lambda scored: scored[1], reverse=True)

print("Reranked candidates (text query):")
for i, (c, score) in enumerate(reranked, start=1):
    print(f"{i}. {c['title']} (score: {score:.2f})")
    print(f"   Description: {c['description']}")
    print(f"   Critics: {c['critic']}\n")

Reranked candidates (text query):
1. Pet Vet (score: 0.51)
   Description: Pet Vet is a single player simulation of a veterinary practice.
In this game the player treats a variety of animals with afflictions ranging from a minor scrape to something life threatening and requiring surgery. To do this they follow a process which begins with talking to the owner and examining the patient through to diagnosis, treatment and in some cases surgery.
On the player’s desk there is a PC which contains three mini games: a Concentration game, a jigsaw game and an animal-themed Trivia Quiz. On the same in-game PC there is a game guide, details of the tools used and an index of animals.
There is no text dialogue in this game; everything is spoken aloud, so speakers are needed to play it.
   Critics: 

2. Amazing Animals: Activity Center (score: 0.51)
   Description: Amazing Animals: Activity Center is an educational game based on the TV show Amazing Animals. It features five mini games:

Copy Cat is 

In [16]:
# --- IMAGE QUERY RETRIEVAL AND RERANKING ---
user_image = "../data/sample.jpg"
# Open the file inside a context manager so its handle is closed
# deterministically. convert() loads the pixel data and returns a new image
# that stays valid after the file is closed.
with Image.open(user_image) as raw_image:
    image = raw_image.convert("RGB")

# Preprocess into PyTorch tensors and move them to the encoder's device.
inputs = processor(images=image, return_tensors="pt")
inputs = {k: v.to(viz_encoder.device) for k, v in inputs.items()}

# Get the embedding
with torch.no_grad():
    outputs = viz_encoder(**inputs)
    # Token 0 of the last hidden state is used as the image embedding
    # (presumably the CLS token -- confirm it matches how the cover
    # collection was indexed). squeeze() drops the batch dimension.
    image_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

# Ask for up to 100 cover matches, returning metadata and distances.
cover_collection = client.get_or_create_collection("cover_embeddings")
cover_results = cover_collection.query(
    query_embeddings=[image_embedding],
    n_results=100,
    include=["metadatas", "distances"]
)

In [17]:
# Build candidates list for image query
candidates = []
for cover_meta in cover_results["metadatas"][0]:
    game_id = int(cover_meta["game_id"])
    index_rows = metadata.filter(pl.col("id") == game_id)
    if index_rows.height == 0:
        # Cover hit without an entry in the game index: skip it.
        continue
    game = index_rows[0]
    quotes = get_all_critic_quotes(game_id, critics_collection)
    entry = {}
    entry["title"] = clean_html(game["title"])
    entry["description"] = clean_html(game["description"])
    # Joining an empty list already yields "".
    entry["critic"] = " | ".join(quotes)
    candidates.append(entry)

In [18]:
# Prepare (query, candidate) pairs for reranking (use a generic or user query)
# The text query defined in the text-retrieval cell is reused to score the
# games retrieved by cover similarity.
cleaned_candidates = (
    (c, clean_html(c['description']), clean_html(c['critic']))
    for c in candidates
)
pairs = [
    [user_query, f"{c['title']}. {desc} Critics: {critic}"]
    for c, desc, critic in cleaned_candidates
]

scores = reranker.predict(pairs, batch_size=16)

# Order the (candidate, score) pairs by score, highest first.
reranked = list(zip(candidates, scores))
reranked.sort(key=lambda scored: scored[1], reverse=True)

print("Reranked candidates (by cover similarity, then text):")
for i, (c, score) in enumerate(reranked, start=1):
    print(f"{i}. {c['title']} (score: {score:.2f})")
    print(f"   Description: {c['description']}")
    print(f"   Critics: {c['critic']}\n")

Reranked candidates (by cover similarity, then text):
1. Creatures (score: 0.51)
   Description: The inhabitants of the pretty but insignificant planet Blot decided to leave to find a trendier planet, and escape being called the Blotians, a most unhip name.  Until they such a planet, they call themselves the Fuzzy Wuzzies.
Their colony ship collides with an asteroid, and crash lands in the Pacific Ocean, near an unchartered island, which they colonise, calling it "The Hippest Place in The Known
Universe".  Unfortunately the island is also home to rather nasty demons, who can't stand the commotion caused by the happy Fuzzies, or their silly name for the island.
The demons kidnap the Fuzzy Wuzzies by hosting a massive party, then capturing them in a net, and dragging them off to the torture chambers.  The only Fuzzy not caught is Clyde Radcliff, who was already being sick in the bushes.  He wakes up in the morning with a really bad headache, and fiery halitosis, and sets out to destroy t