In [1]:
import chromadb
import polars as pl
from chromadb.config import Settings, DEFAULT_TENANT, DEFAULT_DATABASE
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import torch
import re

In [2]:
# --- Setup ChromaDB client ---
client = chromadb.PersistentClient(
    path="../chroma_db",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

In [6]:
# --- Load models ---
desc_encoder = SentenceTransformer("../models/all-MiniLM-L6-v2")
reranker = CrossEncoder("../models/bge-reranker-base-crossencoder")
processor = AutoImageProcessor.from_pretrained("../models/dinov2-base")
viz_encoder = AutoModel.from_pretrained("../models/dinov2-base")

In [7]:
# --- Load metadata and critics ---
metadata = pl.read_ndjson('../data/mobygames_index.jsonl')
critics_emb = pl.read_ndjson('../embeddings/critics_embeddings.jsonl')

In [8]:
# --- Utility Functions ---
def clean_html(text):
    if hasattr(text, 'item'):
        text = text.item()
    return re.sub('<[^<]+?>', '', str(text))

def get_all_critic_quotes(game_id, critics_collection):
    results = critics_collection.get(
        where={"game_id": str(game_id)},
        include=[]
    )
    quotes = []
    for review_id in results["ids"]:
        critic_row = critics_emb.filter(pl.col("review_id") == int(review_id))
        if critic_row.height > 0:
            quotes.append(clean_html(critic_row[0]["review"]) if "review" in critic_row.columns else "")
    return quotes

In [9]:
# --- TEXT QUERY RETRIEVAL ---
user_query = "I want a game about furry animals."
query_embedding = desc_encoder.encode(user_query)

desc_collection = client.get_or_create_collection("desc_embeddings")
critics_collection = client.get_or_create_collection("critics_embeddings")

desc_results = desc_collection.query(
    query_embeddings=[query_embedding],
    n_results=50,
    include=["metadatas", "distances"]
)
critics_results = critics_collection.query(
    query_embeddings=[query_embedding],
    n_results=50,
    include=["metadatas", "distances"]
)

In [10]:
# Build candidates list with all critic quotes
candidates = []
game_ids = set()
for meta_list in [desc_results["metadatas"][0], critics_results["metadatas"][0]]:
    for meta in meta_list:
        game_id = int(meta["game_id"])
        if game_id not in game_ids:
            row = metadata.filter(pl.col("id") == game_id)
            if row.height > 0:
                row = row[0]
                critic_quotes = get_all_critic_quotes(game_id, critics_collection)
                critic_text = " | ".join(critic_quotes) if critic_quotes else ""
                candidates.append({
                    "title": clean_html(row["title"]),
                    "description": clean_html(row["description"]),
                    "critic": critic_text,
                })
                game_ids.add(game_id)

In [11]:
# Prepare (query, candidate) pairs for reranking
pairs = []
for c in candidates:
    desc = clean_html(c['description'])
    critic = clean_html(c['critic'])
    candidate_text = f"{c['title']}. {desc} Critics: {critic}"
    pairs.append([user_query, candidate_text])

In [12]:
# Compute reranker scores
scores = reranker.predict(pairs, batch_size=16)

In [13]:
# Rerank and print
reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
for i, (c, score) in enumerate(reranked, 1):
    print(f"{i}. {c['title']} (score: {score:.2f})")
    print(f"   Description: {c['description']}")
    print(f"   Critics: {c['critic']}\n")

1. Furry Woof (score: 0.79)
   Description: Furry Woof is a simple jigsaw puzzle game in which the player must assemble images of artwork depicting female "furries" (anthropomorphic animals) in various states of undress. As the name suggests, the furries depicted in this game are anthropomorphic dogs. The player must use the mouse to click and drag the jigsaw pieces onto their correct positions on the grid. The images range from being safe-for-work to suggestive to featuring full nudity. The game has a calm soundtrack that can be disabled by clicking on the music icon on the top left of the screen.
   Critics: 

2. Furry Woof and Nya (score: 0.55)
   Description: Furry Woof and Nya is a simple jigsaw puzzle game in which the player must assemble images of artwork depicting female "furries" (anthropomorphic animals) in various states of undress. The furries depicted in this game are anthropomorphic cats and dogs. The player must use the mouse to click and drag the jigsaw pieces onto the

In [14]:
# --- IMAGE QUERY RETRIEVAL AND RERANKING ---
user_image = "../data/sample.jpg"
image = Image.open(user_image).convert("RGB")
inputs = processor(images=image, return_tensors="pt")
inputs = {k: v.to(viz_encoder.device) for k, v in inputs.items()}

# Get the embedding
with torch.no_grad():
    outputs = viz_encoder(**inputs)
    image_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

cover_collection = client.get_or_create_collection("cover_embeddings")
screenshot_collection = client.get_or_create_collection("screenshot_embeddings")

cover_results = cover_collection.query(
    query_embeddings=[image_embedding],
    n_results=50,
    include=["metadatas", "distances"]
)
screenshot_results = screenshot_collection.query(
    query_embeddings=[image_embedding],
    n_results=50,
    include=["metadatas", "distances"]
)

In [15]:
# Build candidates list for image query
candidates = []
game_ids = set()
for meta_list in [cover_results["metadatas"][0], screenshot_results["metadatas"][0]]:
    for meta in meta_list:
        game_id = int(meta["game_id"])
        if game_id not in game_ids:
            row = metadata.filter(pl.col("id") == game_id)
            if row.height > 0:
                row = row[0]
                critic_quotes = get_all_critic_quotes(game_id, critics_collection)
                critic_text = " | ".join(critic_quotes) if critic_quotes else ""
                candidates.append({
                    "title": clean_html(row["title"]),
                    "description": clean_html(row["description"]),
                    "critic": critic_text,
                })
                game_ids.add(game_id)

In [18]:
# Prepare (query, candidate) pairs for reranking (use a generic or user query)
cover_scores = {int(meta["game_id"]): 1 - dist for meta, dist in zip(cover_results["metadatas"][0], cover_results["distances"][0])}
screenshot_scores = {int(meta["game_id"]): 1 - dist for meta, dist in zip(screenshot_results["metadatas"][0], screenshot_results["distances"][0])}

for c in candidates:
    row = metadata.filter(pl.col("title") == c["title"])
    if row.height > 0:
        gid = row["id"].to_list()[0]
        c["score"] = cover_scores.get(gid, 0) + screenshot_scores.get(gid, 0)
    else:
        c["score"] = 0

reranked = sorted(candidates, key=lambda x: x["score"], reverse=True)
for i, c in enumerate(reranked, 1):
    print(f"{i}. {c['title']} (score: {c['score']:.2f})")
    print(f"   Description: {c['description']}")
    print(f"   Critics: {c['critic']}\n")

1. Final Fantasy (score: 0.00)
   Description: This release of Final Fantasy is built upon the Game Boy Advance version of the game. It retains the bonus dungeons, bestiary and unlockable BGM music player from said version, as well as the CG intro from the Playstation version. It also features improved graphics done in 16:9 format with larger fonts and higher quality of music and sound effects.
There is also an entirely new dungeon dubbed "The Labyrinth of Time" which enforces a time limit on the player, as well as an unlockable Yoshitaka Amano art gallery in this version.
   Critics:  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  | 

2. Skullgirls (score: 0.00)
   Description: None
   Critics: 

3. Guardians (score: 0.00)
   Description: Guardians or Denjinmakai II is a 2D side-scrolling beat 'em-up and sequel to Denjinmakai taking place in a futuristic setting and eight playable character to choose from. Returning characters are: wrestler Tarukusu (now named Tu