## Task 2: Multi-Modality Similarity Search

The **second task** we define for our **Game Recommendation Assistant** is multi-modal similarity search, which covers the following three retrieval modes:

**Text-based retrieval**: Given a game description or textual query such as “open-world fantasy adventure”, the system retrieves game covers (images), trailers (videos), and descriptions (texts) of games with similar themes, genres, or narrative elements.

**Image-based retrieval**: Given a game cover or in-game snapshot, the system retrieves visually and semantically related games, including similar covers (images), trailers (videos), and descriptions (texts) that share comparable art styles, visual motifs, or atmosphere.

**Video-based retrieval**: Given a game trailer, the system retrieves trailers (videos), covers (images), and descriptions (texts) of games with a similar visual tone, gameplay style, or mood.

In [1]:
# Importing useful dependencies
import io
import boto3
import torch
import chromadb
import open_clip
import numpy as np
from PIL import Image
from io import BytesIO
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Set up an S3 client for MinIO (MinIO implements the Amazon S3 API), so the
# boto3 S3 calls used by the helper functions below are sent to this endpoint.
# NOTE(review): the credentials are hard-coded; presumably these are local
# development defaults — confirm before reusing this outside a local setup.
s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [3]:
# Connect to the ChromaDB server (Docker container) where we stored the embeddings of our files
client = chromadb.HttpClient(host="localhost", port=8000)

# Create the collection named "texts_images", or get it if it already exists (get_or_create=True).
# embedding_function=None: the query vectors are computed in this notebook and passed explicitly.
collection_texts_images = client.create_collection(name="texts_images", get_or_create=True, embedding_function=None)

In [4]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

Let's define the models we will be using.

In [8]:
# Load the "ViT-B-16" model (OpenAI pretrained weights) used for both images and texts
model_it, _, preprocess_it = open_clip.create_model_and_transforms("ViT-B-16", pretrained="openai")
# The model is only used for inference in this notebook, so switch it to
# evaluation mode (open_clip's usage docs call model.eval() right after creation)
model_it.eval()
tokenizer_it = open_clip.get_tokenizer("ViT-B-16") # Tokenizer for texts
# Move the model to the selected device; kept as the last expression so the
# notebook still displays the model summary as the cell output
model_it.to(device)

# +++



CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In the following cells, we implement helper functions that compute the embedding of a given input and fetch files from MinIO.

In [6]:
# We can use this function to retrieve a text from our bucket
def get_text(bucket, key):
    """Download the object ``key`` from ``bucket`` and return it decoded as UTF-8 text."""
    s3_object = s3.get_object(Bucket=bucket, Key=key)
    raw_bytes = s3_object["Body"].read()
    return raw_bytes.decode("utf-8")

# We can use this function to retrieve an image from our bucket in PIL Image format
def get_image(bucket, key):
    """Download the object ``key`` from ``bucket`` and open it as a PIL image."""
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    # Wrap the downloaded bytes in an in-memory buffer so PIL can read them like a file
    return Image.open(io.BytesIO(raw_bytes))

In [11]:
# The next function returns the embedding of the given text
def embed_text(preprocess, model, tokenizer, texts: str):
    """Return the normalized embedding of a single text as a NumPy vector.

    ``preprocess`` is accepted but not used by this function. The text is
    wrapped in a one-element batch, and the first (only) row of the encoded
    batch is returned.
    """
    with torch.no_grad():  # inference only: no gradients are tracked
        token_batch = tokenizer([texts]).to(device)
        text_features = model.encode_text(token_batch)
        # Divide each row by its norm along the last dimension
        unit_features = text_features / text_features.norm(dim=-1, keepdim=True)
    return unit_features.cpu().numpy()[0]

# The next function returns the embedding of the given PIL Image
@torch.no_grad()
def embed_image(preprocess, model, pil_img):
    """Return the normalized embedding of a single PIL image as a NumPy array.

    The image is run through ``preprocess``, given a leading batch dimension,
    encoded with ``model.encode_image`` and divided by its norm along the last
    dimension. Size-one dimensions are squeezed out of the returned array.
    """
    image_batch = preprocess(pil_img).unsqueeze(0).to(device)
    image_features = model.encode_image(image_batch)
    unit_features = image_features / image_features.norm(dim=-1, keepdim=True)
    return unit_features.cpu().numpy().squeeze()

Below, we implement a function that displays the most similar multi-modal results (texts and images) returned by a query.

In [9]:
def print_top_k_images_texts(res, k=5):
    """Print up to ``k`` text hits and up to ``k`` image hits of a query result.

    Args:
        res: Query result holding "documents" and "distances" entries; only
            the lists of the first query (index 0) are read. Each document is
            treated as an object path whose extension selects the modality:
            "txt" entries are printed as text, "png" entries are displayed as
            images. Entries with any other extension are skipped.
        k: Maximum number of hits to show per modality.
    """
    documents = res["documents"][0]
    distances = res["distances"][0]

    shown = {"txt": 0, "png": 0}  # number of hits already shown per modality
    rank = 0  # running number of the printed hits

    # Pair every document with its own distance. Indexing the distances with
    # the count of printed hits would report the wrong distance as soon as
    # one document is skipped.
    for doc, distance in zip(documents, distances):
        extension = doc.split(".")[-1]
        if extension in shown and shown[extension] < k:
            rank += 1
            shown[extension] += 1
            # Object key inside the bucket: drop the first "trusted-zone/" occurrence
            key = doc.replace("trusted-zone/", "", 1)
            print(f"{rank}. Distance: {distance:.4f}")
            print("Content:", doc)
            if extension == "txt":
                print(get_text("trusted-zone", key))
            else:
                display(get_image("trusted-zone", key))
            print("-" * 40)
        # Stop early if both top-k limits are reached
        if all(count >= k for count in shown.values()):
            break

**Text-based retrieval**

In [None]:
# Example: query with a free-text prompt (here, asking for games similar to another title)
query_text = "Games similar to Nier: Automata"
# Embed the query text and convert the NumPy vector to a plain list for the ChromaDB query
q_vec = embed_text(preprocess_it, model_it, tokenizer_it, query_text).tolist()

res = collection_texts_images.query(
    query_embeddings=[q_vec],
    # For a long text query, all nearest neighbors are expected to be texts.
    # To also get images, we need to retrieve more embeddings.
    n_results=1000,
    include=["documents","distances"]
)

# Show up to 5 texts and up to 5 images among the retrieved results
print_top_k_images_texts(res, k = 5)

**Image-based retrieval**

In [14]:
# Upload an image from local storage (a single file, restricted to image types)
uploader = widgets.FileUpload(accept='image/*', multiple=False)
# Render the upload widget in the notebook
display(uploader)

FileUpload(value=(), accept='image/*', description='Upload')

In [19]:
# Extract the uploaded file (the first entry of the widget's value)
# NOTE(review): indexing [0] fails while the widget's value is still empty —
# upload a file in the previous cell before running this one
image_data = uploader.value[0].content
img_example = Image.open(BytesIO(image_data))

# Create embeddings for the Image
# NOTE(review): img_example_emb is not read again in the visible cells; the
# query cell below recomputes the embedding — confirm whether it is needed
img_example_emb = embed_image(preprocess_it, model_it, img_example)
# Last expression of the cell, so the notebook displays the uploaded image
img_example

In [18]:
# Example: query by another game's cover or snapshot
# Convert the NumPy embedding to a plain list of floats, consistent with the
# text-based query, so the same kind of value is sent to ChromaDB in both cases
q_vec = embed_image(preprocess_it, model_it, img_example).tolist()

res = collection_texts_images.query(
    query_embeddings=[q_vec],
    # Retrieve many neighbors so that both texts and images can appear among the results
    n_results=1000,
    include=["documents","distances"]
)

# Show up to 5 texts and up to 5 images among the retrieved results
print_top_k_images_texts(res, k = 5)

**Video-based retrieval**