# 3. Feature Creation Example

Now that we have created a dataset with a projection, we can show the data in a UI. In this part we will also add features, such as vector embeddings, that can be used for retrieval.

In [1]:
from rts.db.dao import DataAccessObject
from rts.db_settings import DATABASE_URL
from rts.db.queries import get_library_id_from_name, get_all_media_by_library_id, create_feature, get_nearest_neighbors, update_media, get_all_media_by_metadata, get_media_by_feature_value
from rts.api.models import Feature, Media
import os
import random
from dotenv import load_dotenv

# Load variables from a local .env file into the environment before they are read below.
load_dotenv()

# Connect the data-access object to the configured database.
# NOTE(review): the instance is discarded right away, so DataAccessObject is
# presumably a singleton shared with the query helpers — confirm.
DataAccessObject().connect(DATABASE_URL)

# Seed the stdlib RNG so the mock embeddings, cities and transcriptions
# generated further down are reproducible on Restart Kernel -> Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Archive base path taken from the BASE_PATH environment variable (None when unset).
ARCHIVE_BASE_PATH = os.getenv("BASE_PATH")

In [2]:
# The bucket uses the same name as the archive; the library id is looked up by that name.
archive_name = bucket_name = "rts"
library_id = get_library_id_from_name(archive_name)

In [3]:
# Example 1: use word embeddings of the media content to find similar media files.

# 1. Fetch the library's media (filtered with 'video' / 'clip') and attach a
#    mock embedding of 1024 random floats in [0, 1) to each one.
media = get_all_media_by_library_id(library_id, 'video', 'clip')
print(f"Found {len(media)} media files for library {library_id}")

embeddings = [
    {
        "media_id": clip.media_id,
        "embedding": [random.random() for _ in range(1024)],
    }
    for clip in media
]

Found 21 media files for library 1


In [4]:
# 2. Register every mock embedding as an entry in the feature table (one per media file).
# NOTE(review): re-running this cell presumably creates duplicate features — confirm.

for entry in embeddings:
    feature = Feature(
        media_id=entry['media_id'],
        feature_type='embedding',
        model_name='bert',
        version=1,
        model_params={},
        data={},
        embedding_size=1024,
        embedding_1024=entry['embedding'],
    )
    create_feature(feature)

In [5]:
# Pick one of the mock embeddings at random; its media file is the query.
feature = random.choice(embeddings)

# Number of neighbours to request (the last argument of the call below is
# presumably the result limit — the displayed output contains exactly this many rows).
TOP_K = 5

# Find the media files whose 'bert' (version 1) embeddings are closest to the
# query's embedding and return the TOP_K nearest ones.
# NOTE(review): the distances displayed for this cell (~12.8) lie outside the
# [0, 2] range of a cosine distance, so the metric looks Euclidean rather than
# cosine — confirm in get_nearest_neighbors.
r = get_nearest_neighbors(feature['media_id'], 'embedding', 'bert', 1, TOP_K)
r

[{'media_id': 'ZB012020-L001', 'distance': 12.755394886691601},
 {'media_id': 'ZB008020-L001', 'distance': 12.821608079804582},
 {'media_id': 'ZB006020-L000', 'distance': 12.844895734497763},
 {'media_id': 'ZB008020-L000', 'distance': 12.912064428714846},
 {'media_id': 'ZB003020-L000', 'distance': 12.916703534537762}]

In [6]:
# Media files can carry metadata of their own in addition to features
# ("features" is the name used here for data we derived from the media ourselves).

# Assign a randomly chosen city to the metadata of every media file and save it.
cities = ['Lausanne', 'Geneva', 'Zurich', 'Bern', 'Basel', 'Lugano']

for m in media:
    media_model = Media(**m)
    media_model.metadata['city'] = random.choice(cities)
    update_media(m['media_id'], media_model)

In [7]:
# Query media by the metadata pair ('city', 'Lausanne') and report how many were found.
lausanne_media = get_all_media_by_metadata('city', 'Lausanne')
print(f"There are {len(lausanne_media)} media files from Lausanne")

There are 2 media files from Lausanne


In [8]:
# Full text search over features (like transcriptions)

# 1. Attach a mock transcription to each media file. The `media` list fetched
#    for the embedding example is reused here rather than queried again.
example_transcriptions = [
    "We are currently in CERN this week",
    "We are currently in EPFL next Friday",
    "We are currently in ETHZ as you can see",
]

transcriptions = [
    {
        "media_id": clip.media_id,
        "transcription": random.choice(example_transcriptions),
    }
    for clip in media
]

In [9]:
# 2. Register every mock transcription as an entry in the feature table (one per media file).
# NOTE(review): re-running this cell presumably creates duplicate features — confirm.

for item in transcriptions:
    feature = Feature(
        media_id=item['media_id'],
        feature_type='transcription',
        model_name='whisper',
        version=1,
        model_params={},
        data={'transcription': item['transcription']},
    )
    create_feature(feature)

In [10]:
# Search the 'transcription' features for the value 'CERN' and report how many media files match.
cern_media = get_media_by_feature_value('transcription', 'CERN')
print(f"There are {len(cern_media)} media files with the word CERN in the transcription")

There are 6 media files with the word CERN in the transcription
