# Database Examples 

Examples for the new database schema and how to use it.

In [1]:
! pip install psycopg2-binary python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Move the working directory one level up (the cell output shows the resulting
# directory) — presumably so that the `emv` package used below is importable.
%cd ..
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to local code are picked up without a restart.
%load_ext autoreload
%autoreload 2

/home/andre/rts


## Setup

Utility functions for setting up the database.

In [3]:
import os
import psycopg2
from dotenv import load_dotenv
from emv.db.utils import execute_read_query, execute_write_query, write_media_object_db

# Load variables from a .env file into the process environment; later cells
# read SUPABASE_HOST and SUPABASE_KEY from it via os.getenv.
load_dotenv()

True

## Tables

Creating the necessary database tables

In [21]:
# Reset the schema by dropping every table, referencing tables before the
# tables they point to. `atlas` has a foreign key to `projection`, so it has to
# go first: PostgreSQL refuses to drop a table that another table still
# references unless CASCADE is specified.
for _table in (
    "map_projection_feature",  # references projection, feature, media
    "atlas",                   # references projection
    "projection",              # references library
    "feature",                 # references media
    "media",                   # references library
    "library",
):
    execute_write_query(f"DROP TABLE IF EXISTS {_table};")

In [22]:
# postgres table definitions
# Each constant below holds one SQL statement as a string; nothing is executed
# until the string is passed to execute_write_query / execute_read_query.

# library: root table; projection.library_id and media.library_id reference it.
_table_library = """
    CREATE TABLE IF NOT EXISTS library (
        library_id SERIAL PRIMARY KEY,
        library_name VARCHAR(50) NOT NULL,
        version VARCHAR(20) NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        data JSONB NOT NULL
    );
"""

# Inserts the sample library ('rts', version '0.1', empty JSON data) and
# returns the generated library_id.
_create_library = """
    INSERT INTO library (library_name, version, data)
    VALUES ('rts', '0.1', '{}')
    RETURNING library_id;
"""

# projection: belongs to a library; stores the model used plus the atlas
# layout columns (atlas_width, tile_size, atlas_count, ...).
_table_projection = """
    CREATE TABLE IF NOT EXISTS projection (
        projection_id SERIAL PRIMARY KEY,
        version VARCHAR(20) NOT NULL,
        library_id INTEGER NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        model_name VARCHAR(200) NOT NULL,
        model_params JSONB NOT NULL,
        data JSONB NOT NULL,
        dimension INTEGER NOT NULL,
        atlas_folder_path VARCHAR(500) NOT NULL,
        atlas_width INTEGER NOT NULL,
        tile_size INTEGER NOT NULL,
        atlas_count INTEGER NOT NULL,
        total_tiles INTEGER NOT NULL,
        tiles_per_atlas INTEGER NOT NULL,

        CONSTRAINT FK_projection_library_id FOREIGN KEY (library_id)
            REFERENCES library (library_id)
    );
"""

# atlas: belongs to a projection.
# NOTE(review): `Vector (2)` is not a built-in PostgreSQL type — presumably it
# comes from the pgvector extension; confirm the extension is installed.
_table_atlas = """
    CREATE TABLE IF NOT EXISTS atlas (
        atlas_id SERIAL PRIMARY KEY,
        projection_id INTEGER NOT NULL,
        atlas_order INTEGER NOT NULL,
        atlas_path VARCHAR(500) NOT NULL,
        atlas_size Vector (2) NOT NULL,
        tile_size Vector (2) NOT NULL,
        tile_count INTEGER NOT NULL,
        rows INTEGER NOT NULL,
        cols INTEGER NOT NULL,
        tiles_per_atlas INTEGER NOT NULL,

        CONSTRAINT FK_atlas_projection_id FOREIGN KEY (projection_id)
            REFERENCES projection (projection_id)
    );
"""

# media: one row per media object, belonging to a library. media_path and hash
# are UNIQUE. parent_id and the start/end timestamp and frame columns are
# nullable; parent_id carries no foreign-key constraint.
_table_media = """
    CREATE TABLE IF NOT EXISTS media (
        media_id SERIAL PRIMARY KEY,
        media_path VARCHAR(500) UNIQUE,
        original_path VARCHAR(500) NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        media_type VARCHAR(50) NOT NULL,
        sub_type VARCHAR(50) NOT NULL,
        size INTEGER NOT NULL,
        metadata JSONB NOT NULL,
        library_id INTEGER NOT NULL,
        hash VARCHAR(50) UNIQUE,
        parent_id INTEGER,
        start_ts FLOAT,
        end_ts FLOAT,
        start_frame INTEGER,
        end_frame INTEGER,
        frame_rate FLOAT,

        CONSTRAINT FK_media_library_id FOREIGN KEY (library_id)
            REFERENCES library (library_id)
    );
"""

# feature: a feature extracted from a media object. There is one nullable
# embedding column per supported dimension (1024 / 1536 / 2048); the example
# queries below use embedding_size to pick the populated column.
_table_features = """
    CREATE TABLE IF NOT EXISTS feature (
        feature_id SERIAL PRIMARY KEY,
        feature_type VARCHAR(50) NOT NULL,
        version VARCHAR(20) NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        model_name VARCHAR(200) NOT NULL,
        model_params JSONB NOT NULL,
        data JSONB NOT NULL,

        embedding_size INTEGER,
        embedding_1024 vector (1024),
        embedding_1536 vector (1536),
        embedding_2048 vector (2048),

        media_id INTEGER,

        CONSTRAINT FK_feature_media_id FOREIGN KEY (media_id) 
            REFERENCES media (media_id)
    );
"""

# map_projection_feature: links a projection to a feature and/or a media
# object (both of those columns are nullable) together with an atlas_order.
_table_map_projection_feature = """
    CREATE TABLE IF NOT EXISTS map_projection_feature (
        map_projection_feature_id SERIAL PRIMARY KEY,
        projection_id INTEGER NOT NULL,
        feature_id INTEGER,
        media_id INTEGER,
        atlas_order INTEGER NOT NULL,

        CONSTRAINT FK_map_projection_feature_projection_id FOREIGN KEY (projection_id)
            REFERENCES projection (projection_id),
        CONSTRAINT FK_map_projection_feature_feature_id FOREIGN KEY (feature_id)
            REFERENCES feature (feature_id),
        CONSTRAINT FK_map_projection_feature_media_id FOREIGN KEY (media_id)
            REFERENCES media (media_id)
    );
"""

# Create the tables in dependency order: a table must exist before another
# table can declare a foreign key to it.
# `_table_atlas` is defined above and dropped by the reset cell, but was never
# created; it is created here right after `projection`, which it references.
for _ddl in (
    _table_library,
    _table_projection,
    _table_atlas,
    _table_media,
    _table_features,
    _table_map_projection_feature,
):
    execute_write_query(_ddl)

# Register the sample library. The INSERT ... RETURNING statement yields the
# generated library_id, which is printed so later cells can refer to it.
print(execute_read_query(_create_library))

[(1,)]


## Fill tables with sample data

In [23]:
# Populate `media` with 20 sample images. Features are inserted separately in
# a later cell; each media element can have multiple features.

swiss_cities = ["Zurich", "Geneva", "Basel", "Lausanne", "Bern", "Winterthur", "Lucerne", "St. Gallen", "Lugano", "Biel/Bienne"]
years = [2015, 2016, 2017, 2018, 2019, 2020]

for i in range(1, 21):
    # Cycle through both lists independently, so the city repeats every 10
    # rows and the year every 6 rows.
    city = swiss_cities[(i - 1) % len(swiss_cities)]
    year = years[(i - 1) % len(years)]

    _table_media_sample = f"""
        INSERT INTO media (media_path, original_path, media_type, sub_type, size, metadata, library_id)
        VALUES 
            ('/path/to/media{i}', '/path/to/original{i}', 'image', 'jpg', 500, '{{"key{i}": "value{i}", "city": "{city}", "year": {year}}}', 1)
        RETURNING media_id;
    """
    execute_write_query(_table_media_sample)


In [25]:
# Sanity check: list the ids of all rows now present in the media table.
_query = """SELECT media_id FROM media;"""
execute_read_query(_query)

[(1,),
 (2,),
 (3,),
 (4,),
 (5,),
 (6,),
 (7,),
 (8,),
 (9,),
 (10,),
 (11,),
 (12,),
 (13,),
 (14,),
 (15,),
 (16,),
 (17,),
 (18,),
 (19,),
 (20,)]

In [26]:
# insert some sample features with embedding vectors (one feature for each of
# the media objects 1..10)
import numpy as np
import json

feature_types = ["pose", "face", "object", "ner"]
color_meta = ["red", "green", "blue", "yellow", "black", "white", "grey", "orange", "purple", "pink"]

# Seeded generator, so the sample embeddings are the same on every run.
_rng = np.random.default_rng(42)

# The NER payload is identical for every row, so it is serialized once outside
# the loop. json.dumps turns the tuples into JSON arrays.
ner_tags = json.dumps([("person", "Ueli Steck", 1, 9), ("city", "geneva", 10, 16), ("city", "zurich", 17, 23), ("city", "bern", 24, 28), ("city", "basel", 29, 34), ("city", "winterthur", 35, 46), ("city", "lucerne", 47, 54), ("city", "st. gallen", 55, 65), ("city", "lugano", 66, 72), ("city", "biel/bienne", 73, 85)])

for i in range(1, 11):
    vector_1024 = _rng.random(1024).tolist()

    # The embedding is passed as a SQL ARRAY[...] literal and stored in the
    # embedding_1024 column; embedding_size records which column is populated.
    _table_features_sample_vectors = f"""
        INSERT INTO feature (feature_type, version, model_name, model_params, data, media_id, embedding_size, embedding_1024)
        VALUES
            ('{feature_types[i % len(feature_types)]}', 'v1', 'resnet50', '{{"param1": "{i}"}}', '{{"color": "{color_meta[i % len(color_meta)] }", "data1": "{i}", "ner": {ner_tags} }}', {i}, 1024, ARRAY[{','.join([str(x) for x in vector_1024])}])
        RETURNING feature_id;
    """
    execute_write_query(_table_features_sample_vectors)

    # At the moment we are only creating size 1024 vectors, for the sake of the next example queries to work, there can be only a single vector set per feature
    # Example of a 2048-d feature (kept for reference, not executed):
    # vector_2048 = _rng.random(2048).tolist()
    # _table_features_sample_vectors = f"""
    #     INSERT INTO feature (feature_type, version, model_name, model_params, data, media_id, embedding_size, embedding_2048)
    #     VALUES
    #         ('image', 'v1', 'resnet50', '{{"param1": "{i}"}}', '{{"data1": "{i}"}}', {i}, 2048, ARRAY[{','.join([str(x) for x in vector_2048])}])
    #     RETURNING feature_id;
    # """
    # execute_write_query(_table_features_sample_vectors)


## Queries

### Scenario 1

We have a media object and we want to find the 5 most similar media objects.

In [28]:
# Nearest-neighbour search around media object 5.
# target_embedding picks the populated embedding column of the target's
# feature row(s); the outer query computes the pgvector `<->` distance from it
# to every feature of every other media object and keeps the 5 closest.
# Changes to the original query:
#   * the CASE expressions also cover embedding_1536, which the schema defines
#     but the query previously mapped to NULL;
#   * only features with the same embedding_size as the target are compared,
#     because pgvector raises an error for vectors of different dimensions.
_query = """
    WITH target_embedding AS (
    SELECT
        media_id,
        embedding_size,
        CASE
            WHEN embedding_size = 1024 THEN embedding_1024
            WHEN embedding_size = 1536 THEN embedding_1536
            WHEN embedding_size = 2048 THEN embedding_2048
            ELSE NULL
        END AS embedding_vector
    FROM
        feature
    WHERE
        media_id = 5
    )

    SELECT
    f.media_id,
    (target.embedding_vector <->
        CASE
        WHEN f.embedding_size = 1024 THEN f.embedding_1024
        WHEN f.embedding_size = 1536 THEN f.embedding_1536
        WHEN f.embedding_size = 2048 THEN f.embedding_2048
        ELSE NULL
        END
    ) AS distance
    FROM
    feature f,
    target_embedding target
    WHERE
    f.media_id != target.media_id
    AND f.embedding_size = target.embedding_size
    ORDER BY
    distance ASC
    LIMIT 5;
"""

execute_read_query(_query)

[(2, 12.6693015514196),
 (8, 12.880733172073),
 (4, 12.881589969474),
 (7, 12.9209399099366),
 (1, 12.967205309647)]

### Scenario 2

Find all media objects for Zurich.

In [29]:
# get all media objects that have city: "Zurich" (queried from the jsonb metadata field)
# `->>` extracts the JSON value as text, so it can be compared with a plain SQL string.
_query = """
    SELECT * FROM media WHERE metadata->>'city' = 'Zurich';
"""
execute_read_query(_query)

[(1,
  '/path/to/media1',
  '/path/to/original1',
  datetime.datetime(2023, 5, 4, 12, 6, 17, 657201),
  'image',
  'jpg',
  500,
  {'city': 'Zurich', 'key1': 'value1', 'year': 2015},
  1,
  None,
  None,
  None,
  None,
  None,
  None,
  None),
 (11,
  '/path/to/media11',
  '/path/to/original11',
  datetime.datetime(2023, 5, 4, 12, 6, 18, 143418),
  'image',
  'jpg',
  500,
  {'city': 'Zurich', 'year': 2019, 'key11': 'value11'},
  1,
  None,
  None,
  None,
  None,
  None,
  None,
  None)]

### Scenario 3

Get all images from Geneva

In [30]:
# Combine a JSONB metadata filter (city) with a regular column filter (media_type).
_query = """
    SELECT * FROM media WHERE metadata->>'city' = 'Geneva' AND media_type = 'image';
"""
execute_read_query(_query)


[(2,
  '/path/to/media2',
  '/path/to/original2',
  datetime.datetime(2023, 5, 4, 12, 6, 17, 697058),
  'image',
  'jpg',
  500,
  {'city': 'Geneva', 'key2': 'value2', 'year': 2016},
  1,
  None,
  None,
  None,
  None,
  None,
  None,
  None),
 (12,
  '/path/to/media12',
  '/path/to/original12',
  datetime.datetime(2023, 5, 4, 12, 6, 18, 195346),
  'image',
  'jpg',
  500,
  {'city': 'Geneva', 'year': 2020, 'key12': 'value12'},
  1,
  None,
  None,
  None,
  None,
  None,
  None,
  None)]

### Scenario 4

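Prefix and substring matching (SQL `LIKE`) on text values stored in JSONB fields. Note that this is plain pattern matching, not PostgreSQL full-text search.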

In [39]:
# Cities whose name starts with "B" (prefix pattern 'B%').
_query = """
    SELECT metadata FROM media WHERE metadata->>'city' LIKE 'B%';
"""
_rows = execute_read_query(_query)
print([_row[0]['city'] for _row in _rows])

# Cities whose name contains "Gall" anywhere (substring pattern '%Gall%').
_query = """
    SELECT metadata FROM media WHERE metadata->>'city' LIKE '%Gall%';
"""
_rows = execute_read_query(_query)
[_row[0]['city'] for _row in _rows]



['Basel', 'Bern', 'Biel/Bienne', 'Basel', 'Bern', 'Biel/Bienne']


['St. Gallen', 'St. Gallen']

### Scenario 5

Similarity to a computed vector. Example: we have a video camera installed and a user poses like a tennis player and we find tennis matches.

In [40]:
# Random stand-in for the feature that the pose detection algorithm would
# produce, rendered as the comma-separated body of a pgvector literal.
vector_1024 = ",".join(str(x) for x in np.random.rand(1024).tolist())

# query the feature table for similar vectors
# The query vector has 1024 dimensions, so it is compared only with rows that
# store a 1024-d embedding. The original CASE would also have selected
# embedding_2048 for 2048-d rows, and pgvector raises an error when the two
# operands of `<->` have different dimensions.
_query = f"""
    SELECT
    f.media_id,
    ('[{vector_1024}]' <-> f.embedding_1024) AS distance
    FROM
    feature f
    WHERE
    f.feature_type = 'pose'
    AND f.embedding_size = 1024
    ORDER BY
    distance ASC
    LIMIT 5;
"""

execute_read_query(_query)


[(8, 13.0702585003252), (4, 13.2722672142341)]

### Scenario 6

Find people or locations via the NER tags stored in a feature's `data` field.

In [41]:
person = "Ueli Steck"

# The name is spliced into a SQL string literal, so single quotes are doubled
# first; otherwise a name such as O'Brien would end the literal early and
# break the statement.
_person_sql = person.replace("'", "''")

# data->>'ner' yields the whole NER list as text; LIKE then does a plain
# substring match on that text.
_query = f"""
    SELECT media_id, data FROM feature WHERE data->>'ner' LIKE '%{_person_sql}%';
"""

execute_read_query(_query)


[(1,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lucerne', 47, 54],
    ['city', 'st. gallen', 55, 65],
    ['city', 'lugano', 66, 72],
    ['city', 'biel/bienne', 73, 85]],
   'color': 'green',
   'data1': '1'}),
 (2,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lucerne', 47, 54],
    ['city', 'st. gallen', 55, 65],
    ['city', 'lugano', 66, 72],
    ['city', 'biel/bienne', 73, 85]],
   'color': 'blue',
   'data1': '2'}),
 (3,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lu

### Scenario 7

Find arbitrary features by their metadata (here we use the simple `color` field as an example).

In [42]:
color = "red"

# The value is spliced into a SQL string literal, so single quotes are doubled
# to keep the statement valid for any input value.
_color_sql = color.replace("'", "''")

# Exact match on the text value of the `color` key in the feature's JSONB data.
_query = f"""
    SELECT media_id, data FROM feature WHERE data->>'color' = '{_color_sql}';
"""

execute_read_query(_query)


[(10,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lucerne', 47, 54],
    ['city', 'st. gallen', 55, 65],
    ['city', 'lugano', 66, 72],
    ['city', 'biel/bienne', 73, 85]],
   'color': 'red',
   'data1': '10'})]

# Ingest the clips to the media table

In [43]:
# Path prefixes as they appear on the remote side. REMOTE_VIDEOS is the prefix
# that ingest_clips replaces in each row's mediaFolderPath.
REMOTE_RTS_DATA = "/media/sinergia/RTS/"
REMOTE_VIDEOS = '/mnt/rts/'

# Local data root and the folders derived from it (the root ends with a slash,
# so the folder names are appended directly).
LOCAL_RTS_DATA = "/media/data/rts/"
METADATA = f"{LOCAL_RTS_DATA}metadata"
LOCAL_VIDEOS = f"{LOCAL_RTS_DATA}archive"

AIBOX = f"{LOCAL_RTS_DATA}aibox-vectors"

# Relative output directory.
OUTDIR = 'data'

In [44]:
from datetime import datetime
import orjson
import pandas as pd
import zipfile
import os
import io
from typing import Dict, List, Optional, Tuple
from pathlib import Path
import shutil
import hashlib
from supabase import create_client, Client

In [45]:
# LOCAL imports
import emv
import emv.pipeline
import emv.utils
import emv.io.media
import emv.features.audio
import emv.features.text

# Notebook-wide logger obtained from the project's emv.utils helper.
LOG = emv.utils.get_logger()

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
# Supabase connection settings are read from the environment; os.getenv
# yields None for a variable that is not set.
_supabase_host = os.getenv("SUPABASE_HOST")
_supabase_key = os.getenv("SUPABASE_KEY")

supabase: Client = create_client(_supabase_host, _supabase_key)

In [46]:
# Name of the Supabase storage bucket passed to ingest_clips.
bucket_name = "rts"
# One-time bucket creation, left disabled:
# res = supabase.storage.create_bucket(bucket_name)

In [47]:
# Load the 'rts_aivectors' table from the local metadata folder.
# Uses the METADATA constant instead of re-building the path by hand;
# LOCAL_RTS_DATA already ends with '/', so the previous
# LOCAL_RTS_DATA + '/metadata' produced a double slash.
df = emv.utils.dataframe_from_hdf5(METADATA, 'rts_aivectors')
df.shape

(3177, 22)

In [48]:
# get all the clips from the archive and put them into supabase s3. At the same time create the database entries on the media table

import json
from tqdm import tqdm


def ingest_clips(df: pd.DataFrame, prefix_name: str, supabase: Client, bucket_name: str, library_id: int) -> Tuple[int, int]:
    """Write one media entry per source video in ``df`` and one per clip of it.

    For every row the local media folder is derived from ``row.mediaFolderPath``
    by replacing the ``REMOTE_VIDEOS`` prefix with ``LOCAL_VIDEOS``. The clip
    files are listed from ``<folder>/clips/videos`` and their metadata is read
    from ``<folder>/clips.json``. ``write_media_object_db`` is then called once
    for the source video and once for each clip file.

    Rows whose clip folder or ``clips.json`` does not exist are counted and
    skipped. The upload to Supabase storage is currently disabled, so
    ``supabase`` and ``bucket_name`` are accepted but not used.

    Args:
        df: Frame with a ``mediaFolderPath`` column, one row per source video.
        prefix_name: Prefix prepended to every generated media path / storage key.
        supabase: Supabase client (unused while the upload is commented out).
        bucket_name: Storage bucket name (unused while the upload is commented out).
        library_id: Library the media entries are attached to.

    Returns:
        ``(no_clips, error_count)``: the number of clip files found, and the
        number of rows skipped because of a ``FileNotFoundError``.
    """
    error_count = 0
    no_clips = 0
    default_format = "mp4"

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Folder path relative to the remote video root; used for storage keys.
        relative_folder = row.mediaFolderPath.replace(REMOTE_VIDEOS, '')
        base_path = row.mediaFolderPath.replace(REMOTE_VIDEOS, LOCAL_VIDEOS + '/')
        path = os.path.join(base_path, 'clips', 'videos')

        try:
            # get all files in the folder
            files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
            no_clips += len(files)

            with open(os.path.join(base_path, "clips.json")) as f:
                clip_metadata = orjson.loads(f.read())

            metadata = {
                "image_resolutions": clip_metadata['image_resolutions'],
                "framerate": clip_metadata['framerate'],
                "clip_count": clip_metadata['clip_count'],
            }

            # TODO: Create parent media file for the source video for clips
            parent_id = 1  # TODO: get the parent id from the database
            original_path = f"{base_path}.{default_format}"
            # The source video has no frame range, so 0/0 stands in for start/end frame.
            source_hash = hashlib.md5((original_path + str(0) + str(0)).encode('utf-8')).hexdigest()

            write_media_object_db(
                media_path=f"{prefix_name}/{relative_folder}.{default_format}",
                original_path=original_path,
                library_id=library_id,
                parent_id=0,
                update_data=json.dumps(metadata),
                media_type='video',
                media_sub_type='source_video',
                frame_rate=clip_metadata['framerate'],
                hash=source_hash,
            )

            # create the database entry for every clip (the upload is disabled below)
            for f in files:
                supabase_key = f"{prefix_name}/{relative_folder}/clips/videos/{f}"
                # Fix: use the current file. The original used files[0], so every
                # clip got the size, hash input and original_path of the first file.
                file_path = os.path.join(path, f)

                file_size = os.path.getsize(file_path)

                # Clips are keyed in clips.json by file name without extension.
                clip_name = f.split('.')[0]
                clip_info = clip_metadata['clips'][clip_name]
                start_ts = clip_info['start']
                end_ts = clip_info['end']
                start_frame = clip_info['start_frame']
                end_frame = clip_info['end_frame']
                frame_rate = clip_metadata['framerate']

                clip_hash = hashlib.md5((file_path + str(start_frame) + str(end_frame)).encode('utf-8')).hexdigest()

                update_data = json.dumps(metadata | {'ref_text': clip_info['ref_text'], 'locations': clip_info['locations']})
                # Double single quotes — presumably because write_media_object_db
                # embeds this JSON in a SQL string literal; confirm against the helper.
                update_data = update_data.replace("'", "''")

                write_media_object_db(
                    media_path=supabase_key,
                    original_path=os.path.abspath(file_path),
                    library_id=library_id,
                    parent_id=parent_id,
                    start_ts=start_ts,
                    end_ts=end_ts,
                    start_frame=start_frame,
                    end_frame=end_frame,
                    frame_rate=frame_rate,
                    update_data=update_data,
                    file_size=file_size,
                    hash=clip_hash,
                    media_type='video',
                    media_sub_type='clip'
                )

                # upload to supabase s3
                # supabase.storage.from_(bucket_name).upload(supabase_key, os.path.abspath(file_path))

        except FileNotFoundError:
            # Missing clip folder or clips.json: count the row and move on.
            error_count += 1
            continue

    print(f"No clips folund for {error_count} rows")
    print(f"Total number of clips: {no_clips}")
    print(f"Total number of vidoes with clips: {len(df) - error_count}")

    # The signature promises a tuple; the original fell through and returned None.
    return no_clips, error_count


In [51]:
bucket_name = "rts"
# NOTE(review): ingest_clips annotates library_id as int, but a str is passed
# here — confirm which type write_media_object_db expects.
library_id = "1"

# "test4" is the prefix put in front of every generated media path;
# `r` holds whatever ingest_clips returns.
r = ingest_clips(df, "test4", supabase, bucket_name, library_id)

  5%|▍         | 146/3177 [00:05<01:54, 26.50it/s]


KeyboardInterrupt: 

x