# Database Examples 

Examples for the new database schema and how to use it.

In [1]:
! pip install psycopg2-binary python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# jupyter notebook auto reload
%cd ..
%load_ext autoreload
%autoreload 2

/home/andre
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Setup

Utility functions for setting up the database.

In [5]:
import os
import psycopg2
from dotenv import load_dotenv
from rts.db.utils import execute_read_query, execute_write_query

load_dotenv()

False

## Tables

Creating the necessary database tables

In [87]:
execute_write_query("DROP TABLE IF EXISTS map_projection_feature;")
execute_write_query("DROP TABLE IF EXISTS projection;")
execute_write_query("DROP TABLE IF EXISTS features;")
execute_write_query("DROP TABLE IF EXISTS media;")
execute_write_query("DROP TABLE IF EXISTS library;")

In [88]:
# postgres table definitions

_table_library = """
    CREATE TABLE IF NOT EXISTS library (
        library_id SERIAL PRIMARY KEY,
        library_name VARCHAR(50) NOT NULL,
        version VARCHAR(20) NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        data JSONB NOT NULL
    );
"""

_create_library = """
    INSERT INTO library (library_name, version, data)
    VALUES ('rts', '0.1', '{}')
    RETURNING library_id;
"""

_table_projection = """
    CREATE TABLE IF NOT EXISTS projection (
        projection_id SERIAL PRIMARY KEY,
        version VARCHAR(20) NOT NULL,
        library_id INTEGER NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        model_name VARCHAR(200) NOT NULL,
        model_params JSONB NOT NULL,
        data JSONB NOT NULL,
        dimension INTEGER NOT NULL,
        atlas_folder_path VARCHAR(500) NOT NULL,
        atlas_width INTEGER NOT NULL,
        tile_size INTEGER NOT NULL,
        atlas_count INTEGER NOT NULL,
        total_tiles INTEGER NOT NULL,
        tiles_per_atlas INTEGER NOT NULL,

        CONSTRAINT FK_projection_library_id FOREIGN KEY (library_id)
            REFERENCES library (library_id)
    );
"""

_table_media = """
    CREATE TABLE IF NOT EXISTS media (
        media_id SERIAL PRIMARY KEY,
        media_path VARCHAR(500) UNIQUE,
        original_path VARCHAR(500) NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        media_type VARCHAR(50) NOT NULL,
        sub_type VARCHAR(50) NOT NULL,
        size INTEGER NOT NULL,
        metadata JSONB NOT NULL,
        library_id INTEGER NOT NULL,
        hash VARCHAR(50) NOT NULL,
        parent_id INTEGER,
        start_ts FLOAT,
        end_ts FLOAT,
        start_frame INTEGER,
        end_frame INTEGER,
        frame_rate INTEGER,

        CONSTRAINT FK_media_library_id FOREIGN KEY (library_id)
            REFERENCES library (library_id)
    );
"""

_table_features = """
    CREATE TABLE IF NOT EXISTS features (
        feature_id SERIAL PRIMARY KEY,
        feature_type VARCHAR(50) NOT NULL,
        version VARCHAR(20) NOT NULL,
        created_at TIMESTAMP DEFAULT NOW(),
        model_name VARCHAR(200) NOT NULL,
        model_params JSONB NOT NULL,
        data JSONB NOT NULL,

        embedding_size INTEGER,
        embedding_1024 vector (1024),
        embedding_1536 vector (1536),
        embedding_2048 vector (2048),

        media_id INTEGER,

        CONSTRAINT FK_features_media_id FOREIGN KEY (media_id) 
            REFERENCES media (media_id)
    );
"""

_table_map_projection_feature = """
    CREATE TABLE IF NOT EXISTS map_projection_feature (
        map_projection_feature_id SERIAL PRIMARY KEY,
        projection_id INTEGER NOT NULL,
        feature_id INTEGER NOT NULL,
        atlas_order INTEGER NOT NULL,

        CONSTRAINT FK_map_projection_feature_projection_id FOREIGN KEY (projection_id)
            REFERENCES projection (projection_id),
        CONSTRAINT FK_map_projection_feature_feature_id FOREIGN KEY (feature_id)
            REFERENCES features (feature_id)
    );
"""

execute_write_query(_table_library)
execute_write_query(_table_projection)
execute_write_query(_table_media)
execute_write_query(_table_features)
execute_write_query(_table_map_projection_feature)

print(execute_read_query(_create_library))

[(1,)]


## Fill tables with sample data

In [9]:
# create sample data for the tables, each media element can have multiple features

swiss_cities = ["Zurich", "Geneva", "Basel", "Lausanne", "Bern", "Winterthur", "Lucerne", "St. Gallen", "Lugano", "Biel/Bienne"]
years = [2015, 2016, 2017, 2018, 2019, 2020]

for i in range(1, 21):
    _table_media_sample = f"""
        INSERT INTO media (media_path, original_path, media_type, sub_type, size, archive_name, archive_id, metadata)
        VALUES 
            ('/path/to/media{i}', '/path/to/original{i}', 'image', 'jpg', 500, 'archive{i}', 'archive_id{i}', '{{"key{i}": "value{i}", "city": "{swiss_cities[(i-1) % len(swiss_cities)]}", "year": {years[(i-1) % len(years)]}}}')
        RETURNING media_id;
    """
    execute_write_query(_table_media_sample)


In [10]:
_query = """SELECT * FROM media LIMIT 2;"""
execute_read_query(_query)

[(1,
  '/path/to/media1',
  '/path/to/original1',
  datetime.datetime(2023, 5, 2, 10, 35, 53, 70041),
  'image',
  'jpg',
  500,
  'archive1',
  'archive_id1',
  {'city': 'Zurich', 'key1': 'value1', 'year': 2015}),
 (2,
  '/path/to/media2',
  '/path/to/original2',
  datetime.datetime(2023, 5, 2, 10, 35, 53, 107340),
  'image',
  'jpg',
  500,
  'archive2',
  'archive_id2',
  {'city': 'Geneva', 'key2': 'value2', 'year': 2016})]

In [11]:
# insert some samples with vectors
import numpy as np
import json

feature_types = ["pose", "face", "object", "ner"]
color_meta = ["red", "green", "blue", "yellow", "black", "white", "grey", "orange", "purple", "pink"]

for i in range(1, 11):
    vector_1024 = np.random.rand(1024).tolist()
    vector_2048 = np.random.rand(2048).tolist()
    ner_tags = json.dumps([("person", "Ueli Steck", 1, 9), ("city", "geneva", 10, 16), ("city", "zurich", 17, 23), ("city", "bern", 24, 28), ("city", "basel", 29, 34), ("city", "winterthur", 35, 46), ("city", "lucerne", 47, 54), ("city", "st. gallen", 55, 65), ("city", "lugano", 66, 72), ("city", "biel/bienne", 73, 85)])

    _table_features_sample_vectors = f"""
        INSERT INTO features (feature_type, version, model_name, model_params, data, media_id, embedding_size, embedding_1024)
        VALUES
            ('{feature_types[i % len(feature_types)]}', 'v1', 'resnet50', '{{"param1": "{i}"}}', '{{"color": "{color_meta[i % len(color_meta)] }", "data1": "{i}", "ner": {ner_tags} }}', {i}, 1024, ARRAY[{','.join([str(x) for x in vector_1024])}])
        RETURNING feature_id;
    """
    # print(_table_features_sample_vectors)
    execute_write_query(_table_features_sample_vectors)

    # At the moment we are only creating size 1024 vectors, for the sake of the next example queries to work, there can be only a single vector set per feature
    # _table_features_sample_vectors = f"""
    #     INSERT INTO features (feature_type, version, model_name, model_params, data, media_id, embedding_size, embedding_2048)
    #     VALUES
    #         ('image', 'v1', 'resnet50', '{{"param1": "{i}"}}', '{{"data1": "{i}"}}', {i}, 2048, ARRAY[{','.join([str(x) for x in vector_2048])}])
    #     RETURNING feature_id;
    # """

    # execute_write_query(_table_features_sample_vectors)


## Queries

### Scenario 1

We have a media object and we want to find the 5 most similar media objects.

In [12]:
_query = """
    WITH target_embedding AS (
    SELECT
        media_id,
        CASE
            WHEN embedding_size = 1024 THEN embedding_1024
            WHEN embedding_size = 2048 THEN embedding_2048
            ELSE NULL
        END AS embedding_vector
    FROM 
        features
    WHERE 
        media_id = 5
    )

    SELECT
    f.media_id,
    (target.embedding_vector <-> 
        CASE
        WHEN f.embedding_size = 1024 THEN f.embedding_1024
        WHEN f.embedding_size = 2048 THEN f.embedding_2048
        ELSE NULL
        END
    ) AS distance
    FROM
    features f,
    target_embedding target
    WHERE
    f.media_id != target.media_id
    ORDER BY
    distance ASC
    LIMIT 5;
"""

execute_read_query(_query)

[(4, 12.6620561267192),
 (1, 12.9727147189136),
 (7, 12.9900472952322),
 (8, 13.0338304337032),
 (2, 13.0584285629705)]

### Scenario 2

Find all media objects for Zurich.

In [13]:
# get all media objects that have city: "Zurich" (queried from the jsonb metadata field)
_query = """
    SELECT * FROM media WHERE metadata->>'city' = 'Zurich';
"""
execute_read_query(_query)

[(1,
  '/path/to/media1',
  '/path/to/original1',
  datetime.datetime(2023, 5, 2, 10, 35, 53, 70041),
  'image',
  'jpg',
  500,
  'archive1',
  'archive_id1',
  {'city': 'Zurich', 'key1': 'value1', 'year': 2015}),
 (11,
  '/path/to/media11',
  '/path/to/original11',
  datetime.datetime(2023, 5, 2, 10, 35, 53, 446582),
  'image',
  'jpg',
  500,
  'archive11',
  'archive_id11',
  {'city': 'Zurich', 'year': 2019, 'key11': 'value11'})]

### Scenario 3

Get all images from Geneva

In [14]:
_query = """
    SELECT * FROM media WHERE metadata->>'city' = 'Geneva' AND media_type = 'image';
"""
execute_read_query(_query)


[(2,
  '/path/to/media2',
  '/path/to/original2',
  datetime.datetime(2023, 5, 2, 10, 35, 53, 107340),
  'image',
  'jpg',
  500,
  'archive2',
  'archive_id2',
  {'city': 'Geneva', 'key2': 'value2', 'year': 2016}),
 (12,
  '/path/to/media12',
  '/path/to/original12',
  datetime.datetime(2023, 5, 2, 10, 35, 53, 479945),
  'image',
  'jpg',
  500,
  'archive12',
  'archive_id12',
  {'city': 'Geneva', 'year': 2020, 'key12': 'value12'})]

### Scenario 4

Fulltext string matching on jsonb fields

In [15]:
# query for cities that start with b
_query = """
    SELECT * FROM media WHERE metadata->>'city' LIKE 'B%';
"""
print([x[9]['city'] for x in execute_read_query(_query)])

_query = """
    SELECT * FROM media WHERE metadata->>'city' LIKE '%Gall%';
"""
[x[9]['city'] for x in execute_read_query(_query)]



['Basel', 'Bern', 'Biel/Bienne', 'Basel', 'Bern', 'Biel/Bienne']


['St. Gallen', 'St. Gallen']

### Scenario 5

Similarity to a computed vector. Example: we have a video camera installed and a user poses like a tennis player and we find tennis matches.

In [16]:
vector_1024 = ",".join([str(x) for x in np.random.rand(1024).tolist()])  # feature that would be creating by the pose detection algorithm

# query the feature table for similar vectors
_query = f"""
    SELECT
    f.media_id,
    ('[{vector_1024}]' <-> 
        CASE
        WHEN f.embedding_size = 1024 THEN f.embedding_1024
        WHEN f.embedding_size = 2048 THEN f.embedding_2048
        ELSE NULL
        END
    ) AS distance
    FROM
    features f
    WHERE
    f.feature_type = 'pose'
    ORDER BY
    distance ASC
    LIMIT 5;
"""

execute_read_query(_query)


[(8, 12.9973420438942), (4, 13.3367706705129)]

### Scenario 6

Find people or locations with ner tags

In [17]:
person = "Ueli Steck"

_query = f"""
    SELECT media_id, data FROM features WHERE data->>'ner' LIKE '%{person}%';
"""

execute_read_query(_query)


[(1,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lucerne', 47, 54],
    ['city', 'st. gallen', 55, 65],
    ['city', 'lugano', 66, 72],
    ['city', 'biel/bienne', 73, 85]],
   'color': 'green',
   'data1': '1'}),
 (2,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lucerne', 47, 54],
    ['city', 'st. gallen', 55, 65],
    ['city', 'lugano', 66, 72],
    ['city', 'biel/bienne', 73, 85]],
   'color': 'blue',
   'data1': '2'}),
 (3,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lu

### Scenario 7

Find aribitrary features by metadata (here we use the simple field color as an example)

In [18]:
color = "red"

_query = f"""
    SELECT media_id, data FROM features WHERE data->>'color' = '{color}';
"""

execute_read_query(_query)


[(10,
  {'ner': [['person', 'Ueli Steck', 1, 9],
    ['city', 'geneva', 10, 16],
    ['city', 'zurich', 17, 23],
    ['city', 'bern', 24, 28],
    ['city', 'basel', 29, 34],
    ['city', 'winterthur', 35, 46],
    ['city', 'lucerne', 47, 54],
    ['city', 'st. gallen', 55, 65],
    ['city', 'lugano', 66, 72],
    ['city', 'biel/bienne', 73, 85]],
   'color': 'red',
   'data1': '10'})]

# Ingest the clips to the media table

In [73]:
REMOTE_RTS_DATA = "/media/sinergia/RTS/"
REMOTE_VIDEOS = '/mnt/rts/'

LOCAL_RTS_DATA = "/media/data/rts/"
METADATA = LOCAL_RTS_DATA + 'metadata'
LOCAL_VIDEOS = LOCAL_RTS_DATA + 'archive'

AIBOX = LOCAL_RTS_DATA + 'aibox-vectors'

OUTDIR = 'data'

In [86]:
from datetime import datetime
import orjson
import pandas as pd
import zipfile
import os
import io
from typing import Dict, List, Optional, Tuple
from pathlib import Path
import shutil
import hashlib
from supabase import create_client, Client

In [75]:
# LOCAL imports
import rts
import rts.pipeline
import rts.utils
import rts.io.media
import rts.features.audio
import rts.features.text

LOG = rts.utils.get_logger()

In [76]:
supabase: Client = create_client(
    os.getenv("SUPABASE_HOST"), 
    os.getenv("SUPABASE_KEY")
)

In [36]:
bucket_name = "rts"
res = supabase.storage.create_bucket(bucket_name)

StorageException: {'statusCode': 400, 'error': 'Invalid JWT', 'message': 'new row violates row-level security policy for table "buckets"'}

In [77]:
df = rts.utils.dataframe_from_hdf5(LOCAL_RTS_DATA + '/metadata', 'rts_aivectors')
df.shape

(3177, 22)

In [78]:
print(f"{row.mediaFolderPath.replace(REMOTE_VIDEOS, '')}/clips/videos/{files[0]}")

5/1/0/ZE004015/clips/videos/ZE004015-L006.mp4


In [79]:
path

'/media/data/rts/archive/5/1/0/ZE004015/clips/videos'

In [62]:

# with open(os.path.join(path, files[0]), 'rb+') as f:
filename = os.path.join(path, files[0])
res = supabase.storage.from_('rts').upload("0/1/rts/test_supabase_upload.mp4-8", os.path.abspath(filename))


StorageException: {'statusCode': 400, 'error': 'Duplicate', 'message': 'The resource already exists'}

In [84]:
row.to_dict()

{'guid': 'AA1104002390',
 'mediaFolderPath': '/mnt/rts/5/1/0/ZE004015',
 'mediaDuration': 4437,
 'ratio': '16:9',
 'formatResolution': 'SD',
 'publishedDate': '2011-04-09T00:00:00Z',
 'categoryName': 'Programme',
 'assetType': 'Programme',
 'contentType': 'Pop et rock, clips',
 'backgoundType': None,
 'collection': 'MusicOmax',
 'publishedBy': 'TSR 2',
 'rights': 'Restriction/Condition',
 'title': 'WALDER / PAMPLEMOUSSE / NICOLAS FRAISSINET - 11.04.09',
 'resume': "* Lausanne : 20110409, magazine de la scène musicale romande et régionale animé par DESTRAZ Pierrick et REPOND Judith depuis le MAD de Lausanne avec leurs invités et des concerts d'artistes suisses et internationaux.",
 'geoTheme': ['LAUSANNE'],
 'resumeSequence': ['00:01.22\nLausanne, MAD, diverses séquences : diverses présentations DESTRAZ, Pierrick et REPOND, Judith avec leurs invités WALDER, auteur-illustrateur (son métier, ses 1ers dessins, ses débuts, ses BD, références, chansons, musique, dessin  d\'une pochette d\'al

In [47]:
# get all the clips from the archive and put them into supabase s3. At the same time create the database entries on the media table

bucket_name = "test"

error_count = 0
no_clips = 0

for i, row in df.iterrows():

    path = os.path.join(row.mediaFolderPath.replace(REMOTE_VIDEOS, LOCAL_VIDEOS + '/'), 'clips', 'videos')

    # get all files in the folder
    try:
        files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
        no_clips += len(files)

        # TODO: Create parent media file for the source video for clips
        pass

        # upload all files to supabase and create the database entries
        for f in files:
            supabase_key = f"{bucket_name}/{row.mediaFolderPath.replace(REMOTE_VIDEOS, '')}/clips/videos/{f}"
            filename = os.path.join(path, files[0])
            file_size = os.path.getsize(filename)
            library_id = "1"  # rts
            hash = hashlib.md5(open(filename, 'rb').read()).hexdigest()
            parent_id = 1  # TODO: get the parent id from the database
            start_ts, end_ts, start_frame, end_frame, frame_rate = 0, 0, 0, 0, 30  # TODO: get the values from the database
            
            # create the database entry
            _query = f"""
                INSERT INTO media (
                    media_path, original_path, media_type, sub_type, 
                    size, library_id, metadata, hash, 
                    parent_id, start_ts, end_ts, start_frame, 
                    end_frame, frame_rate)
                VALUES ('{supabase_key}', '{row.mediaFolderPath}', 'video', 'clip', 
                    {file_size}, {library_id}, '{json.dumps(row.to_dict())}', '{hash}', 
                    {parent_id}, {start_ts}, {end_ts}, {start_frame}, 
                    {end_frame}, {frame_rate})
                ON CONFLICT (media_id) DO NOTHING;
            """
            execute_write_query(_query)
            
            # upload to supabase s3
            supabase.storage.from_('rts').upload(supabase_key, os.path.abspath(filename))

    except FileNotFoundError:
        error_count += 1
        continue

print(f"No clips folund for {error_count} rows")
print(f"Total number of clips: {no_clips}")
print(f"Total number of vidoes with clips: {len(df) - error_count}")

No clips found for 0 rows
Total number of clips: 7
Total number of vidoes with clips: 3177


In [85]:
row.mediaFolderPath

'/mnt/rts/5/1/0/ZE004015'

In [40]:
path, row['mediaFolderPath'], files

('/media/data/rts/archive/1/2/0/ZB187021/clips/videos',
 '/mnt/rts/1/2/0/ZB187021',
 ['ZB023013-L010.mp4',
  'ZB023013-L009.mp4',
  'ZB023013-L000.mp4',
  'ZB023013-L023.mp4',
  'ZB023013-L017.mp4',
  'ZB023013-L024.mp4',
  'ZB023013-L012.mp4',
  'ZB023013-L018.mp4',
  'ZB023013-L014.mp4',
  'ZB023013-L015.mp4',
  'ZB023013-L032.mp4',
  'ZB023013-L004.mp4',
  'ZB023013-L033.mp4',
  'ZB023013-L001.mp4',
  'ZB023013-L011.mp4',
  'ZB023013-L028.mp4',
  'ZB023013-L025.mp4',
  'ZB023013-L031.mp4',
  'ZB023013-L022.mp4',
  'ZB023013-L013.mp4',
  'ZB023013-L021.mp4',
  'ZB023013-L016.mp4',
  'ZB023013-L034.mp4',
  'ZB023013-L019.mp4',
  'ZB023013-L005.mp4',
  'ZB023013-L002.mp4',
  'ZB023013-L020.mp4',
  'ZB023013-L027.mp4',
  'ZB023013-L029.mp4',
  'ZB023013-L030.mp4',
  'ZB023013-L006.mp4',
  'ZB023013-L008.mp4',
  'ZB023013-L007.mp4',
  'ZB023013-L003.mp4',
  'ZB023013-L026.mp4'])

x