# Multimodal video search using CLIP and LanceDB
We used LanceDB to store frames every thirty seconds and the title of 13000+ videos, 5 random from each top category from the Youtube 8M dataset. 
Then, we used the CLIP model to embed frames and titles together. Here are the results.

In [1]:
!pip install --quiet -U lancedb
!pip install --quiet gradio transformers torch torchvision duckdb
!pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985

Collecting tantivy@ git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
  Cloning https://github.com/quickwit-oss/tantivy-py to /private/var/folders/0d/54vc82d97537gkrc254fy5bm0000gn/T/pip-install-mjh59iz5/tantivy_f62e00e0b4b64907a560b6fcf8a72d69
  Running command git clone --filter=blob:none --quiet https://github.com/quickwit-oss/tantivy-py /private/var/folders/0d/54vc82d97537gkrc254fy5bm0000gn/T/pip-install-mjh59iz5/tantivy_f62e00e0b4b64907a560b6fcf8a72d69
  Resolved https://github.com/quickwit-oss/tantivy-py to commit bbcd1f2aefd42e21982d8f044c98c567a304e3b0
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: tantivy
  Building wheel for tantivy (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for tantivy [0m[1;3

## First run setup: Download data and pre-process


In [1]:
import io
import PIL
import duckdb
import lancedb

In [23]:
# !wget https://vectordb-recipes.s3.us-west-2.amazonaws.com/multimodal_video_lance.tar.gz
!tar -xvf multimodal_video_lance.tar.gz
!mv multimodal_video.lance rawdata.lance

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
x multimodal_video.lance/
x multimodal_video.lance/_versions/
x multimodal_video.lance/_latest.manifest
x multimodal_video.lance/data/
x multimodal_video.lance/data/cade1563-8f3a-4e0c-91f5-f2645ce610fe.lance
x multimodal_video.lance/_versions/1.manifest
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Create / Open LanceDB Table

In [24]:
import pyarrow.compute as pc
import lance

db = lancedb.connect("data/video-lancedb")
if "multimodal_video" in db.table_names():
    tbl= db.open_table("multimodal_video")
else:
    # First data processing and full-text-search index
    data = lance.dataset("rawdata.lance").to_table()
    # remove null prompts
    tbl = db.create_table("multimodal_video", data.filter(~pc.field("text").is_null()), mode="overwrite")
    # tbl.create_fts_index(["text"])

## Create CLIP embedding function for the text

In [25]:
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast

MODEL_ID = "openai/clip-vit-base-patch32"

tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID)
processor = CLIPProcessor.from_pretrained(MODEL_ID)

def embed_func(query):
    inputs = tokenizer([query], padding=True, return_tensors="pt")
    text_features = model.get_text_features(**inputs)
    return text_features.detach().numpy()[0]


## Search functions for Gradio

In [27]:
def find_video_vectors(query):
    emb = embed_func(query)
    print(emb)
    code = (
        "import lancedb\n"
        "db = lancedb.connect('data/video-lancedb')\n"
        "tbl = db.open_table('multimodal_video')\n\n"
        f"embedding = embed_func('{query}')\n"
        "tbl.search(embedding).limit(9).to_df()"
    )
    return (_extract(tbl.search(emb).limit(9).to_df()), code)

def find_video_keywords(query):
    code = (
        "import lancedb\n"
        "db = lancedb.connect('data/video-lancedb')\n"
        "tbl = db.open_table('multimodal_video')\n\n"
        f"tbl.search('{query}').where(video_id IN (SELECT DISTINCT video_id FROM videos);).limit(9).to_df()"
    )
    return (_extract(tbl.search(query).limit(9).to_df()), code)

def find_video_sql(query):
    code = (
        "import lancedb\n"
        "import duckdb\n"
        "db = lancedb.connect('data/video-lancedb')\n"
        "tbl = db.open_table('multimodal_video')\n\n"
        "videos = tbl.to_lance()\n"
        f"duckdb.sql('{query}').to_df()"
    )    
    videos = tbl.to_lance()
    return (_extract(duckdb.sql(query).to_df()), code)

def _extract(df):
    video_id_col = "video_id"
    start_time_col = "start_time"
    grid_html = '<div style="display: grid; grid-template-columns: repeat(3, 1fr); grid-gap: 20px;">'

    for _, row in df.iterrows():
        iframe_code = f'<iframe width="100%" height="315" src="https://www.youtube.com/embed/{row[video_id_col]}?start={str(row[start_time_col])}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
        grid_html += f'<div style="width: 100%;">{iframe_code}</div>'

    grid_html += '</div>'
    return grid_html

## Setup Gradio interface

In [28]:
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown('''
            # Multimodal video search using CLIP and LanceDB
            We used LanceDB to store frames every thirty seconds and the title of 13000+ videos, 5 random from each top category from the Youtube 8M dataset. 
            Then, we used the CLIP model to embed frames and titles together. Here are the results.
            ''')
    with gr.Row():
        with gr.Tab("Embeddings"):
            vector_query = gr.Textbox(value="cooking show", show_label=False)
            b1 = gr.Button("Submit")
        # with gr.Tab("Keywords"):
        #     keyword_query = gr.Textbox(value="car show", show_label=False)
        #     b2 = gr.Button("Submit")
        with gr.Tab("SQL"):
            sql_query = gr.Textbox(value="SELECT DISTINCT * from videos WHERE start_time > 0 LIMIT 9", show_label=False)
            b3 = gr.Button("Submit")
    with gr.Row():
        code = gr.Code(label="Code", language="python")
    with gr.Row():
        gallery = gr.HTML()
        
    b1.click(find_video_vectors, inputs=vector_query, outputs=[gallery, code])
    # b2.click(find_video_keywords, inputs=keyword_query, outputs=[gallery, code])
    b3.click(find_video_sql, inputs=sql_query, outputs=[gallery, code])
    
demo.launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




[-3.89971197e-01 -3.30181420e-01  1.23929031e-01 -1.54020816e-01
 -3.39424759e-02  3.82081866e-02 -1.41339991e-02 -8.16765666e-01
 -1.65801570e-01  6.66929781e-02  3.40401269e-02 -3.42541844e-01
  4.30191100e-01 -2.94147074e-01  3.01839799e-01  3.16303447e-02
  4.23838079e-01  1.03325911e-01 -1.10189289e-01 -8.35672319e-02
  1.24252237e-01 -2.23023035e-02 -2.39000067e-01  1.36929601e-02
  2.66885102e-01  1.47141386e-02 -2.65504092e-01 -1.72435865e-01
  5.20671569e-02  2.42489189e-01 -2.66974032e-01 -2.48410404e-01
  1.42571330e-01 -7.65645951e-02 -8.13523412e-01 -2.29454145e-01
  5.31561207e-03 -9.93065983e-02  1.27078354e-01 -3.74382079e-01
  1.95134118e-01  4.36310470e-01  2.92363763e-01 -1.41465604e-01
 -8.24835002e-02  3.31938565e-01  3.64273190e-01  1.16413757e-01
  6.00578427e-01  1.49502158e-02 -2.81053126e-01 -3.71742010e-01
  1.30644947e-01 -5.86193912e-02  2.33290747e-01 -1.28948107e-01
 -3.06639552e-01  2.04674155e-01  1.24294572e-02 -6.36680424e-02
 -4.06730533e-01 -7.38285