In [None]:
!pip install -U datasets sentence-transformers tinydb faiss-cpu tqdm
!pip install gradio -q
import gradio as gr
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np
import faiss
import sqlite3

In [None]:
# Load dataset
ytt = load_dataset("pinecone/yt-transcriptions", split="train", revision="926a45")


In [None]:
# Load transformer model
retriever = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')
embed_dim = retriever.get_sentence_embedding_dimension()

In [4]:
embeddings = []


In [5]:
for entry in tqdm(ytt, desc="Computing embeddings"):
    embedding = retriever.encode(entry['text']).tolist()
    entry['embedding'] = embedding
    embeddings.append(embedding)


Computing embeddings:   0%|          | 0/11298 [00:00<?, ?it/s]

In [6]:
embeddings_np = np.array(embeddings, dtype=np.float32)
index = faiss.IndexFlatL2(embed_dim)
index.add(embeddings_np)

In [7]:
# Connect to the database
conn = sqlite3.connect('yt-transcriptions.db')

In [8]:
# Create a table to store the video information and embeddings
conn.execute("""
CREATE TABLE IF NOT EXISTS videos (
    id INTEGER PRIMARY KEY,
    url TEXT,
    title TEXT,
    text TEXT,
    embedding BLOB
);
""")
conn.commit()

# This is what's breaking and simply not working, populating the db and index with our embeddings

In [20]:
# Populate the SQLite database and the Faiss index with video information and embeddings
for i in range(len(ytt)):
    entry = ytt[i]
    url = entry['url']
    title = entry['title']
    text = entry['text']
    embeddings = retriever.encode([text])[0].tolist()
        
    # Ensure embeddings is a list even if there is only one embedding
    if not isinstance(embeddings[0], list):
        embeddings = [embeddings]
        
    # Store the video information and embedding in the SQLite database
    conn.execute("""
    INSERT OR REPLACE INTO videos (id, url, title, text, embeddings)
    VALUES (?, ?, ?, ?, ?)
    """, (i, url, title, text, sqlite3.Binary(np.array(embeddings, dtype=np.float32))))
        
    # Add the embeddings to the Faiss index
    index.add(np.array(embeddings, dtype=np.float32))
    
conn.commit()



OperationalError: ignored

In [18]:
# Populate the SQLite database and the Faiss index with video information and embeddings
for i in range(len(ytt)):
    print(f"Processing example {i}")
    entry = ytt[i]
    if len(entry) != 2:
        print(f"Unexpected number of items in entry: {len(entry)}")
        print(entry)
    url = entry['url']
    title = entry['title']
    text = entry['text']
    embedding = retriever.encode(text).tolist()
        
    # Store the video information and embedding in the SQLite database
    conn.execute("""
    INSERT OR REPLACE INTO videos (id, url, title, text, embedding)
    VALUES (?, ?, ?, ?, ?)
    """, (i, url, title, text, sqlite3.Binary(np.array(embedding, dtype=np.float32))))
        
    # Add the embedding to the Faiss index
    index.add(np.array(embedding, dtype=np.float32))
    
conn.commit()


Processing example 0
Unexpected number of items in entry: 7
{'video_id': 'ZPewmEu7644', 'text': " hi this is Jeff Dean welcome to applications of deep neural networks of Washington University in this video we're going to look at how we can use ganz to generate additional training data for the latest on my a I course and projects click subscribe in the bell next to it to be notified of every new video Dan's have a wide array of uses beyond just the face generation that you", 'start_second': 0, 'end_second': 20, 'url': 'https://www.youtube.com/watch?v=ZPewmEu7644&t=0s', 'title': 'GANS for Semi-Supervised Learning in Keras (7.4)', 'thumbnail': 'https://i.ytimg.com/vi/ZPewmEu7644/maxresdefault.jpg'}


ValueError: ignored

In [15]:
# Populate the SQLite database and the Faiss index with video information and embeddings
for i in tqdm(range(len(ytt)), desc="Populating SQLite and Faiss"):
    entry = ytt[i]
    url = entry['url']
    title = entry['title']
    text = entry['text']
    embedding = retriever.encode(text).tolist()
        
    # Store the video information and embedding in the SQLite database
    conn.execute("""
    INSERT OR REPLACE INTO videos (id, url, title, text, embedding)
    VALUES (?, ?, ?, ?, ?)
    """, (i, url, title, text, sqlite3.Binary(np.array(embedding, dtype=np.float32))))
        
    # Add the embedding to the Faiss index
    index.add(np.array(embedding, dtype=np.float32))
    
conn.commit()


Populating SQLite and Faiss:   0%|          | 0/11298 [00:00<?, ?it/s]

ValueError: ignored

In [16]:
# Populate the SQLite database and the Faiss index with video information and embeddings
i = 0
for entry in tqdm(ytt, desc="Populating SQLite and Faiss"):
    url = entry['url']
    title = entry['title']
    text = entry['text']
    embedding = retriever.encode(text).tolist()
        
    # Store the video information and embedding in the SQLite database
    conn.execute("""
    INSERT INTO videos (id, url, title, text, embedding)
    VALUES (?, ?, ?, ?, ?)
    """, (i, url, title, text, sqlite3.Binary(np.array(embedding, dtype=np.float32))))
        
    # Add the embedding to the Faiss index
    index.add(np.array(embedding, dtype=np.float32))
    
    i += 1
    
conn.commit()

Populating SQLite and Faiss:   0%|          | 0/11298 [00:00<?, ?it/s]

IntegrityError: ignored

# Semantic search + gradio

In [None]:
# Perform semantic search using the Faiss index
def sem_search(query):
    xq = retriever.encode(query).tolist()
    xq_np = np.array([xq], dtype=np.float32)

    top_k = 5

    # Search in the Faiss index
    distances, indices = index.search(xq_np, top_k)

    results = []

    for i in range(top_k):
        idx = indices[0][i]
        video = conn.execute("""
        SELECT * FROM videos WHERE id=?
        """, (idx,)).fetchone()
        url = video[1]
        text = video[3]
        result = f"{text}\n\n{url}\n\n{'-'*50}\n"
        results.append(result)

    return "\n".join(results)

In [None]:
# Gradio interface
gr.Interface(
    fn=sem_search, 
    inputs=["text"], 
    outputs="text",
    examples=[["Deep Learning"], ["Neural Networks"], ["Coding"], ["Computer Vision"], ["Natural Language Processing"], ["Reinforcement Learning"]],
    examples_per_page=6
).launch()