# YouTube Channel Retrieval with Vector DBs

In [7]:
# Uncomment to install the dependencies, then restart the kernel.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install pytube
# %pip install openai-whisper
# %pip install ffmpeg-python
# %pip install nltk
# %pip install sentence_transformers
# NOTE: UMAP is published on PyPI as `umap-learn`; the package named `umap` is unrelated.
# %pip install bertopic umap-learn hdbscan scikit-learn
# %pip install nbformat
# %pip install redis

### Load all necessary libs

In [9]:
from pytube import YouTube, Channel
import pytube
from nltk.tokenize import sent_tokenize
import whisper
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Workaround for pytube returning an empty video list for a channel:
# overwrite pytube's 'ANDROID' default-client entry with the 'WEB' client entry.
# NOTE(review): `_default_clients` is a private pytube attribute, so this may
# break after a pytube upgrade — confirm against the installed version.
pytube.innertube._default_clients['ANDROID'] = pytube.innertube._default_clients['WEB']

In [6]:
# use tiny model for speed
whisper_model = whisper.load_model("tiny")

In [None]:
# create embeddings from the documents using multi-lingual model
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

### Scrape and transcribe audio for the last 10 Lex Fridman podcasts

In [10]:
# Define the YouTube channel to scrape (plain string: there is nothing to interpolate).
c = Channel('https://www.youtube.com/c/lexfridman/videos')

In [11]:
# check how many videos are in the channel
len(c.video_urls)

781

In [22]:
# Collect title, publish date, description and watch URL for the first 10
# videos returned by the channel. The four lists are index-aligned.

video_titles = []
video_descriptions = []
video_dates = []
video_urls = []

for video in c.videos[:10]:
    video_titles.append(video.title)
    video_dates.append(video.publish_date)
    video_descriptions.append(video.description)
    video_urls.append(video.watch_url)

In [23]:
video_titles[:3]

['Jared Kushner: Israel, Palestine, Hamas, Gaza, Iran, and the Middle East | Lex Fridman Podcast #399',
 'Mark Zuckerberg: First Interview in the Metaverse | Lex Fridman Podcast #398',
 'Greg Lukianoff: Cancel Culture, Deplatforming, Censorship & Free Speech | Lex Fridman Podcast #397']

In [25]:
# Download the audio of each video, transcribe it with Whisper, and save every
# transcript to "<NNN>_<title>.txt" in the working directory.
# Each URL is paired with its own title (the lists are index-aligned), so the
# 1-based file counter `i` is never used to index `video_titles`.
episodes = list(zip(video_urls[:10], video_titles))
texts = []
for i, (url, title) in enumerate(tqdm(episodes), start=1):
    # The same temporary audio file is overwritten on every iteration.
    path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="audio.mp4")
    transcription = whisper_model.transcribe(path)
    # Titles contain characters such as '|', ':' or '/' that are invalid or
    # unsafe in file names, so replace everything outside a conservative set.
    safe_title = "".join(ch if ch.isalnum() or ch in " -_" else "_" for ch in title)
    # The transcript is a str: write it in text mode with an explicit encoding.
    with open(f"./{str(i).zfill(3)}_{safe_title}.txt", "w", encoding="utf-8") as f:
        f.write(transcription["text"])
    texts.append(transcription["text"])

In [74]:
texts[0]

" It's time. Let's reveal the 12,024 Shuman era calendar with a big bang. The Cotskzard calendar is special because we're adding 10,000 years to include all of humanity no matter their culture or origin. 12,000 years ago, humans first started working together on a larger scale, laying the foundation for civilization and the future of us all, a much better representation of how far our species has come. This year, we're looking at the cosmos and all the possible life that could be thriving on myriots of worlds, hosting radically different animals and beings from the ones on Earth. Where in the universe might life exist and how would different cosmic environments affect the basic rules of life? Join us in 12,024 and find out. Explore strange cosmic habitats and extraterrestrial life forms on 12 or inspiring pages. There's also lots of room for you to keep track of your life here on Earth. Add some serious space magnificence to your home, but be careful. The extremely radiant cover may bl

### Topic modelling

Understand what the videos are about and make sure it all makes sense.

In [63]:
# Split every transcript into sentences; `titles[k]` is the title of the video
# that sentence `docs[k]` came from.
docs, titles = [], []
for title, text in zip(video_titles[:10], texts):
    for sentence in sent_tokenize(text):
        docs.append(sentence)
        titles.append(title)

In [65]:
# Create embeddings from the documents
embeddings = sentence_model.encode(docs)

In [66]:
# Components handed to BERTopic below: term vectorizer, dimensionality
# reduction, and clustering.
vectorizer = CountVectorizer(stop_words="english")
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,  # fixed seed
)
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=2,
    metric='euclidean',
    cluster_selection_method='eom',
)

In [69]:
# Build the BERTopic model from the sub-models, then fit it on the sentences
# using the embeddings computed earlier.
topic_model = BERTopic(
    vectorizer_model=vectorizer,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=sentence_model,
)
topic_model = topic_model.fit(docs, embeddings)

In [70]:
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,68,-1_know_let_pen_paradox,"[know, let, pen, paradox, better, time, today,...",[We explained these specialized superweapons i...
1,0,131,0_cells_immune_virus_smallpox,"[cells, immune, virus, smallpox, viruses, vary...","[First, responder immune cells invade the tumo..."
2,1,118,1_time_humanity_wait_make,"[time, humanity, wait, make, difference, happe...",[And what are the best ways to make a positive...
3,2,109,2_brilliant_science_lessons_free,"[brilliant, science, lessons, free, videos, da...","[To get hands-on with cuts, cut-sucks lessons ..."
4,3,108,3_population_people_fertility_rates,"[population, people, fertility, rates, young, ...",[China's working-age population is predicted t...
5,4,79,4_life_years_universe_earth,"[life, years, universe, earth, big, billion, b...",[Where in the universe might life exist and ho...
6,5,71,5_black_hole_holes_charge,"[black, hole, holes, charge, mass, energy, mat...","[Anti-black hole., Since a black hole has mass..."
7,6,65,6_quasars_stars_galaxies_galaxy,"[quasars, stars, galaxies, galaxy, gas, quasar...",[But these enormous distances meant that quasa...
8,7,56,7_nuclear_launch_minutes_war,"[nuclear, launch, minutes, war, missiles, miss...",[Our silent launch sequence takes five minutes...
9,8,50,8_singularity_horizon_dark_energy,"[singularity, horizon, dark, energy, future, e...",[So the singularity is actually in your future...


In [71]:
# Hand-picked subset of topic ids, to keep the plot readable
topics_of_interest = [
    33, 1, 8, 9, 0, 30, 27, 19, 16,
    28, 44, 11, 21, 23, 26, 2, 37, 34, 3, 4, 5,
    15, 17, 22, 38,
]

# Label shown for each point: bold video title + first 100 characters of the sentence
adjusted_docs = [
    f"<b>{title}</b><br>{doc[:100]}..."
    for doc, title in zip(docs, titles)
]

# Interactive document plot restricted to the selected topics
topic_model.visualize_documents(
    adjusted_docs,
    topics=topics_of_interest,
    embeddings=embeddings,
    custom_labels=True,
    hide_annotations=False,
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

### Vectorise data and save in Vector Store

Any vector store would work here, but Redis is a good fit for production environments that need both persistence and low latency.

In [78]:
import pandas as pd
import numpy as np
import redis
from redis.commands.search.field import (
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

In [80]:
# Assumes a local Redis instance is running with the RediSearch and RedisJSON modules loaded.
host = "localhost"
port = 6379
def create_client() -> redis.client.Redis:
    """Create a Redis client for the server at module-level `host`:`port`.

    The password is read from the ``REDIS_PASSWORD`` environment variable
    rather than being hard-coded in the notebook; when the variable is unset,
    no password is sent.

    Returns:
        A `redis.Redis` client on database 0 with ``decode_responses=True``.
    """
    import os  # local import: only needed to read the credential

    client = redis.Redis(host=host, port=port,
                         password=os.environ.get("REDIS_PASSWORD"),
                         db=0, decode_responses=True)
    return client

In [81]:
client = create_client()
client.ping()

True

In [83]:
def danger_flushall(client: redis.client.Redis):
    """Issue `flushall()` on `client`.

    WARNING/DANGER: this deletes ALL docs and indexes on the server — use it
    only against a throwaway/local instance.
    """
    # WARNING/DANGER: delete ALL docs and indexes !!
    client.flushall()

In [84]:
danger_flushall(client)

In [85]:
client.keys()

[]

In [92]:
def save_docs(client: redis.client.Redis, docs: list, prefix: str, idx_start: int = 1):
    """Store each item of `docs` as a JSON document ``{"document": <item>}``.

    Keys have the form ``<prefix>:<index>``, where the index starts at
    `idx_start` and is zero-padded to five digits. All writes are queued on a
    single pipeline and sent together.

    Returns:
        The list of results returned by executing the pipeline.
    """
    pipe = client.pipeline()
    for offset, text in enumerate(docs):
        key = f"{prefix}:{idx_start + offset:05}"
        pipe.json().set(key, "$", {"document": text})
    return pipe.execute()

In [93]:
_ = save_docs(client, docs, "lex")

In [94]:
client.keys()[:5]

['lex:00005', 'lex:00782', 'lex:00953', 'lex:00038', 'lex:00791']

In [95]:
client.json().get("lex:00005")

{'document': "This year, we're looking at the cosmos and all the possible life that could be thriving on myriots of worlds, hosting radically different animals and beings from the ones on Earth."}

In [97]:
# Embedding model for the vectors stored in Redis and for the search queries.
# NOTE: this is a different model from `sentence_model`, which was used for
# topic modelling.
embedder = SentenceTransformer('all-mpnet-base-v2')
# Encode a sample string to find the embedding length; it is passed to
# `create_index` as the vector field dimension.
vector_dim = len(embedder.encode('hello world'))
print(vector_dim)

768


In [100]:
def create_index(client: redis.client.Redis,
                 key_prefix: str,
                 vector_dim: int) -> tuple:
    """Create a search index over the JSON documents stored under `key_prefix`.

    The index is named ``idx:<key_prefix>_vss`` and covers keys starting with
    ``<key_prefix>:``. It exposes two fields:

    * ``document`` — text field read from ``$.document``
    * ``vector``   — FLAT / FLOAT32 / COSINE vector field of size `vector_dim`,
      read from ``$.document_embeddings``

    Returns:
        ``(result of create_index, index name)``.
    """
    idx_name = f"idx:{key_prefix}_vss"

    text_field = TextField("$.document", as_name="document")
    vector_options = {
        "TYPE": "FLOAT32",
        "DIM": vector_dim,
        "DISTANCE_METRIC": "COSINE",
    }
    vector_field = VectorField(
        "$.document_embeddings",
        "FLAT",
        vector_options,
        as_name="vector",
    )

    definition = IndexDefinition(prefix=[f"{key_prefix}:"], index_type=IndexType.JSON)
    index = client.ft(idx_name)
    res = index.create_index(fields=(text_field, vector_field), definition=definition)
    return res, idx_name

In [26]:
create_index(client, "lex", vector_dim)

In [27]:
embeddings = embedder.encode(docs)

In [106]:
# Convert to plain lists of float32 values for JSON storage.
# `np.asarray` accepts both an ndarray and a list, so re-running this cell is
# safe (calling `.astype` on the already-converted list would fail).
embeddings = np.asarray(embeddings, dtype=np.float32).tolist()

In [111]:
# Read the stored sentences back in sorted key order.
keys = sorted(client.keys("lex:*"))
# mget with a JSONPath returns one list of matches per key; flatten them.
nested_documents = client.json().mget(keys, "$.document")
documents = [item for sublist in nested_documents for item in sublist]
# Embeddings are attached to keys by position, so the sorted key order must
# reproduce the order of `docs`; fail loudly if it does not.
assert documents == docs, "Redis key order does not match the order of `docs`"

In [96]:
def save_embeddings(client: redis.client.Redis, keys: list, embeddings: list):
    """Write ``embeddings[k]`` to ``$.document_embeddings`` of the JSON document at ``keys[k]``.

    Keys and embeddings are paired by position (extra items in the longer list
    are ignored). All writes are queued on a single pipeline.

    Returns:
        The list of results returned by executing the pipeline.
    """
    pipe = client.pipeline()
    for redis_key, vector in zip(keys, embeddings):
        pipe.json().set(redis_key, "$.document_embeddings", vector)
    results = pipe.execute()
    return results

In [115]:
_ = save_embeddings(client, keys, embeddings)

### Query Redis

In [118]:
def create_query(top_k: int = 5, condition: str = "*", max_results: int = 100):
    """Build a KNN query against the ``vector`` field.

    Args:
        top_k: number of nearest neighbours requested in the KNN clause.
        condition: filter expression placed before the KNN clause ("*" by default).
        max_results: page size of the result set.

    Returns:
        A `Query` sorted by ``vector_score`` that returns ``vector_score``,
        ``id`` and ``document``, using query dialect 2. The query vector is
        supplied at search time through the ``$query_vector`` parameter.
    """
    knn_expression = f"({condition})=>[KNN {top_k} @vector $query_vector AS vector_score]"
    query = Query(knn_expression)
    query = query.sort_by("vector_score")
    query = query.return_fields("vector_score", "id", "document")
    query = query.dialect(2)
    # by default Redis uses 10 as max results
    query = query.paging(0, max_results)
    return query


def find_similar_vectors(client: redis.client.Redis, key_prefix: str, query: Query,
                         queries: list, encoded_queries: list, extra_params: dict = None):
    """Run `query` once per encoded query vector and collect the hits.

    Args:
        client: Redis client used to run the searches.
        key_prefix: prefix used to derive the index name ``idx:<key_prefix>_vss``.
        query: search query expecting a ``query_vector`` parameter.
        queries: original query texts, index-aligned with `encoded_queries`.
        encoded_queries: one embedding per query; each is converted to float32
            bytes before being sent.
        extra_params: optional additional query parameters; they are merged
            over the ``query_vector`` parameter. Defaults to no extra parameters.

    Returns:
        A flat list of dicts with keys ``query``, ``score``, ``id`` and
        ``document``. ``score`` is ``1 - vector_score`` rounded to two decimals
        (presumably a cosine similarity, assuming the index uses the COSINE
        distance metric).
    """
    # Avoid a mutable default argument; treat None as "no extra parameters".
    extra_params = extra_params or {}
    # The index handle does not depend on the loop variable, so build it once.
    search_index = client.ft(f"idx:{key_prefix}_vss")

    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        query_params = {
            "query_vector": np.array(encoded_query, dtype=np.float32).tobytes()
        } | extra_params
        result_docs = search_index.search(query, query_params).docs
        for doc in result_docs:
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append(
                {
                    "query": queries[i],
                    "score": vector_score,
                    "id": doc.id,
                    "document": doc.document,
                }
            )
    return results_list

In [135]:
queries = ["What is the meaning of life?"]
encoded_queries = embedder.encode(queries)

In [136]:
q = create_query()

In [137]:
results = find_similar_vectors(client, "lex", q, queries, encoded_queries)

In [138]:
import pandas as pd

In [139]:
pd.DataFrame(results).head()["document"].values[0]

'We have only one life to explore, be free, travel, have fun, accomplish something and try to be happy.'

In [140]:
queries = ["What is the MPN of most recent iPhone 15 Pro?"]
encoded_queries = embedder.encode(queries)

In [141]:
q = create_query()
results = find_similar_vectors(client, "lex", q, queries, encoded_queries)
pd.DataFrame(results).head()

Unnamed: 0,query,score,id,document
0,What is the MPN of most recent iPhone 15 Pro?,0.36,lex:00740,Imagine that your local computer repair shop c...
1,What is the MPN of most recent iPhone 15 Pro?,0.27,lex:00314,"At this point, so far in the future that givin..."
2,What is the MPN of most recent iPhone 15 Pro?,0.18,lex:00152,"In 2023, it's 45."
3,What is the MPN of most recent iPhone 15 Pro?,0.17,lex:00805,We put a link and further reading in the descr...
4,What is the MPN of most recent iPhone 15 Pro?,0.17,lex:00595,10 billion trillion times the present age of t...


In [142]:
pd.DataFrame(results).head()["document"].values[0]

'Imagine that your local computer repair shop could build a pristine iPhone 11 with just the parts lying around and that teenagers are asked to build a new iPhone 5 for homework.'

### Next: connect the query results to an LLM

Passing the retrieved sentences to an LLM as context would turn this into a RAG-style (retrieval-augmented generation) app.