In [None]:
# install required packages
!pip install -q transformers peft accelerate bitsandbytes safetensors sentencepiece weaviate-client langchain sentence-transformers tiktoken youtube-transcript-api

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/215.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/215.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# fixing unicode error in google colab
import locale

# Make `locale.getpreferredencoding` always report UTF-8. The replacement keeps
# the optional `do_setlocale` argument of the stdlib function so that callers
# using `locale.getpreferredencoding(False)` do not hit a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# import dependencies
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from langchain.text_splitter import TokenTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.vectorstores import Weaviate
import weaviate

In [None]:
import os

# Read the cluster URL and API key from the environment so that real
# credentials never have to be typed into (and saved with) the notebook.
# The original placeholder strings are kept as fallbacks when the
# variables are not set.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL", "WEAVIATE_CLUSTER_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY", "WEAVIATE_API_KEY")

# connect to the Weaviate cluster using API-key authentication
client = weaviate.Client(
    url=WEAVIATE_URL, auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY)
)

## Local embedding and LLM models
I am most familiar with the LangChain LLM framework, so we will be using it to ingest documents as well as retrieve them. We will be using the sentence-transformers/all-mpnet-base-v2 embedding model and the zephyr-7b-alpha LLM. Both of these models are open source and available on HuggingFace. The implementation code for these two models in LangChain was kindly borrowed from the following repository:

https://github.com/aigeek0x0/zephyr-7b-alpha-langchain-chatbot

We will begin by defining the embedding model, which can be easily retrieved from HuggingFace using the following code:

In [None]:
# specify embedding model (using huggingface sentence transformer)
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
# run the embedding model on the GPU when one is available, otherwise fall
# back to the CPU instead of failing on a CUDA-less runtime
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name, model_kwargs=model_kwargs
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Ingest Huberman Lab podcasts into Weaviate
I have learned that each channel on YouTube has an RSS feed that can be used to fetch links to the latest 10 videos. As the RSS feed returns XML, we need to employ a simple Python script to extract the links.

In [None]:
import requests
import xml.etree.ElementTree as ET

# RSS feed of the YouTube channel whose videos are ingested
URL = "https://www.youtube.com/feeds/videos.xml?channel_id=UC2D2CMWXMOVWx7giW1n3LIg"

# Bound the request time and fail loudly on an HTTP error, so that an error
# page is never handed to the XML parser.
response = requests.get(URL, timeout=30)
response.raise_for_status()
xml_data = response.content

# Parse the XML data
root = ET.fromstring(xml_data)

# Define the namespace
namespaces = {
    "atom": "http://www.w3.org/2005/Atom",
    "media": "http://search.yahoo.com/mrss/",
}

# Extract YouTube links
# The first match is dropped by the `[1:]` slice; presumably it is the
# feed-level link to the channel itself rather than a video — TODO confirm.
youtube_links = [
    link.get("href")
    for link in root.findall(".//atom:link[@rel='alternate']", namespaces)
][1:]

Now that we have the links to the videos at hand, we can use the YoutubeLoader from LangChain to retrieve the captions. Next, as with most RAG ingestion pipelines, we have to chunk the text into smaller pieces before ingestion. We can use the text splitter functionality that is built into LangChain.

In [None]:
from langchain.document_loaders import YoutubeLoader

# load the documents of every video and flatten them into a single list
all_docs = [
    doc
    for video_url in youtube_links
    for doc in YoutubeLoader.from_youtube_url(video_url).load()
]

# cut the documents into 128-token chunks with no overlap between chunks
text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=0)
split_docs = text_splitter.split_documents(all_docs)

# embed the chunks and store them in Weaviate
vector_db = Weaviate.from_documents(
    split_docs,
    embeddings,
    client=client,
    by_text=False,
)

You can test the vector retriever using the following code:



In [None]:
# fetch the three chunks closest to the question; being the last expression
# of the cell, the result is displayed as the cell output
vector_db.similarity_search("Which are tools to bolster your mental health?", k=3)


[Document(page_content=" agree as a species that we are most interested in and we are going to specifically fund research that is aimed toward developing further protocols for mental health physical health and performance and those protocols will be distributed through all channels not just the premium channel but through all channels hubman Lab podcast and other media channels so the idea here is to give you information to your burning questions in depth and allow you the opportunity to support the kind of research that provides those kinds of answers in the first place if you'd like to sign up for the hum lab premium channel again there's a cost of $10 per month or you can pay $100 Upfront for the entire", metadata={'source': 'jGZ1mR9uLU0'}),
 Document(page_content=' hundreds of guided meditations mindfulness trainings yoga needer sessions and more I started meditating over three decades ago and what I found in the ensuing years is that sometimes it was very easy for me to do my dail

## Setting up a local LLM
This part of the code was completely copied from the example provided by the AI Geek. It loads the `zephyr-7b-alpha-sharded` model and its tokenizer from HuggingFace and loads it as a LangChain LLM module.

In [None]:
# specify the huggingface model name; it is passed to both the tokenizer
# and the model loader below
model_name = "anakin87/zephyr-7b-alpha-sharded"

# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    Load a causal language model with 4-bit bitsandbytes quantization.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        # Explicitly request 4-bit loading: the bnb_4bit_* options below only
        # tune how the 4-bit quantization is carried out.
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )
    return model

# function for initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    Load the tokenizer for ``model_name`` through ``AutoTokenizer``.

    Token type ids are switched off and the beginning-of-sentence token id
    is set to 1 before the tokenizer is handed back.

    :param model_name: Name or path of the model for tokenizer initialization.
    :return: Initialized tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(
        model_name,
        return_token_type_ids=False,
    )
    # beginning-of-sentence token id
    tok.bos_token_id = 1
    return tok

# initialize tokenizer
tokenizer = initialize_tokenizer(model_name)
# load model
model = load_quantized_model(model_name)
# specify stop token ids (not referenced by the pipeline call below)
stop_token_ids = [0]

# build huggingface pipeline for using zephyr-7b-alpha
# The result is bound to `generation_pipeline` rather than `pipeline`, so the
# imported `pipeline` factory is not overwritten and this cell can be re-run.
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # the end-of-sequence token doubles as the padding token
    pad_token_id=tokenizer.eos_token_id,
)

# specify the llm
llm = HuggingFacePipeline(pipeline=generation_pipeline)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

I haven’t played around yet, but you could probably reuse this code to load other LLMs from HuggingFace.

## Building a conversation chain
Now that we have our vector retrieval and the LLM ready, we can implement a retrieval-augmented chatbot in only a couple of lines of code.

In [None]:
# combine the local LLM with the Weaviate retriever into a "stuff" QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_db.as_retriever(),
)

Let’s now test how well it works:



In [None]:
# run a first question through the chain and print what it returns
response = qa_chain.run("How does one increase their mental health?")
print(response)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

 agree as a species that we are most interested in and we are going to specifically fund research that is aimed toward developing further protocols for mental health physical health and performance and those protocols will be distributed through all channels not just the premium channel but through all channels hubman Lab podcast and other media channels so the idea here is to give you information to your burning questions in depth and allow you the opportunity to support the kind of research that provides those kinds of answers in the first place if you'd like to sign up for the hum lab premium channel again there's a cost of $10 per month or you can pay $100 Upfront for the entire

 quality therapy can improve our mental health and the overall landscape of our lives in fact I view quality therapy as important as physical e

Let’s try another one:



In [None]:
# run a second question through the same chain and print what it returns
response = qa_chain.run(
    "How to increase your willpower?"
)
print(response)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

 learn how to be a lot better at pursuing whatever goals will help us survive in our environment and the brain has to be adaptive to that environmental input because the environment's always changing if it had only one way of pursuing its goals then we would never survive so it has to be the case that the planning rational observing part of the brain is actually responsive to what works in your context for goal Pursuit so again I'm summarizing other people's here but that's how I that's how I see it yeah I completely agree that emotions Drive the more um let's call tactical circuitry of the prefrontal cortex of course we should be fair to the the

 a necessary level of demand for whatever goal you have with the perceptions of the resources and sometimes those resources are your internal like just confidence you know or somet