# HubermanLab Chatbot!

## Installing and importing necessary libraries

In [1]:
!pip install -U langchain-community
!pip install langchain_openai
!pip install langchain_chroma



In [9]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from google.colab import drive, userdata
from openai import OpenAI
import re

## Now, initialize and set up constants

In [4]:
# price is a factor for our company, so we're going to use a low cost model

# Chat model name handed to ChatOpenAI when the LLM is created further down.
MODEL = "gpt-4o-mini"
# NOTE(review): db_name is not referenced in the visible cells — the Chroma
# persistence path is built from "chroma_db" instead. Confirm whether it is still needed.
db_name = "vector_db"

In [6]:
# Mount Google Drive (run this first if you want to save to Drive)
# The document folder and the persisted vector store both live under /content/drive,
# so this has to succeed before the loading cells run.

drive.mount("/content/drive")

Mounted at /content/drive


In [14]:
# Sign in to OpenAI using Secrets in Colab
# The key is read from the Colab secret named OPENAI_API_KEY rather than being hardcoded here;
# it is later passed explicitly to OpenAIEmbeddings and ChatOpenAI.

openai_api_key = userdata.get('OPENAI_API_KEY')

## Time to load the summaries into chunks using LangChain utilities

In [12]:
# Drive folder that holds the episode summary .md files; the Chroma index is
# persisted in a "chroma_db" subfolder of this same directory.
folder_path = "/content/drive/MyDrive/Podcasts/HubermanLab"

# 3. Function to add metadata
def add_description_metadata(doc):
    """Store a topic description, derived from the source file name, on ``doc``.

    The file name (without directory or extension) is expected to look like
    ``Episode_<number>_Essentials <description>``; whatever follows that prefix
    becomes the description. Names that do not start with the prefix are used
    as-is, and an empty result is replaced by ``"General"``.

    Args:
        doc: Object with a ``metadata`` dict containing a ``"source"`` path.

    Returns:
        The same ``doc`` object, with ``metadata["description"]`` set.
    """
    stem = os.path.splitext(os.path.basename(doc.metadata["source"]))[0]

    # Keep only the text after the "Episode_###_Essentials" prefix when it is present.
    prefixed = re.match(r"Episode_\d+_Essentials\s*(.*)", stem)
    topic = prefixed.group(1).strip() if prefixed else stem

    doc.metadata["description"] = topic or "General"
    return doc

# 4. Load ONLY markdown files directly inside folder_path, read as UTF-8 text
text_loader_kwargs = dict(encoding="utf-8")
loader = DirectoryLoader(
    folder_path,
    glob="*.md",  # only md files, no recursion
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)

# Tag every loaded document with its episode description as it comes off the loader
documents = list(map(add_description_metadata, loader.load()))

# 5. Split the documents into overlapping chunks ready for embedding
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# Quick sanity report: chunk count and the distinct descriptions that were derived
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['description'] for doc in documents)}")



Total number of chunks: 290
Document types found: {'Science of Mindsets for Health & Performance Dr. Alia Crum', 'How to Control Hunger, Eating & Satiety', 'How to Control Your Sense of Pain & Pleasure', 'Science of Stress, Testosterone, Aggression & Motivation Dr. Robert Sapolsky', 'Control Pain & Heal Faster With Your Brain', 'How Smell, Taste & Pheromones Shape Behavior', 'How to Increase Motivation & Drive', 'Maximize Productivity, Physical & Mental Health With Daily Tools', 'Using Science to Optimize Sleep, Learning & Metabolism', 'Build Muscle Size, Increase Strength & Improve Recovery', 'The Science of Emotions & Relationships', 'How to Defeat Jet Lag, Shift Work & Sleeplessness', 'Psychedelics for Treating Mental Disorders Dr. Matthew Johnson', 'Understanding & Healing the Mind Dr. Karl Deisseroth', 'Understanding & Treating Addiction Dr. Anna Lembke', 'How Foods & Nutrients Control Our Moods', 'Protocols to Improve Vision & Eyesight', 'How to Focus to Change Your Brain', 'Mast

## Using OpenAIEmbeddings and Chroma, we now create the vector database from the chunks we defined earlier.

In [17]:
embeddings = OpenAIEmbeddings(api_key=openai_api_key)

# Use the same folder as your documents for persistence
db_path = os.path.join(folder_path, "chroma_db")

if os.path.exists(db_path):
    # Reuse the persisted index so the chunks are not re-embedded on every run
    print("Loading existing Chroma vectorstore...")
    vectorstore = Chroma(
        persist_directory=db_path,
        embedding_function=embeddings
    )
    # The directory existing does not mean the index was ever populated
    # (e.g. an earlier run stopped before any chunk was stored). Without this
    # check the retriever would silently return no context at all.
    if vectorstore._collection.count() == 0 and chunks:
        print("Existing vectorstore is empty; indexing chunks...")
        vectorstore.add_documents(chunks)
else:
    # First run: embed every chunk and persist the result under db_path
    print("Creating new Chroma vectorstore...")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=db_path
    )


print(f"Vectorstore contains {vectorstore._collection.count()} documents")

Loading existing Chroma vectorstore...
Vectorstore contains 290 documents


In [18]:
# Let's investigate the vectors: how many are stored and how wide each one is

collection = vectorstore._collection
count = collection.count()

# Fetch a single record together with its embedding to measure the dimensionality
sample = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 290 vectors with 1,536 dimensions in the vector store


## Now we will bring this up in Gradio using the Chat interface...


In [31]:
# Chat model: the low-cost model named by MODEL, with a moderately creative temperature
llm = ChatOpenAI(
    api_key=openai_api_key,
    temperature=0.7,
    model_name=MODEL,
)

# Conversation memory; the chain reads and writes the running dialogue under "chat_history"
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# The retriever wraps the vector store for RAG; k is how many chunks are fetched per question
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Wire the LLM, the retriever and the memory together into one conversational RAG chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

# Collect the distinct topic descriptions stored in the vector store's metadata, sorted
# so that the numbering shown to the user is stable
all_docs = vectorstore._collection.get()["metadatas"]
descriptions = sorted({doc.get("description", "General") for doc in all_docs})

# Helper: render topics as a 1-based numbered list
def format_topics_numbered(topics):
    """Return ``topics`` as newline-separated lines of the form ``"<n>. <topic>"``.

    Numbering starts at 1; an empty iterable yields an empty string.
    """
    numbered_lines = []
    for position, topic in enumerate(topics, start=1):
        numbered_lines.append(f"{position}. {topic}")
    return "\n".join(numbered_lines)

# Helper: list the topics whose title contains a keyword
def filter_topics(keyword):
    """Return a numbered list of the entries in ``descriptions`` that contain ``keyword``.

    The comparison is case-insensitive and ignores whitespace around the keyword.
    When nothing matches, a fixed "no topics found" message is returned instead.
    """
    needle = keyword.lower().strip()

    hits = []
    for title in descriptions:
        if needle in title.lower():
            hits.append(title)

    if hits:
        return "Matching topics:\n" + format_topics_numbered(hits)
    return "No topics found for that keyword."

# A greeting must open the message and end on a word boundary, so that words which
# merely contain "hi"/"hey" ("this", "which", "they", "history") are not greetings.
_GREETING_RE = re.compile(r"(?:hello|hi|hey|hola|good morning|good afternoon)\b")

# A topic pick is a message that is nothing but a number, optionally decorated:
# "3", "3rd", "#3", "topic 3", "number 3", "the 3rd one". Numbers inside a real
# question ("is 8 hours of sleep enough?") must fall through to normal retrieval.
_TOPIC_PICK_RE = re.compile(
    r"(?:the\s+)?(?:(?:topic|number|#)\s*)?(\d+)(?:st|nd|rd|th)?(?:\s+(?:topic|one))?[.!]?"
)

# Main chat function
def chat(question, history):
    """Answer one chat message, handling a few built-in commands before falling back to RAG.

    Handled in order:
      1. a greeting at the start of the message -> fixed help text;
      2. "show topics" / "topics"               -> numbered list of all topics;
      3. "topics <keyword>"                     -> numbered list of matching topics;
      4. a bare topic number (e.g. "3", "#3")   -> answer restricted to that topic;
      5. anything else                          -> the general conversation chain.

    Args:
        question: The user's message text.
        history: Chat history supplied by the Gradio ChatInterface. It is not used
            here; the chains keep their own history in the module-level ``memory``.

    Returns:
        The reply text to show in the chat window.
    """
    # NOTE(review): ``memory`` is a single module-level object, so every caller of this
    # function shares one conversation history — confirm that is acceptable when the
    # app is exposed through a public link.
    q = question.strip().lower()

    # 1. Greeting detection (whole word at the start of the message only)
    if _GREETING_RE.match(q):
        return 'Hello! Type "show topics" to see all available podcast topics or type "topics apples" to see all topics with apples.'

    # 2. Show all topics
    if q in ["show topics", "topics"]:
        return "Available topics:\n" + format_topics_numbered(descriptions)

    # 3. Filter topics by keyword
    if q.startswith("topics "):
        keyword = q[len("topics "):].strip()
        return filter_topics(keyword)

    # 4. Numbered topic selection (the whole message must be the selection)
    pick = _TOPIC_PICK_RE.fullmatch(q)
    if pick:
        idx = int(pick.group(1)) - 1  # the list shown to the user is 1-based
        if not 0 <= idx < len(descriptions):
            return f"Invalid topic number. Please pick between 1 and {len(descriptions)}."

        topic = descriptions[idx]

        # Restrict retrieval to chunks whose metadata description is the chosen topic
        topic_retriever = vectorstore.as_retriever(
            search_kwargs={"k": 10, "filter": {"description": topic}}
        )
        conversation_chain_with_topic = ConversationalRetrievalChain.from_llm(
            llm=llm, retriever=topic_retriever, memory=memory
        )

        # Use the topic itself as the query for retrieval
        result = conversation_chain_with_topic.invoke({"question": topic})
        return f"(Topic: {topic})\n" + result["answer"]

    # 5. Normal RAG retrieval (original casing of the question is preserved)
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

# Build the Gradio chat UI around chat() and start serving it right away.
# Note that `view` is bound to whatever launch() returns, not to the ChatInterface itself.
view = gr.ChatInterface(
    chat,
    type="messages",
    title="HubermanLab Chatbot",
).launch(inbrowser=True)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5288297ce8c64c8ffc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
