Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ brain = [
"leidenalg>=0.10.0", # Leiden community detection (replaces graspologic)
"umap-learn>=0.5.0", # UMAP 3D layout
]
youtube = [
"yt-dlp>=2024.1.0", # YouTube video metadata + subtitle download
"youtube-transcript-api>=1.2.4", # Primary transcript API (instance-based)
"certifi", # SSL certificates for macOS
]
Comment on lines +60 to +64
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

certifi lacks a version constraint.

All other packages in this group specify a minimum version. An unbounded certifi requirement can resolve to any release, including future ones with breaking changes, which reduces reproducibility.

📌 Proposed fix
-    "certifi",  # SSL certificates for macOS
+    "certifi>=2023.0.0",  # SSL certificates for macOS
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
youtube = [
"yt-dlp>=2024.1.0", # YouTube video metadata + subtitle download
"youtube-transcript-api>=1.2.4", # Primary transcript API (instance-based)
"certifi", # SSL certificates for macOS
]
youtube = [
"yt-dlp>=2024.1.0", # YouTube video metadata + subtitle download
"youtube-transcript-api>=1.2.4", # Primary transcript API (instance-based)
"certifi>=2023.0.0", # SSL certificates for macOS
]
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pyproject.toml` around lines 60 - 64, The certifi entry in the youtube
dependency list has no version constraint; update the pyproject.toml so the
"youtube" array's "certifi" element includes a minimum version (e.g., change
"certifi" to a pinned minimum like "certifi>=<appropriate-version>") to match
the other dependencies and ensure reproducible installs; locate the "youtube"
list in pyproject.toml and modify the "certifi" item accordingly.

ast = [
"tree-sitter-languages>=1.9.0", # Better code chunking (falls back to line-based if unavailable)
]
Expand Down
77 changes: 69 additions & 8 deletions scripts/index_youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@
from typing import Any

import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi

# Add brainlayer to path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from brainlayer.embeddings import embed_chunks
from brainlayer.paths import DEFAULT_DB_PATH
from brainlayer.pipeline.chunk import Chunk
from brainlayer.pipeline.classify import ContentType, ContentValue
from brainlayer.vector_store import VectorStore
Expand All @@ -44,29 +46,77 @@
)
log = logging.getLogger(__name__)

DEFAULT_DB = Path.home() / ".local" / "share" / "brainlayer" / "brainlayer.db"
DELAY_BETWEEN_VIDEOS = 10 # seconds, to avoid rate limiting
TARGET_CHUNK_CHARS = 1200 # ~300-400 tokens for bge-large (512 token limit)
CHUNK_OVERLAP_CHARS = 200


# ---------------------------------------------------------------------------
# Transcript extraction via yt-dlp
# Transcript extraction via youtube-transcript-api (primary — avoids yt-dlp 429s)
# ---------------------------------------------------------------------------

_yt_api = YouTubeTranscriptApi() # Reuse single instance across calls
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

_yt_api doesn't benefit from the certifi SSL fix it's supposed to use.

The PR explicitly adds certifi to fix macOS SSL certificate errors. However, _yt_api uses requests internally with the default system SSL context. The certifi-backed context is only applied to the urllib.request.urlopen calls in _fetch_json3_transcript / _fetch_vtt_transcript. On macOS without proper certificates, the primary API path will still fail with SSL errors.

The library supports passing a custom requests.Session via http_client to fix this:

🛡️ Proposed fix to wire certifi into the primary API path
-_yt_api = YouTubeTranscriptApi()  # Reuse single instance across calls
+try:
+    import certifi
+    from requests import Session as _Session
+    _http = _Session()
+    _http.verify = certifi.where()
+    _yt_api = YouTubeTranscriptApi(http_client=_http)
+except ImportError:
+    _yt_api = YouTubeTranscriptApi()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/index_youtube.py` at line 58, The YouTubeTranscriptApi instance
(_yt_api) is still using the system SSL context and ignores the certifi fix;
create a requests.Session configured to use certifi (set session.verify =
certifi.where()) and pass that session into YouTubeTranscriptApi via its
http_client parameter when instantiating _yt_api so the primary API path uses
the certifi-backed SSL; reuse this session for the singleton _yt_api and keep
the existing urllib.request-based certifi usage in
_fetch_json3_transcript/_fetch_vtt_transcript unchanged.



def get_transcript_via_api(video_id: str) -> list[dict] | None:
"""Fetch transcript using youtube-transcript-api (v1.2.4+).

Uses a different internal YouTube endpoint than yt-dlp,
so it often works when yt-dlp subtitle downloads are 429'd.
Returns list of {"text": str, "start": float, "duration": float}.
"""
try:
transcript_list = _yt_api.list(video_id)
transcript = None
try:
transcript = transcript_list.find_manually_created_transcript(["en"])
except Exception:
try:
transcript = transcript_list.find_generated_transcript(["en"])
except Exception:
pass
Comment on lines +68 to +77
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Bare except Exception in the inner blocks masks network errors and other non-transcript-availability failures.

If find_manually_created_transcript raises a network error (e.g., ConnectionError, Timeout), execution silently falls through to find_generated_transcript instead of propagating the real failure. The correct exceptions to catch are NoTranscriptFound (and optionally TranscriptsDisabled), available via `from youtube_transcript_api import NoTranscriptFound`.

Additionally, since find_transcript already prefers manually created transcripts over generated ones by default, the entire nested try/except can be collapsed significantly.

♻️ Proposed refactor
+from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound
 
 def get_transcript_via_api(video_id: str) -> list[dict] | None:
     try:
-        transcript_list = _yt_api.list(video_id)
-        transcript = None
-        try:
-            transcript = transcript_list.find_manually_created_transcript(["en"])
-        except Exception:
-            try:
-                transcript = transcript_list.find_generated_transcript(["en"])
-            except Exception:
-                pass
-
-        if not transcript:
-            return None
-
-        fetched = transcript.fetch()
+        fetched = _yt_api.fetch(video_id, languages=["en"])
         segments = []
         for entry in fetched:

If you need to distinguish manual vs. generated for logging, use NoTranscriptFound specifically:

+        try:
+            transcript = transcript_list.find_manually_created_transcript(["en"])
+        except NoTranscriptFound:
+            transcript = transcript_list.find_generated_transcript(["en"])
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/index_youtube.py` around lines 68 - 77, The nested bare "except
Exception" blocks around transcript_list.find_manually_created_transcript and
find_generated_transcript mask real errors; update the code to either call
transcript_list.find_transcript (which already prefers manual over generated) or
explicitly catch only youtube_transcript_api exceptions like NoTranscriptFound
(and optionally TranscriptsDisabled) from "from youtube_transcript_api import
NoTranscriptFound, TranscriptsDisabled", then handle NoTranscriptFound by
falling back to the generated transcript or logging, but let other exceptions
(e.g., ConnectionError/Timeout) propagate; locate references to _yt_api.list,
transcript_list.find_manually_created_transcript,
transcript_list.find_generated_transcript, and transcript_list.find_transcript
to implement this change.


if not transcript:
return None

fetched = transcript.fetch()
segments = []
for entry in fetched:
text = entry.text.strip()
if text and text not in ("[Music]", "[Applause]", "[Laughter]"):
segments.append({
"text": text,
"start": entry.start,
"duration": entry.duration,
})
return segments if segments else None
except Exception as e:
log.warning(f" youtube-transcript-api failed: {e}")
return None


# ---------------------------------------------------------------------------
# Transcript extraction via yt-dlp (fallback)
# ---------------------------------------------------------------------------

def extract_video_info(video_url: str) -> dict[str, Any] | None:
"""Extract metadata + subtitles for a single video."""
"""Extract metadata + subtitles for a single video.

Uses Brave browser cookies to bypass YouTube IP bans on subtitle requests.
Uses process=False to skip format selection (avoids 'format not available' errors).
"""
opts = {
"skip_download": True,
"writesubtitles": True,
"writeautomaticsub": True,
"subtitleslangs": ["en", "en-orig", "en.*"],
"quiet": True,
"no_warnings": True,
"cookiesfrombrowser": ("brave",),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Hardcoded cookiesfrombrowser: ("brave",) silently breaks all yt-dlp operations on non-Brave systems.

Every yt-dlp code path — extract_video_info, get_transcript_via_download, list_channel_videos, and list_playlist_videos — hardcodes Brave. yt-dlp supports multiple browsers including chrome, chromium, edge, firefox, opera, safari, vivaldi, and whale, but a system without Brave installed will raise an error during cookie extraction. The downstream effects are:

  • extract_video_info → catches exception → returns Nonevideo_id = "unknown", no metadata
  • get_transcript_via_download → catches exception → returns None (expected fallback)
  • list_channel_videos / list_playlist_videos → catches exception → returns [], causing the entire channel/playlist batch run to process zero videos with no actionable error

The last point is the most severe: the tool appears to succeed but indexes nothing when run against a channel/playlist on a machine without Brave.

🔧 Proposed fix — make Brave optional with graceful fallback

Extract cookie opts into a helper and fall back to no cookies:

+def _cookie_opts() -> dict:
+    """Returns cookiesfrombrowser opts if Brave is available, else empty dict."""
+    import shutil
+    # Quick heuristic: check if a brave binary or profile dir exists
+    # yt-dlp will raise if the profile is missing; we catch that upstream.
+    # Returning the opt unconditionally is also fine since all callers already
+    # have try/except — this comment documents the intentional env-specificity.
+    return {"cookiesfrombrowser": ("brave",)}

Or alternatively, expose a --cookiesfrombrowser CLI flag so the caller can choose their browser:

+parser.add_argument(
+    "--cookies-from-browser",
+    metavar="BROWSER",
+    default=None,
+    help="Browser to extract cookies from (brave/chrome/firefox/etc). Default: none.",
+)

and pass {"cookiesfrombrowser": (args.cookies_from_browser,)} if args.cookies_from_browser else {} to yt-dlp opts.

Also applies to: 144-144, 320-320, 347-347

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/index_youtube.py` at line 115, The hardcoded cookiesfrombrowser
option ("brave") causes failures on systems without Brave; update
scripts/index_youtube.py so cookie options are optional and gracefully fall back
to no cookies: extract the cookie option into a helper (e.g., build_cookie_opts
or get_cookie_opts) and return {"cookiesfrombrowser": (browser,)} only when a
browser is configured/available (or when a new CLI flag
args.cookies_from_browser is set), otherwise return {}; then use that helper in
extract_video_info, get_transcript_via_download, list_channel_videos and
list_playlist_videos so yt-dlp calls get an empty options dict instead of always
forcing "brave".

}
try:
with yt_dlp.YoutubeDL(opts) as ydl:
return ydl.extract_info(video_url, download=False)
return ydl.extract_info(video_url, download=False, process=False)
except Exception as e:
log.warning(f"Failed to extract {video_url}: {e}")
return None
Comment on lines 102 to 122
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, check the repository structure and find yt-dlp integration
find . -type f -name "*.py" | grep -E "(youtube|yt_dlp)" | head -20

Repository: EtanHey/brainlayer

Length of output: 88


🏁 Script executed:

# Read the full context of scripts/index_youtube.py to understand how extract_video_info is used
wc -l scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 90


🏁 Script executed:

# Look at the get_transcript_from_info function at line 543
sed -n '540,570p' scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 1018


🏁 Script executed:

# Check if yt-dlp is vendored or external
find . -type d -name "yt_dlp" | head -5

Repository: EtanHey/brainlayer

Length of output: 44


🏁 Script executed:

# Look for requirements or dependencies
find . -type f -name "requirements*.txt" -o -name "setup.py" -o -name "pyproject.toml" | xargs grep -l "yt-dlp" 2>/dev/null

Repository: EtanHey/brainlayer

Length of output: 78


🏁 Script executed:

# Check pyproject.toml for yt-dlp dependency
cat pyproject.toml | grep -A5 -B5 "yt-dlp"

Repository: EtanHey/brainlayer

Length of output: 568


🏁 Script executed:

# Look at the full get_transcript_from_info function
sed -n '543,600p' scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 1917


🏁 Script executed:

# Check how extract_video_info is called and what info is used
grep -n "extract_video_info" scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 175


🏁 Script executed:

# Get more context around the function at lines 102-122 and 543
sed -n '95,130p' scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 1342


🏁 Script executed:

# Find the get_transcript_from_info function definition
grep -n "def get_transcript_from_info" scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 128


🏁 Script executed:

# Get the full implementation of get_transcript_from_info
sed -n '470,510p' scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 1493


🏁 Script executed:

# Check if there are any other references or documentation about process=False
grep -n "process" scripts/index_youtube.py | head -20

Repository: EtanHey/brainlayer

Length of output: 301


🏁 Script executed:

# Look at the imports to understand what yt_dlp is being used
head -50 scripts/index_youtube.py | grep -E "^import|^from"

Repository: EtanHey/brainlayer

Length of output: 506


🏁 Script executed:

# Get the full implementation of get_transcript_from_info
sed -n '170,220p' scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 1992


🏁 Script executed:

# Check how the info dict is used in the entire flow
grep -n "info\[" scripts/index_youtube.py | head -20

Repository: EtanHey/brainlayer

Length of output: 44


🏁 Script executed:

# Search for how extract_info result is used elsewhere
grep -n "\.extract_info" scripts/index_youtube.py -A5

Repository: EtanHey/brainlayer

Length of output: 779


🏁 Script executed:

# Check the full context of the index_video function where extract_video_info is called
sed -n '505,560p' scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 1877


🌐 Web query:

yt-dlp process=False subtitles automatic_captions raw extractor output

💡 Result:

In the Python API, YoutubeDL.extract_info(..., process=False) returns the raw extractor result (i.e., it does not “resolve” / post-process URL references, playlist items, etc.). yt-dlp’s own docstring describes process as: “resolve all unresolved references (URLs, playlist items)” and notes it “must be True for download to work.” [1]

What you get for subtitles / automatic captions in “raw extractor output”

Even with process=False, extractors commonly include these keys in the returned dict:

  • info_dict["subtitles"]: human-provided subtitles (if any)
  • info_dict["automatic_captions"]: auto-generated captions (if any)

These are typically shaped like:

{
  "en": [{"ext": "vtt", "url": "...", ...}, {"ext": "ttml", "url": "...", ...}],
  "ja": [...],
}

Why process=False can change what you see/receive

Subtitle selection/merging (e.g., honoring writesubtitles, writeautomaticsub, subtitleslangs, regex selection, etc.) is performed by yt-dlp’s subtitle processing logic (it merges normal subs + auto subs depending on params, then filters to requested languages). [2] If you’re trying to see exactly what the extractor surfaced, process=False is appropriate; if you want yt-dlp to apply your subtitle options, use the default process=True.

Minimal example: “raw extractor output” for subs/captions

import yt_dlp

with yt_dlp.YoutubeDL({"skip_download": True}) as ydl:
    info = ydl.extract_info(URL, download=False, process=False)
    subs = info.get("subtitles", {})
    autos = info.get("automatic_captions", {})

(Also note yt-dlp itself uses process=False internally when resolving some url_transparent results, specifically to avoid resolving/processing at that step.) [3]

Sources: [1] [3] [2]


🌐 Web query:

yt-dlp YouTube extractor subtitles automatic_captions raw output without process

💡 Result:

To get YouTube auto-captions in a “raw” (YouTube-native) format with yt-dlp (i.e., download as-served, without converting/post-processing), use an auto-sub format like json3 or srv3, and do not use --convert-subs.

1) Download raw auto-captions file (no video)

yt-dlp --skip-download --write-auto-subs --sub-lang en --sub-format json3 URL

--write-auto-subs downloads auto-generated captions, and YouTube auto-captions commonly support vtt, ttml, srv3, srv2, srv1, json3 formats (use json3/srv3 for the most “raw” feel). [1]

2) See what subtitle formats/languages are available (incl. auto)

yt-dlp --list-subs URL

This prints available subtitles/automatic captions and their formats. [2]

3) Get the direct “timedtext” URL (so you can fetch it yourself)

If you want the extractor’s direct caption URL (and then you can curl/wget it), dump metadata and look at automatic_captions:

yt-dlp --dump-json URL

The JSON includes an automatic_captions object when available. [3]

Sources:
[1] yt-dlp issue showing available automatic caption formats (incl. json3, srv3) yt-dlp/yt-dlp#4090
[2] yt-dlp README (subtitle options incl. --list-subs, --write-auto-subs) https://github.com/yt-dlp/yt-dlp
[3] yt-dlp issue discussing --dump-json and automatic_captions field yt-dlp/yt-dlp#5414


🏁 Script executed:

# Check if there's any error handling or logging about missing subtitles
grep -n "subtitles\|automatic_captions" scripts/index_youtube.py

Repository: EtanHey/brainlayer

Length of output: 535


With process=False, the subtitle options set in opts are not actually applied, so get_transcript_from_info() must perform the language selection manually for the code to work.

With process=False, yt-dlp skips post-processing including subtitle filtering/selection logic. The options writesubtitles, writeautomaticsub, and subtitleslangs won't filter the output. However, the subtitles and automatic_captions fields are typically included in the raw extractor output. The current implementation handles this by manually iterating through language variants (en, en-orig, en-US, en-GB) in get_transcript_from_info(), which is appropriate. The process=False usage is reasonable since it avoids format selection errors, but the subtitle options set in the opts dict are superfluous and could be removed for clarity.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/index_youtube.py` around lines 102 - 122, The opts dict in
extract_video_info contains subtitle-related keys that are ignored when using
process=False; remove writesubtitles, writeautomaticsub, and subtitleslangs from
opts to avoid confusion and keep only effective options (e.g., skip_download,
quiet, no_warnings, cookiesfrombrowser), while preserving process=False in
ydl.extract_info and relying on the existing language-selection logic in
get_transcript_from_info to pick subtitles from the raw extractor fields.

Expand All @@ -91,6 +141,7 @@ def get_transcript_via_download(video_url: str) -> list[dict] | None:
"outtmpl": f"{tmpdir}/%(id)s.%(ext)s",
"quiet": True,
"no_warnings": True,
"cookiesfrombrowser": ("brave",),
}
try:
with yt_dlp.YoutubeDL(opts) as ydl:
Expand Down Expand Up @@ -266,6 +317,7 @@ def list_channel_videos(channel_id: str) -> list[dict]:
"extract_flat": True,
"quiet": True,
"no_warnings": True,
"cookiesfrombrowser": ("brave",),
}
try:
with yt_dlp.YoutubeDL(opts) as ydl:
Expand All @@ -292,6 +344,7 @@ def list_playlist_videos(playlist_id: str) -> list[dict]:
"extract_flat": True,
"quiet": True,
"no_warnings": True,
"cookiesfrombrowser": ("brave",),
}
try:
with yt_dlp.YoutubeDL(opts) as ydl:
Expand Down Expand Up @@ -481,9 +534,11 @@ def index_single_video(
log.info(f" Title: {title}")
log.info(f" Chapters: {len(chapters)}")

# Get transcript — try download method first (avoids rate limits),
# fall back to URL-based extraction from info dict
segments = get_transcript_via_download(video_url)
# Get transcript — try youtube-transcript-api first (different endpoint,
# avoids yt-dlp 429s), then yt-dlp download, then URL-based extraction
segments = get_transcript_via_api(video_id)
if not segments:
segments = get_transcript_via_download(video_url)
if not segments:
segments = get_transcript_from_info(info)
if not segments:
Expand Down Expand Up @@ -522,6 +577,11 @@ def index_single_video(
if upload_date:
meta["upload_date"] = upload_date

# Convert upload_date (YYYYMMDD) to ISO timestamp for created_at
created_at = None
if upload_date and len(upload_date) == 8:
created_at = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}T00:00:00+00:00"

chunk_data.append({
"id": f"{source_file}:{i}",
"content": c.content,
Expand All @@ -534,6 +594,7 @@ def index_single_video(
"source": "youtube",
"conversation_id": source_file,
"position": i,
"created_at": created_at,
})
embeddings.append(ec.embedding)

Expand All @@ -555,7 +616,7 @@ def main():
parser.add_argument("--channel", help="YouTube channel ID (index all videos)")
parser.add_argument("--playlist", help="YouTube playlist ID")
parser.add_argument("--project", default="huberman", help="BrainLayer project name")
parser.add_argument("--db", type=Path, default=DEFAULT_DB, help="BrainLayer DB path")
parser.add_argument("--db", type=Path, default=DEFAULT_DB_PATH, help="BrainLayer DB path")
parser.add_argument("--dry-run", action="store_true", help="Show what would be indexed")
parser.add_argument("--resume", action="store_true", help="Skip already-indexed videos")
parser.add_argument("--delay", type=float, default=DELAY_BETWEEN_VIDEOS,
Expand Down