-
Notifications
You must be signed in to change notification settings - Fork 7
feat(youtube): transcript-api v1.2.4 + Brave cookies #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,11 +28,13 @@ | |
| from typing import Any | ||
|
|
||
| import yt_dlp | ||
| from youtube_transcript_api import YouTubeTranscriptApi | ||
|
|
||
| # Add brainlayer to path | ||
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src")) | ||
|
|
||
| from brainlayer.embeddings import embed_chunks | ||
| from brainlayer.paths import DEFAULT_DB_PATH | ||
| from brainlayer.pipeline.chunk import Chunk | ||
| from brainlayer.pipeline.classify import ContentType, ContentValue | ||
| from brainlayer.vector_store import VectorStore | ||
|
|
@@ -44,29 +46,77 @@ | |
| ) | ||
| log = logging.getLogger(__name__) | ||
|
|
||
| DEFAULT_DB = Path.home() / ".local" / "share" / "brainlayer" / "brainlayer.db" | ||
| DELAY_BETWEEN_VIDEOS = 10 # seconds, to avoid rate limiting | ||
| TARGET_CHUNK_CHARS = 1200 # ~300-400 tokens for bge-large (512 token limit) | ||
| CHUNK_OVERLAP_CHARS = 200 | ||
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Transcript extraction via yt-dlp | ||
| # Transcript extraction via youtube-transcript-api (primary — avoids yt-dlp 429s) | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| _yt_api = YouTubeTranscriptApi() # Reuse single instance across calls | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The PR explicitly adds `certifi` as a dependency, but the primary API path never uses it. The library supports passing a custom `http_client` session whose `verify` can point at certifi's CA bundle. 🛡️ Proposed fix to wire certifi into the primary API path: -_yt_api = YouTubeTranscriptApi() # Reuse single instance across calls
+try:
+ import certifi
+ from requests import Session as _Session
+ _http = _Session()
+ _http.verify = certifi.where()
+ _yt_api = YouTubeTranscriptApi(http_client=_http)
+except ImportError:
+ _yt_api = YouTubeTranscriptApi()🤖 Prompt for AI Agents |
||
|
|
||
|
|
||
| def get_transcript_via_api(video_id: str) -> list[dict] | None: | ||
| """Fetch transcript using youtube-transcript-api (v1.2.4+). | ||
|
|
||
| Uses a different internal YouTube endpoint than yt-dlp, | ||
| so it often works when yt-dlp subtitle downloads are 429'd. | ||
| Returns list of {"text": str, "start": float, "duration": float}. | ||
| """ | ||
| try: | ||
| transcript_list = _yt_api.list(video_id) | ||
| transcript = None | ||
| try: | ||
| transcript = transcript_list.find_manually_created_transcript(["en"]) | ||
| except Exception: | ||
| try: | ||
| transcript = transcript_list.find_generated_transcript(["en"]) | ||
| except Exception: | ||
| pass | ||
|
Comment on lines
+68
to
+77
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Bare `except Exception` handlers here swallow unrelated errors. If only "transcript not found" should be tolerated, catch the library's specific exception. Additionally, since the nested try/except only selects a transcript, it can be simplified. ♻️ Proposed refactor: +from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound
def get_transcript_via_api(video_id: str) -> list[dict] | None:
try:
- transcript_list = _yt_api.list(video_id)
- transcript = None
- try:
- transcript = transcript_list.find_manually_created_transcript(["en"])
- except Exception:
- try:
- transcript = transcript_list.find_generated_transcript(["en"])
- except Exception:
- pass
-
- if not transcript:
- return None
-
- fetched = transcript.fetch()
+ fetched = _yt_api.fetch(video_id, languages=["en"])
segments = []
for entry in fetched:If you need to distinguish manual vs. generated for logging, use + try:
+ transcript = transcript_list.find_manually_created_transcript(["en"])
+ except NoTranscriptFound:
+ transcript = transcript_list.find_generated_transcript(["en"])🤖 Prompt for AI Agents |
||
|
|
||
| if not transcript: | ||
| return None | ||
|
|
||
| fetched = transcript.fetch() | ||
| segments = [] | ||
| for entry in fetched: | ||
| text = entry.text.strip() | ||
| if text and text not in ("[Music]", "[Applause]", "[Laughter]"): | ||
| segments.append({ | ||
| "text": text, | ||
| "start": entry.start, | ||
| "duration": entry.duration, | ||
| }) | ||
| return segments if segments else None | ||
| except Exception as e: | ||
| log.warning(f" youtube-transcript-api failed: {e}") | ||
| return None | ||
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Transcript extraction via yt-dlp (fallback) | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| def extract_video_info(video_url: str) -> dict[str, Any] | None: | ||
| """Extract metadata + subtitles for a single video.""" | ||
| """Extract metadata + subtitles for a single video. | ||
|
|
||
| Uses Brave browser cookies to bypass YouTube IP bans on subtitle requests. | ||
| Uses process=False to skip format selection (avoids 'format not available' errors). | ||
| """ | ||
| opts = { | ||
| "skip_download": True, | ||
| "writesubtitles": True, | ||
| "writeautomaticsub": True, | ||
| "subtitleslangs": ["en", "en-orig", "en.*"], | ||
| "quiet": True, | ||
| "no_warnings": True, | ||
| "cookiesfrombrowser": ("brave",), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hardcoded `cookiesfrombrowser: ("brave",)` breaks on machines without Brave. Every yt-dlp code path —
The last point is the most severe: the tool appears to succeed but indexes nothing when run against a channel/playlist on a machine without Brave. 🔧 Proposed fix — make Brave optional with graceful fallback. Extract cookie opts into a helper and fall back to no cookies: +def _cookie_opts() -> dict:
+ """Returns cookiesfrombrowser opts if Brave is available, else empty dict."""
+ import shutil
+ # Quick heuristic: check if a brave binary or profile dir exists
+ # yt-dlp will raise if the profile is missing; we catch that upstream.
+ # Returning the opt unconditionally is also fine since all callers already
+ # have try/except — this comment documents the intentional env-specificity.
+ return {"cookiesfrombrowser": ("brave",)}Or alternatively, expose a +parser.add_argument(
+ "--cookies-from-browser",
+ metavar="BROWSER",
+ default=None,
+ help="Browser to extract cookies from (brave/chrome/firefox/etc). Default: none.",
+)and pass Also applies to: 144-144, 320-320, 347-347 🤖 Prompt for AI Agents |
||
| } | ||
| try: | ||
| with yt_dlp.YoutubeDL(opts) as ydl: | ||
| return ydl.extract_info(video_url, download=False) | ||
| return ydl.extract_info(video_url, download=False, process=False) | ||
| except Exception as e: | ||
| log.warning(f"Failed to extract {video_url}: {e}") | ||
| return None | ||
|
Comment on lines
102
to
122
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: # First, check the repository structure and find yt-dlp integration
find . -type f -name "*.py" | grep -E "(youtube|yt_dlp)" | head -20Repository: EtanHey/brainlayer Length of output: 88 🏁 Script executed: # Read the full context of scripts/index_youtube.py to understand how extract_video_info is used
wc -l scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 90 🏁 Script executed: # Look at the get_transcript_from_info function at line 543
sed -n '540,570p' scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 1018 🏁 Script executed: # Check if yt-dlp is vendored or external
find . -type d -name "yt_dlp" | head -5Repository: EtanHey/brainlayer Length of output: 44 🏁 Script executed: # Look for requirements or dependencies
find . -type f -name "requirements*.txt" -o -name "setup.py" -o -name "pyproject.toml" | xargs grep -l "yt-dlp" 2>/dev/nullRepository: EtanHey/brainlayer Length of output: 78 🏁 Script executed: # Check pyproject.toml for yt-dlp dependency
cat pyproject.toml | grep -A5 -B5 "yt-dlp"Repository: EtanHey/brainlayer Length of output: 568 🏁 Script executed: # Look at the full get_transcript_from_info function
sed -n '543,600p' scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 1917 🏁 Script executed: # Check how extract_video_info is called and what info is used
grep -n "extract_video_info" scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 175 🏁 Script executed: # Get more context around the function at lines 102-122 and 543
sed -n '95,130p' scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 1342 🏁 Script executed: # Find the get_transcript_from_info function definition
grep -n "def get_transcript_from_info" scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 128 🏁 Script executed: # Get the full implementation of get_transcript_from_info
sed -n '470,510p' scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 1493 🏁 Script executed: # Check if there are any other references or documentation about process=False
grep -n "process" scripts/index_youtube.py | head -20Repository: EtanHey/brainlayer Length of output: 301 🏁 Script executed: # Look at the imports to understand what yt_dlp is being used
head -50 scripts/index_youtube.py | grep -E "^import|^from"Repository: EtanHey/brainlayer Length of output: 506 🏁 Script executed: # Get the full implementation of get_transcript_from_info
sed -n '170,220p' scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 1992 🏁 Script executed: # Check how the info dict is used in the entire flow
grep -n "info\[" scripts/index_youtube.py | head -20Repository: EtanHey/brainlayer Length of output: 44 🏁 Script executed: # Search for how extract_info result is used elsewhere
grep -n "\.extract_info" scripts/index_youtube.py -A5Repository: EtanHey/brainlayer Length of output: 779 🏁 Script executed: # Check the full context of the index_video function where extract_video_info is called
sed -n '505,560p' scripts/index_youtube.pyRepository: EtanHey/brainlayer Length of output: 1877 🌐 Web query:
💡 Result: In the Python API, `extract_info(..., process=False)` returns the raw, unprocessed extractor result. What you get for subtitles / automatic captions in "raw extractor output": Even with
These are typically shaped like: {
"en": [{"ext": "vtt", "url": "...", ...}, {"ext": "ttml", "url": "...", ...}],
"ja": [...],
}Why
|
||
|
|
@@ -91,6 +141,7 @@ def get_transcript_via_download(video_url: str) -> list[dict] | None: | |
| "outtmpl": f"{tmpdir}/%(id)s.%(ext)s", | ||
| "quiet": True, | ||
| "no_warnings": True, | ||
| "cookiesfrombrowser": ("brave",), | ||
| } | ||
| try: | ||
| with yt_dlp.YoutubeDL(opts) as ydl: | ||
|
|
@@ -266,6 +317,7 @@ def list_channel_videos(channel_id: str) -> list[dict]: | |
| "extract_flat": True, | ||
| "quiet": True, | ||
| "no_warnings": True, | ||
| "cookiesfrombrowser": ("brave",), | ||
| } | ||
| try: | ||
| with yt_dlp.YoutubeDL(opts) as ydl: | ||
|
|
@@ -292,6 +344,7 @@ def list_playlist_videos(playlist_id: str) -> list[dict]: | |
| "extract_flat": True, | ||
| "quiet": True, | ||
| "no_warnings": True, | ||
| "cookiesfrombrowser": ("brave",), | ||
| } | ||
| try: | ||
| with yt_dlp.YoutubeDL(opts) as ydl: | ||
|
|
@@ -481,9 +534,11 @@ def index_single_video( | |
| log.info(f" Title: {title}") | ||
| log.info(f" Chapters: {len(chapters)}") | ||
|
|
||
| # Get transcript — try download method first (avoids rate limits), | ||
| # fall back to URL-based extraction from info dict | ||
| segments = get_transcript_via_download(video_url) | ||
| # Get transcript — try youtube-transcript-api first (different endpoint, | ||
| # avoids yt-dlp 429s), then yt-dlp download, then URL-based extraction | ||
| segments = get_transcript_via_api(video_id) | ||
| if not segments: | ||
| segments = get_transcript_via_download(video_url) | ||
| if not segments: | ||
| segments = get_transcript_from_info(info) | ||
| if not segments: | ||
|
|
@@ -522,6 +577,11 @@ def index_single_video( | |
| if upload_date: | ||
| meta["upload_date"] = upload_date | ||
|
|
||
| # Convert upload_date (YYYYMMDD) to ISO timestamp for created_at | ||
| created_at = None | ||
| if upload_date and len(upload_date) == 8: | ||
| created_at = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}T00:00:00+00:00" | ||
|
|
||
| chunk_data.append({ | ||
| "id": f"{source_file}:{i}", | ||
| "content": c.content, | ||
|
|
@@ -534,6 +594,7 @@ def index_single_video( | |
| "source": "youtube", | ||
| "conversation_id": source_file, | ||
| "position": i, | ||
| "created_at": created_at, | ||
| }) | ||
| embeddings.append(ec.embedding) | ||
|
|
||
|
|
@@ -555,7 +616,7 @@ def main(): | |
| parser.add_argument("--channel", help="YouTube channel ID (index all videos)") | ||
| parser.add_argument("--playlist", help="YouTube playlist ID") | ||
| parser.add_argument("--project", default="huberman", help="BrainLayer project name") | ||
| parser.add_argument("--db", type=Path, default=DEFAULT_DB, help="BrainLayer DB path") | ||
| parser.add_argument("--db", type=Path, default=DEFAULT_DB_PATH, help="BrainLayer DB path") | ||
| parser.add_argument("--dry-run", action="store_true", help="Show what would be indexed") | ||
| parser.add_argument("--resume", action="store_true", help="Skip already-indexed videos") | ||
| parser.add_argument("--delay", type=float, default=DELAY_BETWEEN_VIDEOS, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧹 Nitpick | 🔵 Trivial
`certifi` lacks a version constraint. All other packages in this group specify a minimum version.
`certifi` without any bound can pull in any release, including future ones with breaking changes, reducing reproducibility. 📌 Proposed fix
📝 Committable suggestion
🤖 Prompt for AI Agents