### 1. Build yt_dlp options

```python
# Options handed to yt_dlp.YoutubeDL: extract metadata and subtitle info only.
ydl_opts = {
    "quiet": True,                # suppress console output
    "skip_download": True,        # don't download the video, just extract metadata/subtitles
    "writeautomaticsub": True,    # extract auto-generated subtitles (if available)
    "writesubtitles": True,       # extract manually uploaded subtitles
    "subtitleslangs": ["ar"],     # only fetch Arabic subtitles
    "js_runtimes": {
        "node": {}  # tells yt-dlp to use Node.js for executing JavaScript
    },
    "remote_components": {
        "ejs:github": {}  # enables the remote EJS component to solve YouTube JS challenges
    },
    "ignoreerrors": True,  # NOTE(review): presumably makes yt-dlp log extraction errors and carry on instead of raising - confirm in yt-dlp docs
}


```

### 2. Extract video metadata

```python
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info(url, download=False)
```

This returns a **big info dict** containing:

* title
* channel name
* upload date
* tags
* description
* **automatic_captions → Arabic subtitles list**

`automatic_captions` maps language keys (`"ar-orig"`, `"ar"`, and many other languages) to a list of caption tracks, each offered in formats such as `json3`, `srv1`, `srv2`, `srt`, and `vtt`.



## The `extract_arabic_transcript` function
### **3. Extract Arabic transcript**

```python
def extract_arabic_transcript(captions_dict, ydl):
    """Return the Arabic transcript found in *captions_dict*, or "" if none.

    Simplified walkthrough version: only the ``ar-orig`` and ``ar`` entries
    are considered, and only their ``json3`` track is read.

    Args:
        captions_dict: Mapping of language key to a list of caption-track
            dicts; each track is inspected through its ``"ext"`` and
            ``"url"`` keys (presumably ``info["automatic_captions"]`` -
            confirm against the caller).
        ydl: Object whose ``urlopen(url).read()`` returns the track bytes
            (a ``yt_dlp.YoutubeDL`` instance in this notebook).

    Returns:
        Every ``utf8`` segment, stripped and joined with single spaces, taken
        from the first ``json3`` track found; "" when no preferred key offers
        one. Download and JSON errors are not caught here.
    """
    # Priority: ar-orig > ar
    preferred_keys = ["ar-orig", "ar"]

    for key in preferred_keys:
        if key in captions_dict:
            # Prefer JSON3 for easiest text extraction
            for item in captions_dict[key]:
                if item.get("ext") == "json3":
                    subtitle_url = item["url"]
                    # Fetch the track through yt-dlp's own opener and parse it as JSON
                    data = ydl.urlopen(subtitle_url).read().decode("utf-8")
                    json_data = json.loads(data)

                    # Extract all text segments
                    parts = []
                    for ev in json_data.get("events", []):
                        for seg in ev.get("segs", []):
                            if "utf8" in seg:
                                parts.append(seg["utf8"].strip())

                    # The first json3 track wins; later tracks are never looked at
                    return " ".join(parts).strip()
    return ""

```

**Step A: Pick the right caption source**

```python
preferred_keys = ["ar-orig", "ar"]
```

* We prefer **ar-orig** (original Arabic ASR)
* If unavailable, fall back to **ar** (Arabic auto-translated)


**Step B: Choose the best format → `json3`**

```python
if item.get("ext") == "json3":
```

Why json3?

* It contains **clearly structured text**
* Easy to extract text segments
* No timestamp formatting issues like VTT or SRT

**Step C: Download subtitle**

```python
subtitle_url = item["url"]
data = ydl.urlopen(subtitle_url).read().decode("utf-8")
json_data = json.loads(data)
```

This downloads the auto-generated subtitles **directly** from YouTube's timedtext API.

**Step D: Extract the words**

YouTube stores ASR text like this:

```json
"events": [
  { "segs": [{"utf8": "مرحبا"}] },
  { "segs": [{"utf8": "كيف"} , {"utf8": "حالكم"}] }
]
```

The code:

```python
for ev in json_data.get("events", []):
    for seg in ev.get("segs", []):
        if "utf8" in seg:
            parts.append(seg["utf8"].strip())
```

This pulls all `"utf8"` text segments and collects them into a list.

**Step E: Combine all segments**

```python
return " ".join(parts).strip()
```

Joining the segments with single spaces gives you **one clean Arabic transcript** with no timestamps.


### 4. Build the final JSON output

```python
{
    "video_id": info.get("id"),
    "title": info.get("title"),
    "channel": info.get("channel"),
    "published_at": info.get("upload_date"),
    "tags": info.get("tags") or [],
    "description": info.get("description"),
    "transcript": transcript
}
```


## Putting it all together


In [1]:
# !pip install --upgrade "yt-dlp[default]"


Collecting yt-dlp[default]
  Downloading yt_dlp-2025.11.12-py3-none-any.whl.metadata (180 kB)
[?25l     [90mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ[0m [32m0.0/180.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ[0m[91mโธ[0m[90mโ[0m [32m174.1/180.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:01[0m[2K     [90mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ[0m [32m180.0/180.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting mutagen (from yt-dlp[default])
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Collecting yt-dlp-ejs==0.3.1 (from yt-dlp[default])
  Downloading yt_dlp_ejs-0.3.1-py3-none-any.whl.metadata (2.7 kB)
Downloading yt_dlp_ejs-0.3.1-py3-none-any.whl (53 kB)
[2K   [90mโโโ

In [None]:
import yt_dlp
import json
import gzip

import time
import random
import gzip
import json

def safe_fetch_url(ydl, url, retries=5):
    """Fetch the raw bytes at *url*, retrying with exponential backoff.

    Args:
        ydl: Object exposing ``urlopen(url)``; the returned response must
            provide ``read()`` and ``close()`` (a ``yt_dlp.YoutubeDL``
            instance in this notebook).
        url: Subtitle URL to download.
        retries: Maximum number of attempts.

    Returns:
        The response body as ``bytes``, or ``None`` once every attempt failed.
    """
    wait = 1
    for attempt in range(1, retries + 1):
        try:
            response = ydl.urlopen(url)
            try:
                return response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception as e:  # best-effort boundary: any failure means "retry"
            print(f"[RETRY] Failed to fetch subtitle ({attempt}/{retries}) - {e}")

        if attempt < retries:
            # Sleep with jitter to avoid patterns; skipped after the last
            # attempt because nothing follows it.
            time.sleep(wait + random.uniform(0.2, 1.0))
            wait *= 2  # exponential backoff

    print(f"[ERROR] Giving up on subtitle URL: {url}")
    return None


def _matching_caption_keys(captions_dict, preferred_keys):
    """Yield keys of *captions_dict* in preference order, each at most once."""
    seen = set()
    for pref in preferred_keys:
        is_prefix = pref.endswith("*")
        for key in captions_dict:
            matched = key.startswith(pref[:-1]) if is_prefix else key == pref
            if matched and key not in seen:
                seen.add(key)
                yield key


def _json3_to_text(raw):
    """Turn a json3 caption payload (optionally gzip-compressed) into text."""
    if raw[:2] == b"\x1f\x8b":  # gzip magic number
        raw = gzip.decompress(raw)

    json_data = json.loads(raw.decode("utf-8"))
    if not isinstance(json_data, dict):
        return ""

    parts = []
    for ev in json_data.get("events") or []:
        for seg in ev.get("segs") or []:
            txt = (seg.get("utf8") or "").strip()
            if txt:  # drop whitespace-only segments so no double spaces appear
                parts.append(txt)
    return " ".join(parts)


def extract_arabic_transcript(info, ydl):
    """
    Extract the Arabic transcript from a video's subtitle metadata.

    Manually uploaded subtitles (``info["subtitles"]``) are tried before
    auto-generated ones (``info["automatic_captions"]``). Within each source
    the language keys are tried in the order ``ar-orig``, ``ar``, then any
    key starting with ``ar-``; only ``json3`` tracks are read.

    Args:
        info: Info dict returned by ``extract_info``; ``None`` or an empty
            dict yields "".
        ydl: Object passed through to ``safe_fetch_url`` for downloading.

    Returns:
        The first non-empty transcript found (segments joined by single
        spaces), or "" when no track could be fetched and parsed.
    """
    import zlib  # local import: only needed to recognise corrupt gzip data

    if not info:
        return ""

    preferred_keys = ["ar-orig", "ar", "ar-*"]

    for source in ("subtitles", "automatic_captions"):
        captions_dict = info.get(source) or {}

        for key in _matching_caption_keys(captions_dict, preferred_keys):
            for item in captions_dict[key] or []:
                if item.get("ext") != "json3":
                    continue

                url = item.get("url")
                if not url:
                    continue

                # Safe fetching (with retries)
                raw = safe_fetch_url(ydl, url)
                if raw is None:
                    continue  # skip this caption

                try:
                    text = _json3_to_text(raw)
                except (OSError, EOFError, zlib.error, ValueError) as e:
                    print(f"[ERROR] Failed to parse JSON3 subtitle: {url} - {e}")
                    continue

                if text:
                    return text
                # An empty track is not an answer; keep looking.

    return ""  # No transcript found


def get_video_data(url):
    """Collect metadata and the Arabic transcript for one YouTube video.

    Args:
        url: URL of the video.

    Returns:
        A dict with the keys ``video_id``, ``title``, ``channel``,
        ``published_at``, ``tags``, ``description`` and ``transcript``
        (``transcript`` is "" when no Arabic captions were found), or
        ``None`` when the metadata could not be extracted.
    """
    # Random sleep to reduce the scraping signature
    time.sleep(random.uniform(1.0, 2.5))

    ydl_opts = {
        "quiet": True,
        "skip_download": True,
        "writeautomaticsub": True,
        "writesubtitles": True,
        # Entries are regular expressions, so the wildcard is "ar-.*"
        # (the glob-style "ar-*" would only match "ar", "ar-", "ar--", ...).
        "subtitleslangs": ["ar", "ar-orig", "ar-.*"],
        "js_runtimes": {"node": {}},
        "remote_components": {"ejs:github": {}},
        "ignoreerrors": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(url, download=False)
        except Exception as e:
            print(f"[ERROR] Failed to extract metadata for: {url} - {e}")
            return None

        # With "ignoreerrors" enabled, yt-dlp swallows extraction errors and
        # hands back None instead of raising, so that case is checked too.
        if not info:
            print(f"[ERROR] No metadata returned for: {url}")
            return None

        transcript = extract_arabic_transcript(info, ydl)

        if not transcript:
            print(f"[WARNING] No Arabic transcript for: {url}")

        return {
            "video_id": info.get("id"),
            "title": info.get("title"),
            "channel": info.get("channel"),
            "published_at": info.get("upload_date"),
            "tags": info.get("tags") or [],
            "description": info.get("description"),
            "transcript": transcript,
        }

# Example usage: fetch a single video and pretty-print its record
# (ensure_ascii=False keeps the Arabic text readable).
url = "https://www.youtube.com/watch?v=ivs3FoqU3b4"
data = get_video_data(url)
rendered = json.dumps(data, indent=2, ensure_ascii=False)
print(rendered)


{
  "video_id": "ivs3FoqU3b4",
  "title": "ุนูููุง : ุชุฌุฑุจุฉ ุฃูุถู ุชูุน ุชุฑุจุฉ ููุฒุฑุงุนุฉ ุงูููุฒููุฉ - ูุง ุงูุฃูุถู ููุฅูุจุงุช ุ ููุง ุงูุฃูุถู ููุฒุฑุงุนุฉ ูุงูุฅูุชุงุฌ ุ",
  "channel": "ุจุงููุฎุชุตุฑ ุงููููุฏ - ูุณุนุฏ ูุตุงุฑ",
  "published_at": "20250407",
  "tags": [],
  "description": "ููุถุญ ุงูููุฏูู ุฃูุถู ุฃููุงุน ููุชุฑุจุฉ ุงูุฒุฑุงุนูุฉ ุงูููุฒููุฉ ุจุญูุซ ุชููู ุฌูุฏุฉ ุงูุตุฑู ูุบููุฉ ุจุงูุนูุงุตุฑ ุงูุบุฐุงุฆูุฉ ูุงููุนุงุฏู ุงููุงุฒูุฉ ูุงุญุชูุงุฌุงุช ุงููุจุงุช ููุง ูู ุฃูุถู ุชุฑุจุฉ ุตุงูุญุฉ ููุงุณุชูุงุจ ููุง ุงูุชุฑุจุฉ ุงูุฌูุฏุฉ ููุฒุฑุงุนุฉ ุงูููุฒููุฉ\nููููุงุช ุงูุชุฑุจุฉ ุงูุฌูุฏุฉ\n-----------------------------\nูก- ุชุฑุจุฉ ูุญููุฉ\nูข- ุชุฑุจุฉ ุฑูููุฉ\nูฃ- ููุฑูู ูููุจุณุช\nูค- ูููุจุณุช ุญููุงูู ุฃู ูุจุงุชู\nูฅ- ุฃูุฑุงู ุดุฌุฑ ุฌุงูุฉ\nูฆ- ูุญู ูุฌุฑูุด\nูง- ุชุฑุจุฉ ุตุฎุฑูุฉ ( ุญุตู )\n#ุจ

# batch collect

In [3]:
# Step 1: Load URLs from file (one URL per line)
with open("video_urls.txt", "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    # Skip blank lines (e.g. a trailing newline) so that no empty "URL"
    # is handed to the scraper later on.
    video_urls = [candidate for candidate in stripped_lines if candidate]

print(len(video_urls))


50


In [4]:
from tqdm import tqdm

# Step 2: Loop through URLs and save output (one JSON record per line)
saved_count = 0
with open("videos_data.jsonl", "w", encoding="utf-8") as out_file:
    for url in tqdm(video_urls, desc="Processing videos"):
        try:
            data = get_video_data(url)
        except Exception as e:
            # One broken video must not abort the batch: the output file is
            # already open for writing and would be left half-filled.
            print(f"[ERROR] Unexpected failure for: {url} - {e}")
            continue

        if data is None:
            continue  # error already logged

        out_file.write(json.dumps(data, ensure_ascii=False) + "\n")
        saved_count += 1

# Report the records actually written, not the number of input URLs.
print(f"Saved {saved_count} videos.")


Processing videos:  78%|โโโโโโโโ  | 39/50 [03:25<00:54,  4.98s/it]



Processing videos: 100%|โโโโโโโโโโ| 50/50 [04:24<00:00,  5.30s/it]

Saved 50 videos.





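Arabic text often contains the Right-to-Left Mark (U+200F). It's invisible, but it can appear in your JSON/strings when saving or printing. It doesn't break JSON, but it can be annoying, so the next cell strips it together with the other zero-width and bidirectional control characters (U+200B–U+200F and U+202A–U+202E).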

In [4]:
import json
import re

def clean_text(text):
    """Strip zero-width and bidirectional control characters from *text*.

    Code points U+200B-U+200F and U+202A-U+202E are removed, then the
    surrounding whitespace is trimmed. Falsy input (None, "") gives "".
    """
    if not text:
        return ""

    invisible_marks = r'[\u200B-\u200F\u202A-\u202E]'
    without_marks = re.sub(invisible_marks, '', text)
    return without_marks.strip()

cleaned_videos = []

# Pass 1: read each record, strip invisible marks, attach a word count.
with open("videos_data.jsonl", "r", encoding="utf-8") as src:
    for raw_line in src:
        record_json = clean_text(raw_line)
        if record_json:
            record = json.loads(record_json)
            words = record.get("transcript", "").split()
            record["word_count"] = len(words)
            cleaned_videos.append(record)

# Pass 2: rewrite the same JSONL file (safe: it was fully read and closed above).
with open("videos_data.jsonl", "w", encoding="utf-8") as dst:
    dst.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in cleaned_videos
    )

print(f"Overwritten videos_data.jsonl with {len(cleaned_videos)} cleaned videos")


Overwritten videos_data.jsonl with 50 cleaned videos


In [5]:
# Report the video whose transcript contains the most words.
if not cleaned_videos:
    print("No videos found.")
else:
    # Records without a word_count are ranked as 0.
    max_video = max(cleaned_videos, key=lambda x: x.get("word_count", 0))
    report_lines = [
        "Video with largest transcript:",
        f"Title       : {max_video.get('title')}",
        f"URL         : https://www.youtube.com/watch?v={max_video.get('video_id')}",
        f"Word count  : {max_video.get('word_count')}",
    ]
    print("\n".join(report_lines))


Video with largest transcript:
Title       : ุฏูุฑุฉ ูู ุงูุฒุฑุงุนุฉ ุงูููุฒููุฉ ( ุญููุฉ ูุฌูุนุฉ )
URL         : https://www.youtube.com/watch?v=ebnFGdUhjcA
Word count  : 6114
