## Necessary Imports

In [2]:
# Standard library
import os
import re
import json
from typing import List, Dict, Optional
from urllib.parse import urlparse

# Third-party
import pandas as pd
from tqdm import tqdm
from googleapiclient.discovery import build


In [3]:
from google.colab import userdata
API_KEY = userdata.get('YOUTUBE_API_KEY')
yt = build("youtube", "v3", developerKey=API_KEY, cache_discovery=False)

## Helper Functions


This function parses a list of channel identifiers and normalizes them into a canonical representation that can be safely passed to the YouTube Data API.

The pipeline **only supports identifiers that can be resolved deterministically using free, official API functionality**.

#### Supported input types
- **Channel ID**  
  Example: `UCxxxxxxxxxxxxxxxxxxxxxx`
  
- **Channel handle**  
  Example: `@Netflix`

- **YouTube URLs** that contain either:
  - a handle: `https://www.youtube.com/@Netflix`
  - a channel ID: `https://www.youtube.com/channel/UCxxxx...`

In [8]:
CHANNEL_ID_RE = re.compile(r"^UC[a-zA-Z0-9_-]{20,}$")

def normalize_channel_input(s: str) -> dict:
    """
    Returns one of:
      {"type": "channel_id", "value": "..."}
      {"type": "handle", "value": "@..."}
    """
    s = s.strip()

    # Raw channel ID, use the regex to identify
    if CHANNEL_ID_RE.match(s):
        return {"type": "channel_id", "value": s}

    # URL cases
    if s.startswith("https://"):
        path = urlparse(s).path.strip("/") # returns the path
        # /@Handle
        if path.startswith("@"):
            return {"type": "handle", "value": "@" + path[1:]}
        # /channel/UCxxxx
        if path.startswith("channel/"):
            cid = path.split("/", 1)[1]
            if CHANNEL_ID_RE.match(cid):
                return {"type": "channel_id", "value": cid}

    # Handle without @
    if s and not s.startswith("@") and not s.startswith("UC"):
        # assume it's a handle-like string
        return {"type": "handle", "value": "@" + s}

    # Handle with @
    if s.startswith("@"):
        return {"type": "handle", "value": s}

    raise ValueError(f"Unrecognized channel identifier: {s}")


Resolves a YouTube channel **handle** (e.g., `@ScreenRantPlus`) into YouTube assigned id object containing:

- `channel_id`
- `channel_title`
- `uploads_playlist_id`

This uses the official YouTube Data API `channels.list(forHandle=...)`.

In [7]:
def resolve_input(input: str, type: str) -> dict:
    """
    handle: '@ScreenRantPlus'
    Returns dict with channel_id, title, uploads_playlist_id
    """

    if type == "handle":
        resp = yt.channels().list(
            part="id,snippet,contentDetails",
            forHandle=input,
            maxResults=1
        ).execute()

        items = resp.get("items", [])
        if not items:
            return {"ok": False, "handle": input, "error": "handle_not_found"}

        ch = items[0]
        return {
            "ok": True,
            "input": handle,
            "channel_id": ch["id"],
            "channel_title": ch["snippet"]["title"],
            "uploads_playlist_id": ch["contentDetails"]["relatedPlaylists"]["uploads"],
        }
    elif type == "channel_id":
        resp = yt.channels().list(
            part="id,snippet,contentDetails",
            id=input,
            maxResults=1
        ).execute()
        items = resp.get("items", [])
        if not items:
            return {"ok": False, "input": input, "error": "channel_id_not_found"}
        ch = items[0]
        return {
            "ok": True,
            "input": input,
            "channel_id": ch["id"],
            "channel_title": ch["snippet"]["title"],
            "uploads_playlist_id": ch["contentDetails"]["relatedPlaylists"]["uploads"],
        }


Converts a list of channel identifiers (handles, channel IDs, or supported URLs) into a
list of channel records. Each record contains:

- `channel_id`
- `channel_title`
- `uploads_playlist_id`

This function relies on `normalize_channel_input` to determine whether the input is a
handle or a channel ID, and then resolves it accordingly.

In [20]:
def parse_channels(channels: list[str]) -> list[dict]:
    results = []
    for raw in channels:
        info = normalize_channel_input(raw)

        if info["type"] == "channel_id":
              results.append(resolve_input(info["value"], "channel_id"))

        elif info["type"] == "handle":
            results.append(resolve_input(info["value"], "handle"))

    return results


This function enumerates **video IDs only** from a channel’s uploads playlist using the
official `yt.playlistItems().list` client.

It is optimized for downstream batching with `videos.list`, which allows richer
metadata (duration, statistics) to be fetched efficiently in bulk.

In [11]:
def list_upload_video_ids(uploads_playlist_id: str, max_videos: int = 200) -> List[str]:
    video_ids: List[str] = []
    page_token: Optional[str] = None

    while True:
        resp = yt.playlistItems().list(
            part="contentDetails",
            playlistId=uploads_playlist_id,
            maxResults=min(50, max_videos - len(video_ids)),
            pageToken=page_token
        ).execute()

        for it in resp.get("items", []):
            vid = it.get("contentDetails", {}).get("videoId")
            if vid:
                video_ids.append(vid)
                if len(video_ids) >= max_videos:
                    return video_ids

        page_token = resp.get("nextPageToken")
        if not page_token:
            break

    return video_ids


This is used to batch
video IDs into groups of 50, which is the maximum number of IDs supported by the
YouTube Data API `videos.list` endpoint per request.

In [12]:
def chunked(xs: List[str], n: int = 50):
    for i in range(0, len(xs), n):
        yield xs[i:i+n]

Fetches metadata for a list of video IDs using the YouTube Data API `videos.list`
endpoint. The API accepts up to **50 video IDs per request**, so this function batches
requests using `chunked`.

In [21]:
def fetch_video_details(video_ids: List[str]) -> List[Dict]:
    items: List[Dict] = []
    for batch in chunked(video_ids, 50):
        resp = yt.videos().list(
            part="id,snippet,contentDetails,statistics",
            id=",".join(batch),
            maxResults=50
        ).execute()
        items.extend(resp.get("items", []))
    return items

End-to-end function that builds a structured DataFrame of videos from a list of channels.

Pipeline:
1. Normalize channel identifiers (handles / channel IDs / supported URLs)
2. Resolve channels to obtain:
   - channel_id
   - channel_title
   - uploads_playlist_id
3. Fetch channel description (stored as a channel-level feature)
4. Enumerate uploaded video IDs via the uploads playlist
5. Fetch full video metadata in batches
6. Return a single `pandas.DataFrame` containing all video rows

In [25]:
import pandas as pd
from typing import List

def parse_videos_for_channels(
    channels: List[str],
    max_videos_per_channel: int = 100
) -> pd.DataFrame:
    """
    End-to-end:
    channel identifiers -> uploads playlist -> videos.list -> DataFrame

    Includes:
      - source_channel_description
    """

    # Normalize inputs and keep only resolvable ones
    resolvable_channels = []
    for c in channels:
        obj = normalize_channel_input(c)
        if obj["type"] in {"handle", "channel_id"}:
            resolvable_channels.append(c)

    # Resolve channels → channel_id + uploads playlist (+ description now)
    channel_rows = parse_channels(resolvable_channels)

    rows = []

    for ch in channel_rows:
        if not ch.get("ok"):
            continue

        channel_description = ch.get("channel_description", "")

        # Fetch videos
        video_ids = list_upload_video_ids(
            ch["uploads_playlist_id"],
            max_videos=max_videos_per_channel
        )
        videos = fetch_video_details(video_ids)

        for v in videos:
            snippet = v.get("snippet") or {}
            content = v.get("contentDetails") or {}
            stats = v.get("statistics") or {}
            vid = v.get("id")

            rows.append({
                "source_channel_input": ch.get("input"),
                "source_channel_id": ch.get("channel_id"),
                "source_channel_title": ch.get("channel_title"),
                "source_channel_description": channel_description,

                "video_id": vid,
                "watch_url": f"https://www.youtube.com/watch?v={vid}" if vid else None,

                "title": snippet.get("title"),
                "published_at": snippet.get("publishedAt"),
                "description": snippet.get("description", ""),

                "duration_iso8601": content.get("duration"),

                "view_count": int(stats["viewCount"]) if "viewCount" in stats else None,
                "like_count": int(stats["likeCount"]) if "likeCount" in stats else None,
                "comment_count": int(stats["commentCount"]) if "commentCount" in stats else None,
            })

    return pd.DataFrame(rows)


### Training Dataset

We construct a simple labeled training dataset using two curated channel lists:

1. **OFFICIAL_CHANNELS (label = 0)**  
   A list of well-known, official studio/distributor channels that have a **low likelihood**
   of uploading unauthorized movie clips. For simplicity, we mark all videos collected from
   these channels as **legal / low-risk**.

2. **high_risk_channels (label = 1)**  
   A list of channels that (based on observable behavior and content patterns) have a **higher
   likelihood** of uploading raw movie clips. For simplicity, we mark all videos collected from
   these channels as **high-risk / likely unauthorized**.

> **Note:** These labels represent a **risk-based classification** (triage) rather than a legal
> determination. Licensing agreements are not publicly observable, so the model is intended to
> prioritize videos for review, not make final legal judgments.

In [26]:
OFFICIAL_CHANNELS = [
    "@WarnerBros",
    "@sonypictures",
    "@paramountpictures",
    "@UniversalPictures",
    "@Netflix",
    "@Disney",
    "@20thCenturyStudios",
    "@LionsgateMovies",
    "@marvel",
    "@StarWars",
]

df_legal = parse_videos_for_channels(OFFICIAL_CHANNELS, max_videos_per_channel=200)

df_legal["label"] = 0
df_legal["label_source"] = "official_allowlist"
df_legal["label_confidence"] = "high"
df_legal["dataset_split"] = "legal_negative"

df_legal.shape, df_legal[["source_channel_title","title","watch_url"]].head(10)


((2000, 17),
   source_channel_title                                              title  \
 0         Warner Bros.  ONE BATTLE AFTER ANOTHER | "River of Hills" #s...   
 1         Warner Bros.  Director Martin Scorsese presents ONE BATTLE A...   
 2         Warner Bros.               SINNERS | "Born With A Gift" #shorts   
 3         Warner Bros.  SINNERS | Ryan Coogler's Inspiration Behind th...   
 4         Warner Bros.  Are you ready? Experience #TheBrideMovie only ...   
 5         Warner Bros.  One Battle After Another | "It's One for the A...   
 6         Warner Bros.  SINNERS | “A Love Letter to Black American His...   
 7         Warner Bros.  Your burning questions answered. #WutheringHei...   
 8         Warner Bros.  @InStyle provides a sneak peek of the looks fr...   
 9         Warner Bros.               "Wuthering Heights" | Exclusive Clip   
 
                                      watch_url  
 0  https://www.youtube.com/watch?v=gePaMMcrjjg  
 1  https://www.youtube.com

In [19]:
high_risk_channels = [
    "@8KCINEMA5780",
    "@primeclips1",
    "@ClipsNation",
    "@ApexClips4k",
    "@ActionMoviesT3",
    "@998-qk8vc",
]

df_risk = parse_videos_for_channels(high_risk_channels, max_videos_per_channel=200)

df_risk["label"] = 1
df_risk["label_source"] = "high_risk_allowlist"
df_risk["label_confidence"] = "medium"
df_risk["dataset_split"] = "risk_positive"

df_risk.shape, df_risk[["source_channel_title","title","watch_url"]].head(10)


((670, 17),
   source_channel_title                                              title  \
 0           8K CINEMA   8K HDR | Battle of the Bastards | Game of Thro...   
 1           8K CINEMA   The Odyssey | 1.85 Aspect Ratio Trailer | 4K HDR    
 2           8K CINEMA   4K IMAX HDR | Superman Escapes From Lex Luthor...   
 3           8K CINEMA   4K IMAX HDR | Superman vs Engineer,Ultraman | ...   
 4           8K CINEMA   4K IMAX HDR | Superman vs Kaiju | Dolby 5.1 | ...   
 5           8K CINEMA   8K HDR | Michael Kills All The Heads of The Ot...   
 6           8K CINEMA   4K HDR | Bjorn Ironside Death Scene | Vikings ...   
 7           8K CINEMA   4K HDR | Battle Of Hardhome Scene Game of Thro...   
 8           8K CINEMA   Captain America vs Winter Soldier But Zack Sny...   
 9           8K CINEMA   IMAX 8K UHD | Captain America Best Scenes | Av...   
 
                                      watch_url  
 0  https://www.youtube.com/watch?v=CEhFdhCH1Nw  
 1  https://www.youtube.com/

## Imports for Model Training

In [28]:
pip install -q sentence-transformers scikit-learn isodate

In [29]:
import re
import numpy as np
import pandas as pd
import isodate

from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

Before training the model, we preprocess the dataset to make it suitable for
model training.

We create a unified `text` field by concatenating:
- Channel description  
- Video title  
- Video description  

This allows the transformer to capture both **video-level semantics** and
**channel-level context** in a single embedding.

In addition to text, we derive lightweight structured features:
- Log-scaled engagement metrics (`view_count`, `like_count`, `comment_count`)
- A deterministic `channel_license_signal` derived from channel description text

The `channel_license_signal` encodes prior information:
- `-1` → explicit indicators of official or licensed content (lower risk)
- `+1` → common disclaimer language often associated with unlicensed uploads
- `0` → no strong signal


In [35]:
df_legal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   source_channel_input        2000 non-null   object
 1   source_channel_id           2000 non-null   object
 2   source_channel_title        2000 non-null   object
 3   source_channel_description  2000 non-null   object
 4   video_id                    2000 non-null   object
 5   watch_url                   2000 non-null   object
 6   title                       2000 non-null   object
 7   published_at                2000 non-null   object
 8   description                 2000 non-null   object
 9   duration_iso8601            2000 non-null   object
 10  view_count                  2000 non-null   int64 
 11  like_count                  2000 non-null   int64 
 12  comment_count               2000 non-null   int64 
 13  label                       2000 non-null   int6

In [30]:
# ----------------------------
# Feature engineering helpers
# ----------------------------

def channel_license_signal(desc: str) -> int:
    d = (desc or "").lower()
    if re.search(r"\bofficial\b|\blicensed\b|\bauthorized\b", d):
        return -1  # lower risk
    if re.search(r"no copyright infringement|all rights belong|fair use", d):
        return +1  # higher risk disclaimer language
    return 0


# ----------------------------
# Build training table
# ----------------------------
df_train = pd.concat([df_legal, df_risk], ignore_index=True).copy()
df_train["label"] = df_train["label"].astype(int)

# Text fields (channel description + title + description; optional)
df_train["text"] = (
    df_train["source_channel_description"].fillna("").astype(str) +
    "\n" +
    df_train["title"].fillna("").astype(str) +
    "\n" +
    df_train["description"].fillna("").astype(str) +
    "\n"
).str.strip()

# Numeric features
for col in ["view_count", "like_count", "comment_count"]:

    df_train[col] = pd.to_numeric(df_train[col], errors="coerce")
    df_train[f"log_{col}"] = np.log1p(df_train[col])

df_train["channel_license_signal"] = df_train["source_channel_description"].apply(channel_license_signal)

# Drop empty text rows
df_train = df_train[df_train["text"].str.len() > 0].copy()

print("Dataset size:", df_train.shape)
print(df_train["label"].value_counts())



Dataset size: (2670, 22)
label
0    2000
1     670
Name: count, dtype: int64


With the dataset preprocessed, we split the data into **training** and **test**
sets in preparation for model training and evaluation.

We use `train_test_split` from scikit-learn with the following considerations:

- **80/20 split** between training and test data
- **Stratified sampling** due to class imbalance b/w positive and negative labels

Stratification is important here because the dataset exhibits a mild class
imbalance between low-risk and high-risk samples.

In [32]:
X_text = df_train["text"].tolist()
y = df_train["label"].to_numpy()

X_train_text, X_test_text, y_train, y_test, idx_train, idx_test = train_test_split(
    X_text,
    y,
    df_train.index.to_numpy(),
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [34]:
idx_train

array([ 175,  607,   39, ..., 1035, 1077, 2354])

We keep a single `text` column (channel description + title + video description) so the
transformer can generate one semantic embedding per video.

Separately, we keep structured features as numeric columns (log views/likes/comments,
and a channel-license signal). We then concatenate the embedding vector with these
numeric columns to form the final feature matrix for logistic regression.

In [36]:
# ----------------------------
# Embed 1 text column + keep numeric columns separate
# ----------------------------

# 1) Embed a single text column (already built in Cell 1)
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

X_train_embed = embedder.encode(
    df_train.loc[idx_train, "text"].tolist(),
    batch_size=64, show_progress_bar=True,
    convert_to_numpy=True, normalize_embeddings=True
)

X_test_embed = embedder.encode(
    df_train.loc[idx_test, "text"].tolist(),
    batch_size=64, show_progress_bar=True,
    convert_to_numpy=True, normalize_embeddings=True
)

# 2) Structured numeric features (kept as separate columns)
# NOTE: We do NOT force a direction like "high views => legal".
# The model learns the relationship from the labeled data.
num_cols = [
    "log_view_count",
    "log_like_count",
    "log_comment_count",
    "channel_license_signal",
]

#low-engagement flag (often useful for clip farms)
df_train["low_view_flag"] = (pd.to_numeric(df_train["view_count"], errors="coerce").fillna(0) < 10_000).astype(int)
num_cols.append("low_view_flag")

X_num_train = df_train.loc[idx_train, num_cols].fillna(0.0).to_numpy()
X_num_test  = df_train.loc[idx_test,  num_cols].fillna(0.0).to_numpy()

# 3) Final feature matrices = [embedding || numeric_features]
X_train = np.hstack([X_train_embed, X_num_train])
X_test  = np.hstack([X_test_embed,  X_num_test])

print("Train feature matrix:", X_train.shape)
print("Test feature matrix: ", X_test.shape)

# ----------------------------
# Train Logistic Regression
# ----------------------------
clf = LogisticRegression(max_iter=4000, class_weight="balanced")
clf.fit(X_train, y_train)

# ----------------------------
# Evaluate
# ----------------------------
proba = clf.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print("\nROC-AUC:", roc_auc_score(y_test, proba))
print(classification_report(y_test, pred, digits=3))

# ----------------------------
# Score all rows (rank suspects)
# ----------------------------
X_all_embed = embedder.encode(
    df_train["text"].tolist(),
    batch_size=64, show_progress_bar=True,
    convert_to_numpy=True, normalize_embeddings=True
)

X_all_num = df_train[num_cols].fillna(0.0).to_numpy()
X_all = np.hstack([X_all_embed, X_all_num])

df_train["clip_risk_proba"] = clf.predict_proba(X_all)[:, 1]

df_train.sort_values("clip_risk_proba", ascending=False)[
    ["label", "clip_risk_proba", "source_channel_title", "title", "view_count", "watch_url"]
].head(30)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Train feature matrix: (2136, 389)
Test feature matrix:  (534, 389)

ROC-AUC: 1.0
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       400
           1      1.000     1.000     1.000       134

    accuracy                          1.000       534
   macro avg      1.000     1.000     1.000       534
weighted avg      1.000     1.000     1.000       534



Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Unnamed: 0,label,clip_risk_proba,source_channel_title,title,view_count,watch_url
2571,1,0.998521,CineSpotlight,A tragic story of a boy.#movieclips,3146,https://www.youtube.com/watch?v=9c7kiZKo2ew
2037,1,0.997336,PrimeClips,Alien: Romulus (2024) Rain Shoots The Aliens S...,1756,https://www.youtube.com/watch?v=GY-sCIF7eq8
2562,1,0.995634,CineSpotlight,Boy saved a killer whale.#movieclips #viralvideo,5640,https://www.youtube.com/watch?v=OM8wofVbTZk
2357,1,0.995619,Apex Clips,Ad Astra 4K HDR | Mars Landing,3752,https://www.youtube.com/watch?v=DbsKf6LhESg
2380,1,0.995503,Apex Clips,Alien: Covenant 4K HDR | Death Of The Engineers,882648,https://www.youtube.com/watch?v=92Q7Rv5jT80
2377,1,0.995187,Apex Clips,Alien: Covenant 4K HDR | Ending Fight,58243,https://www.youtube.com/watch?v=lmBBF8sU9lo
2148,1,0.994915,Clips Nation,English M********** | Do you speak it? | Pulp ...,1860,https://www.youtube.com/watch?v=zDIJZ5IE7O0
2143,1,0.994882,Clips Nation,Car Chase Sequence | One Battle After Another ...,231,https://www.youtube.com/watch?v=qOgpJZIy03I
2356,1,0.99488,Apex Clips,Ad Astra 4K HDR | Roy Finds His Dad,4824,https://www.youtube.com/watch?v=ZNMXEhHvyWI
2036,1,0.994866,PrimeClips,Alien: Romulus (2024) Ending Scene | 4K HD,15793,https://www.youtube.com/watch?v=iB63ehPmTFs


After scoring every video with `clip_risk_proba`, we export results to an Excel file.

We provide:
- **All scored videos** (full audit trail)
- **High-risk subset** filtered by a probability threshold (triage list)

> Note: `clip_risk_proba` is a risk score produced by the model and should be interpreted
> as a prioritization signal rather than a legal determination.

In [37]:
RISK_THRESHOLD = 0.95
TOP_N = 500

df_scored = df_train.sort_values("clip_risk_proba", ascending=False).copy()

# High-risk subset
df_high_risk = df_scored[df_scored["clip_risk_proba"] >= RISK_THRESHOLD].copy()

if len(df_high_risk) < 50:
    df_high_risk = df_scored.head(TOP_N).copy()

cols_high_risk = [
    "clip_risk_proba",
    "source_channel_title",
    "source_channel_id",
    "video_id",
    "title",
    "published_at",
    "view_count",
    "like_count",
    "comment_count",
    "watch_url",
    "label_source",
    "label_confidence",
]

cols_all = [
    "label",
    "clip_risk_proba",
    "source_channel_title",
    "source_channel_id",
    "video_id",
    "title",
    "published_at",
    "duration_iso8601",
    "view_count",
    "like_count",
    "comment_count",
    "watch_url",
]

out_path = "youtube_clip_risk_results.xlsx"

with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    df_high_risk[cols_high_risk].to_excel(writer, index=False, sheet_name="high_risk")
    df_scored[cols_all].to_excel(writer, index=False, sheet_name="all_scored")

print(f"Saved Excel to: {out_path}")
print("High-risk rows:", len(df_high_risk))


Saved Excel to: youtube_clip_risk_results.xlsx
High-risk rows: 353
