In [1]:
# ============================================================
# 03_thumbnail_features.ipynb
# Extract deep visual features from YouTube thumbnails (ResNet50)
# ============================================================

# --- System & Data Handling ---
import os
from io import BytesIO
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from PIL import Image, UnidentifiedImageError
from tqdm import tqdm

# --- ML & Feature Extraction ---
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision import models

# --- Sentiment Analysis (optional) ---
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# --- Dimensionality Reduction ---
from sklearn.decomposition import PCA

In [2]:
# ============================================================
# Setup for environment & paths
# ============================================================

# Detect Apple Silicon GPU or fallback to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("✅ Using device:", device)

# Set directories
base = Path.cwd().parent if (Path.cwd() / "notebooks").exists() else Path.cwd()
processed_path = base / "data" / "processed"
processed_path.mkdir(parents=True, exist_ok=True)

✅ Using device: mps


In [3]:
# ============================================================
# Loads a pretrained deep neural network (ResNet-50) that already knows how to recognize image features from ImageNet.
# ============================================================

# Load pretrained model (ImageNet)
resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
resnet50.fc = nn.Identity()   # remove final classifier
resnet50 = resnet50.to(device)
resnet50.eval()

# Define preprocessing (ImageNet normalization)
transform = transforms.Compose([
    transforms.Resize((224, 224)), # Matches ResNet’s expected input shape
    transforms.ToTensor(), 
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
print("✅ ResNet50 loaded and ready.")

✅ ResNet50 loaded and ready.


In [4]:
# ============================================================
# Load cleaned dataset from disk into a Pandas DataFrame
# ============================================================

df_path = Path("../data/processed/youtube_clean_final.parquet")
df = pd.read_parquet(df_path) 
print(f"✅ Loaded {len(df):,} rows and {len(df.columns)} columns.")
print("Columns:", df.columns[:8].to_list())

✅ Loaded 5,742 rows and 19 columns.
Columns: ['video_id', 'trending_date', 'title', 'channel_title', 'category_id', 'publish_time', 'tags', 'views']


In [5]:
def extract_features_from_url(url: str) -> np.ndarray:
    """
    Downloads a thumbnail and returns a 2048-D ResNet50 embedding.
    Falls back to zeros if the image is missing or invalid.
    """
    try:
        if not isinstance(url, str) or not url.startswith("http"):
            raise ValueError("Invalid URL")

        # Try high-quality thumbnail if available
        url_hq = url.replace("/default.jpg", "/hqdefault.jpg")

        # Download image
        response = requests.get(url_hq, timeout=8)
        response.raise_for_status()

        # Decode image
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img_tensor = transform(img).unsqueeze(0).to(device)

        # Extract ResNet embedding
        with torch.no_grad():
            features = resnet50(img_tensor).cpu().numpy().flatten()

        return features.astype(np.float32)

    except (requests.exceptions.RequestException, UnidentifiedImageError, ValueError):
        # Return zero vector if any error occurs
        return np.zeros(2048, dtype=np.float32)

In [6]:
# ============================================================
# Applies extraction with a progress bar -> visibility into data quality
# ============================================================

tqdm.pandas(desc="Extracting ResNet features")

df["resnet_features"] = df["thumbnail_link"].progress_apply(extract_features_from_url)

# Count failed downloads
fail_count = sum(np.all(f == 0) for f in df["resnet_features"])
print(f"⚠️ Failed to download or decode {fail_count:,} thumbnails "
      f"({100*fail_count/len(df):.1f}%)")

# Optional checkpoint
tmp_path = processed_path / "resnet_raw_features.parquet"
df[["video_id", "resnet_features"]].to_parquet(tmp_path, index=False)
print("Saved checkpoint:", tmp_path)

Extracting ResNet features: 100%|██████████| 5742/5742 [05:34<00:00, 17.16it/s]


⚠️ Failed to download or decode 526 thumbnails (9.2%)
Saved checkpoint: /Users/jinbo/Downloads/YouTube_Clickability_Study/notebooks/data/processed/resnet_raw_features.parquet


In [24]:
# ============================================================
# Stacks all 2048-D feature vectors into one big matrix.
# Uses **PCA** to reduce them to 50 features while keeping most of the important variance (≈ 95 %).
# ============================================================
# --- Remove old PCA columns ---
pca_cols = [c for c in df.columns if c.startswith("pca_")]
if pca_cols:
    print(f"Removing {len(pca_cols)} old PCA columns...")
    df = df.drop(columns=pca_cols)
    
# --- PCA Computation ---
X = np.vstack(df["resnet_features"].values)
print("Raw feature matrix shape:", X.shape)

pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)

df_pca = pd.DataFrame(X_pca, columns=[f"pca_{i+1}" for i in range(50)])
df = pd.concat([df.reset_index(drop=True), df_pca], axis=1)

print("✅ PCA recomputed → shape:", df_pca.shape)

Removing 50 old PCA columns...
Raw feature matrix shape: (5742, 2048)
✅ PCA recomputed → shape: (5742, 50)


In [25]:
# ============================================================
# Saving the dataset as "youtube_thumbnail_features.parquet"
# ============================================================
# Keep only the PCA columns
pca_cols = [f"pca_{i}" for i in range(1, 51)]
df_pure_image = df[pca_cols].copy()

# Define output path
out_path = df_path.parent / "youtube_features_image.parquet"

# Save PCA-only features
df_pure_image.to_parquet(out_path, index=False)

print("✅ Saved pure thumbnail dataset to:", out_path.resolve())
print("Shape:", df_pure_image.shape)

✅ Saved pure thumbnail dataset to: /Users/jinbo/Downloads/YouTube_Clickability_Study/data/processed/youtube_features_image.parquet
Shape: (5742, 50)


In [26]:
# ============================================================
# Re-loads and prints shape + tail of column names for testing
# ============================================================

df_check = pd.read_parquet(out_path)
print(f"Loaded {len(df_check):,} rows × {len(df_check.columns)} columns")
print("Sample columns:", df_check.columns[-10:].to_list())

Loaded 5,742 rows × 50 columns
Sample columns: ['pca_41', 'pca_42', 'pca_43', 'pca_44', 'pca_45', 'pca_46', 'pca_47', 'pca_48', 'pca_49', 'pca_50']
