# Setup & Load Clean Data

In [None]:
# ============================================================
# Setup & Environment for Feature Engineering
# Generates 5 datasets:
# structured, text, image, regression, classification
# ============================================================

# --- Core Imports ---
import os
import pandas as pd
import numpy as np
from pathlib import Path
from io import BytesIO
from PIL import Image, UnidentifiedImageError
from tqdm import tqdm
import requests

# --- ML / Image Processing ---
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision import models

# --- Text & Sentiment ---
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD

# --- Paths ---
# The project root is taken to be the parent of the current working directory;
# every dataset this notebook reads or writes lives under <root>/data/processed.
base = Path.cwd().parent
processed_path = base / "data" / "processed"

# --- Load Cleaned Dataset ---
df = pd.read_parquet(processed_path / "youtube_clean_final.parquet")
print("✅ Loaded cleaned dataset:", df.shape)

# --- Device & ResNet50 Model Setup (for Thumbnail Features) ---
# Prefer a CUDA GPU, then Apple's MPS backend, and fall back to the CPU, so the
# notebook uses whichever accelerator the host actually provides.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load pretrained ResNet50 (ImageNet weights) and swap the final classifier
# for an identity layer, so a forward pass returns the features that used to
# feed the classifier instead of class scores.
resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
resnet50.fc = nn.Identity()
resnet50 = resnet50.to(device)
resnet50.eval()  # evaluation mode: the model is only used for inference here

# Define preprocessing transform: resize to 224x224, convert to a tensor and
# normalise each channel with the ImageNet mean / std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
print("✅ ResNet50 loaded and ready for image feature extraction.")
df.head()

# Basic Text Cleaning

In [None]:
# --- Normalise the title column ---
# Cast every title to a string and trim leading/trailing whitespace so the
# feature cells below can rely on plain ``str`` values.
df["title"] = df["title"].astype(str).str.strip()
print("Sample titles:", df["title"].head(5), sep="\n")

# Structured Title Features

In [None]:
# --- Handcrafted title features ---
def _caps_ratio(title: str) -> float:
    """Return the share of characters in *title* that are uppercase; 0 for an empty title."""
    if not title:
        return 0
    return sum(1 for ch in title if ch.isupper()) / len(title)


def _avg_word_len(title: str) -> float:
    """Return the mean length of the whitespace-separated words in *title*; 0 if it has none."""
    words = title.split()  # split once and reuse, instead of twice per title
    if not words:
        return 0
    return np.mean([len(w) for w in words])


df["title_length"] = df["title"].apply(len)
df["word_count"] = df["title"].apply(lambda x: len(x.split()))
df["caps_ratio"] = df["title"].apply(_caps_ratio)
# Binary flags (0/1) for punctuation and digits appearing anywhere in the title
df["has_question"] = df["title"].apply(lambda x: int("?" in x))
df["has_exclamation"] = df["title"].apply(lambda x: int("!" in x))
df["has_number"] = df["title"].apply(lambda x: int(any(ch.isdigit() for ch in x)))
df["avg_word_len"] = df["title"].apply(_avg_word_len)

print("✅ Added basic title features.")
df[["title", "title_length", "word_count", "caps_ratio", "has_question", "has_exclamation", "has_number", "avg_word_len"]].head()

# Sentiment Feature

In [None]:
# --- Sentiment analysis ---
# One VADER analyzer is created and reused for every title; only the
# "compound" entry of its polarity_scores() result is kept as the feature.
analyzer = SentimentIntensityAnalyzer()

df["sentiment_vader"] = df["title"].apply(lambda x: analyzer.polarity_scores(x)["compound"])

print("✅ Added sentiment feature.")
df[["title", "sentiment_vader"]].head()

# Structured Feature Matrix + Targets

In [None]:
# --- Structured features ---
# Columns forming the structured feature matrix: the handcrafted title
# statistics, the VADER sentiment score and the `subscribers` column.
structured_features = [
    "title_length",
    "word_count",
    "caps_ratio",
    "has_question",
    "has_exclamation",
    "has_number",
    "avg_word_len",
    "sentiment_vader",
    "subscribers",
]

# Independent copy, so later edits to df do not leak into the feature matrix
X_structured = df.loc[:, structured_features].copy()

# --- Targets ---
# Regression target: the continuous views_per_subscriber value.
y_reg = df["views_per_subscriber"]
# Classification target: 1 for rows at or above the 75th percentile of
# views_per_subscriber (the top 25%), 0 otherwise.
y_clf = y_reg.ge(y_reg.quantile(0.75)).astype(int)

print("Structured feature matrix:", X_structured.shape)
print("Regression target shape:", y_reg.shape)
print("Classification target distribution:\n", y_clf.value_counts(normalize=True))

In [None]:
# --- Handle outliers for regression target ---

# Option 1: cap the ratio to the closed range [0, 500] (simple and intuitive)
df["views_per_subscriber"] = df["views_per_subscriber"].clip(lower=0, upper=500)

# Option 2 (alternative): a log transform gives a smoother distribution
# df["views_per_subscriber_log"] = np.log1p(df["views_per_subscriber"])

# Rebind the regression target so it points at the clipped column
y_reg = df["views_per_subscriber"]

# (With the log version, the line above becomes y_reg = df["views_per_subscriber_log"])

# Save Structured Datasets

In [None]:
# --- Save structured features and targets ---
# Each path is built once and reused, so the location printed below is
# guaranteed to be the location that was written.
structured_path = processed_path / "youtube_features_structured.parquet"
regression_path = processed_path / "youtube_target_regression.parquet"
classification_path = processed_path / "youtube_target_classification.parquet"

# All three files are written without the index, so they line up by row position.
X_structured.to_parquet(structured_path, index=False)
y_reg.to_frame("views_per_subscriber").to_parquet(regression_path, index=False)
y_clf.to_frame("high_clickability").to_parquet(classification_path, index=False)

print("✅ Saved structured features and targets.")
print("✅ Saved cleaned regression target to:", regression_path)
print("✅ Saved cleaned classification target to:", classification_path)

# TF-IDF Text Features (Unigrams + Bigrams)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# --- TF-IDF setup ---
# Unigrams and bigrams, English stop words removed, vocabulary capped at
# 1000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=1000)

tfidf_matrix = tfidf.fit_transform(df["title"])
print("Raw TF-IDF shape:", tfidf_matrix.shape)

# --- Dimensionality reduction (TruncatedSVD) ---
# Reduce the TF-IDF matrix to 50 components; the seed is fixed for reproducibility.
svd = TruncatedSVD(n_components=50, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

print("Reduced TF-IDF shape:", tfidf_reduced.shape)

# --- Convert to DataFrame ---
# Component columns are numbered from 1: tfidf_comp_1 ... tfidf_comp_N
n_components = tfidf_reduced.shape[1]
tfidf_cols = ["tfidf_comp_{}".format(k) for k in range(1, n_components + 1)]
X_tfidf = pd.DataFrame(tfidf_reduced, columns=tfidf_cols)

X_tfidf.head()

In [None]:
# Inspect the first five SVD components: for each, list the ten vocabulary
# terms with the largest weights, highest first.
terms = tfidf.get_feature_names_out()
for i, comp in enumerate(svd.components_[:5]):
    # argsort is ascending, so reverse it and keep the first ten indices
    top_terms = [terms[x] for x in np.argsort(comp)[::-1][:10]]
    print("Component {}: {}".format(i + 1, ", ".join(top_terms)))


# Save TF-IDF Dataset

In [None]:
# --- Save reduced TF-IDF features ---
X_tfidf.to_parquet(processed_path / "youtube_features_text.parquet", index=False)

print("‚úÖ Saved text-based TF-IDF features to:", processed_path / "youtube_features_text.parquet")

# Extract Image Features

In [None]:
# --- Downloads thumbnail and returns a 2048-D ResNet50 embedding. ---

def extract_features_from_url(url: str) -> np.ndarray:
    """Download a thumbnail and return its 2048-D ResNet50 embedding.

    The high-quality variant of the thumbnail (``/hqdefault.jpg``) is tried
    first; if it cannot be downloaded or decoded, the URL as given is tried
    before giving up.

    Args:
        url: Thumbnail URL. Anything that is not a string starting with
            ``"http"`` is treated as a missing thumbnail.

    Returns:
        A flat ``float32`` feature vector produced by the module-level
        ``resnet50`` model, or an all-zero vector of length 2048 when no
        image could be fetched and decoded.
    """
    # Reject missing / malformed URLs up front, without touching the network
    if not isinstance(url, str) or not url.startswith("http"):
        return np.zeros(2048, dtype=np.float32)

    # Try the high-quality thumbnail first, then fall back to the original URL
    url_hq = url.replace("/default.jpg", "/hqdefault.jpg")
    candidates = [url_hq] if url_hq == url else [url_hq, url]

    for candidate in candidates:
        try:
            # Download image
            response = requests.get(candidate, timeout=8)
            response.raise_for_status()
            # Decode image
            img = Image.open(BytesIO(response.content)).convert("RGB")
            img_tensor = transform(img).unsqueeze(0).to(device)
            # Extract ResNet embedding (no gradients needed for inference)
            with torch.no_grad():
                features = resnet50(img_tensor).cpu().numpy().flatten()
            return features.astype(np.float32)
        except (requests.exceptions.RequestException, UnidentifiedImageError, OSError, ValueError):
            # OSError is included because Pillow reports truncated or corrupt
            # image data with it; one bad thumbnail must not abort the batch.
            continue

    # Return zero vector if the image is missing or invalid
    return np.zeros(2048, dtype=np.float32)

In [None]:
# --- Applies extraction with a progress bar ---

tqdm.pandas(desc="Extracting ResNet features")
df["resnet_features"] = df["thumbnail_link"].progress_apply(extract_features_from_url)

# Count failed downloads: extract_features_from_url returns an all-zero
# vector when a thumbnail could not be fetched or decoded.
fail_count = sum(bool(np.all(f == 0)) for f in df["resnet_features"])
# Guard the percentage against an empty DataFrame (avoids ZeroDivisionError)
fail_pct = 100 * fail_count / len(df) if len(df) else 0.0
print(f"⚠️ Failed to download or decode {fail_count:,} thumbnails "
      f"({fail_pct:.1f}%)")

# Apply PCA (dimensionality reduction)

In [None]:
# Stacks all 2048-D feature vectors into one big matrix and projects them onto
# 50 principal components. How much variance those components keep depends on
# the data, so the retained share is printed below rather than assumed.

# --- Remove old PCA columns (keeps this cell safe to re-run) ---
pca_cols = [c for c in df.columns if c.startswith("pca_")]
if pca_cols:
    df = df.drop(columns=pca_cols)

# --- PCA Computation ---
X = np.vstack(df["resnet_features"].values)
print("Raw feature matrix shape:", X.shape)

pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)
print(f"Explained variance retained: {pca.explained_variance_ratio_.sum():.1%}")

# Name the columns pca_1 ... pca_N from the actual output width
df_pca = pd.DataFrame(X_pca, columns=[f"pca_{i+1}" for i in range(X_pca.shape[1])])
# Reset df's index so rows line up positionally with df_pca's default RangeIndex
df = pd.concat([df.reset_index(drop=True), df_pca], axis=1)

print("✅ PCA recomputed → shape:", df_pca.shape)

# Save Image Features Dataset

In [None]:
# Saving the dataset as "youtube_features_image.parquet"

# Keep only the PCA columns (pca_1 ... pca_50)
pca_cols = [f"pca_{i}" for i in range(1, 51)]
df_pure_image = df[pca_cols].copy()

# Build the path once so the printed location is the one actually written
image_path = processed_path / "youtube_features_image.parquet"
df_pure_image.to_parquet(image_path, index=False)

print("✅ Saved pure thumbnail dataset to:", image_path)
print("Shape:", df_pure_image.shape)

In [None]:
# Verify saved image feature dataset by reading it back from disk

df_check = pd.read_parquet(processed_path / "youtube_features_image.parquet")
print(f"✅ Loaded {len(df_check):,} rows × {len(df_check.columns)} columns")
# Show the last ten column names as a quick sanity check
print("Sample columns:", df_check.columns[-10:].to_list())

# Summary Check

In [None]:
# Summary of all saved feature datasets: prints the in-memory shape of each
# feature matrix / target, followed by the output directory.
print("✅ Summary of saved datasets:")

print(f"• Structured features:       {X_structured.shape}")
print(f"• Text (TF-IDF) features:    {X_tfidf.shape}")
print(f"• Image (PCA) features:      {df_pure_image.shape}")
print(f"• Regression target:         {y_reg.shape}")
print(f"• Classification target:     {y_clf.shape}")

print("\n📂 All datasets saved under:")
print(processed_path.resolve())