# Setup & Load Clean Data

In [12]:
# --- Setup ---
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# --- Paths ---
# The project root is resolved as the parent of the current working directory
# (assumes the kernel was started one level below the root — TODO confirm).
base = Path.cwd().parent
processed_path = base.joinpath("data", "processed")

# --- Load cleaned dataset ---
clean_file = processed_path / "youtube_clean_final.parquet"
df = pd.read_parquet(clean_file)
print("Loaded cleaned dataset:", df.shape)

# Last expression of the cell: preview of the first rows.
df.head()

Loaded cleaned dataset: (5742, 19)


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,subscribers,views_per_subscriber,views_per_subscriber_log
0,-0CMnp02rNY,18.11.06,Mindy Kaling's Daughter Had the Perfect Reacti...,TheEllenShow,24,2018-06-04 13:00:00+00:00,"ellen|""ellen degeneres""|""the ellen show""|""elle...",800359,9773,332,423,https://i.ytimg.com/vi/-0CMnp02rNY/default.jpg,False,False,False,Ocean's 8 star Mindy Kaling dished on bringing...,23760020.0,0.033685,0.03313
1,-0NYY8cqdiQ,18.01.02,Megan Mullally Didn't Notice the Interesting P...,TheEllenShow,24,2018-01-29 14:00:39+00:00,"megan mullally|""megan""|""mullally""|""will and gr...",563746,4429,54,94,https://i.ytimg.com/vi/-0NYY8cqdiQ/default.jpg,False,False,False,Ellen and Megan Mullally have known each other...,23760020.0,0.023727,0.02345
2,-1Hm41N0dUs,18.01.05,Cast of Avengers: Infinity War Draws Their Cha...,Jimmy Kimmel Live,23,2018-04-27 07:30:02+00:00,"jimmy|""jimmy kimmel""|""jimmy kimmel live""|""late...",2058516,41248,580,1484,https://i.ytimg.com/vi/-1Hm41N0dUs/default.jpg,False,False,False,"Benedict Cumberbatch, Don Cheadle, Elizabeth O...",11262900.0,0.18277,0.167859
3,-1yT-K3c6YI,17.02.12,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,Molly Burke,22,2017-11-28 18:30:43+00:00,"youtube quiz|""youtuber quiz""|""truth or dare""|""...",231341,7734,212,846,https://i.ytimg.com/vi/-1yT-K3c6YI/default.jpg,False,False,False,Check out the video we did on the Merrell Twin...,274004.0,0.844295,0.612097
4,-2RVw2_QyxQ,17.16.11,2017 Champions Showdown: Day 3,Saint Louis Chess Club,27,2017-11-12 02:39:01+00:00,"Chess|""Saint Louis""|""Club""",71089,460,27,20,https://i.ytimg.com/vi/-2RVw2_QyxQ/default.jpg,False,False,False,The Saint Louis Chess Club hosts a series of f...,147718.0,0.481245,0.392883


# Basic Text Cleaning

In [13]:
# --- Clean title column ---
# Missing titles are replaced with "" *before* the string cast: astype(str)
# on its own turns NaN/None into the literal text "nan"/"None", which every
# later step would then treat as if it were a real title.
# After this line every title is a plain str with no leading/trailing whitespace.
df["title"] = df["title"].fillna("").astype(str).str.strip()
print("Sample titles:")
print(df["title"].head(5))

Sample titles:
0    Mindy Kaling's Daughter Had the Perfect Reacti...
1    Megan Mullally Didn't Notice the Interesting P...
2    Cast of Avengers: Infinity War Draws Their Cha...
3    YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...
4                       2017 Champions Showdown: Day 3
Name: title, dtype: object


# Structured Title Features

In [14]:
# --- Handcrafted title features ---
def _caps_ratio(text):
    """Fraction of the characters in `text` that are uppercase (0 if empty)."""
    if len(text) == 0:
        return 0
    return sum(ch.isupper() for ch in text) / len(text)


def _has_digit(text):
    """1 if any character of `text` is a digit, else 0."""
    return int(any(ch.isdigit() for ch in text))


def _avg_word_len(text):
    """Mean length of the whitespace-separated words in `text` (0 if none)."""
    words = text.split()
    if not words:
        return 0
    return np.mean([len(word) for word in words])


# Column name -> function applied to each title; dict order is the order in
# which the columns are added to df and shown in the preview below.
title_feature_funcs = {
    "title_length": len,
    "word_count": lambda text: len(text.split()),
    "caps_ratio": _caps_ratio,
    "has_question": lambda text: int("?" in text),
    "has_exclamation": lambda text: int("!" in text),
    "has_number": _has_digit,
    "avg_word_len": _avg_word_len,
}

for feature_name, feature_func in title_feature_funcs.items():
    df[feature_name] = df["title"].map(feature_func)

print("✅ Added basic title features.")
df[["title", *title_feature_funcs]].head()

✅ Added basic title features.


Unnamed: 0,title,title_length,word_count,caps_ratio,has_question,has_exclamation,has_number,avg_word_len
0,Mindy Kaling's Daughter Had the Perfect Reacti...,74,11,0.121622,0,0,0,5.818182
1,Megan Mullally Didn't Notice the Interesting P...,75,10,0.106667,0,0,0,6.6
2,Cast of Avengers: Infinity War Draws Their Cha...,53,8,0.132075,0,0,0,5.75
3,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,51,10,0.764706,0,1,0,4.2
4,2017 Champions Showdown: Day 3,30,5,0.1,0,0,1,5.2


# Sentiment Feature

In [15]:
# --- Sentiment analysis ---
analyzer = SentimentIntensityAnalyzer()


def _vader_compound(text):
    """Return the "compound" entry of the analyzer's polarity scores for `text`."""
    scores = analyzer.polarity_scores(text)
    return scores["compound"]


# One compound score per title, stored as a new column.
df["sentiment_vader"] = df["title"].map(_vader_compound)

print("✅ Added sentiment feature.")
df[["title", "sentiment_vader"]].head()

✅ Added sentiment feature.


Unnamed: 0,title,sentiment_vader
0,Mindy Kaling's Daughter Had the Perfect Reacti...,0.5719
1,Megan Mullally Didn't Notice the Interesting P...,-0.3089
2,Cast of Avengers: Infinity War Draws Their Cha...,-0.5994
3,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,0.5147
4,2017 Champions Showdown: Day 3,0.5267


# Structured Feature Matrix + Targets

In [16]:
# --- Structured features ---
# Seven handcrafted title features, the sentiment score, and the channel's
# subscriber count — in this exact column order.
structured_features = [
    "title_length",
    "word_count",
    "caps_ratio",
    "has_question",
    "has_exclamation",
    "has_number",
    "avg_word_len",
    "sentiment_vader",
    "subscribers",
]

# Independent copy so later edits to df do not leak into the feature matrix.
X_structured = df.loc[:, structured_features].copy()

# --- Targets ---
vps = df["views_per_subscriber"]

# Regression target: the continuous ratio itself.
y_reg = vps

# Classification target: 1 when the ratio is at or above its 75th percentile
# (i.e. the top 25% of rows), otherwise 0.
top_quartile_cutoff = vps.quantile(0.75)
y_clf = vps.ge(top_quartile_cutoff).astype(int)

print("Structured feature matrix:", X_structured.shape)
print("Regression target shape:", y_reg.shape)
print("Classification target distribution:\n", y_clf.value_counts(normalize=True))

Structured feature matrix: (5742, 9)
Regression target shape: (5742,)
Classification target distribution:
 views_per_subscriber
0    0.749913
1    0.250087
Name: proportion, dtype: float64


In [17]:
# --- Handle outliers for regression target ---

# Chosen approach: cap the ratio to the closed interval [0, 500]. The column
# in df is overwritten with the capped values.
vps_floor, vps_cap = 0, 500
df["views_per_subscriber"] = df["views_per_subscriber"].clip(lower=vps_floor, upper=vps_cap)

# Alternative (not used here): a log1p transform for a smoother distribution,
#   df["views_per_subscriber_log"] = np.log1p(df["views_per_subscriber"])
# in which case y_reg below should point at "views_per_subscriber_log" instead.

# Re-bind the regression target to the capped column.
y_reg = df["views_per_subscriber"]

# Persist the capped target as a single-column parquet file (index dropped).
regression_target_file = processed_path / "youtube_target_regression.parquet"
y_reg.to_frame("views_per_subscriber").to_parquet(regression_target_file, index=False)
print("✅ Saved cleaned regression target to:", regression_target_file)

✅ Saved cleaned regression target to: /Users/jinbo/Downloads/YouTube_Clickability_Study/data/processed/youtube_target_regression.parquet


# Save Structured Datasets

In [18]:
# --- Save structured features and targets ---
# (file name, frame) pairs, written in this order; each target Series is
# wrapped in a one-column frame under the column name given to to_frame().
structured_outputs = [
    ("youtube_features_structured.parquet", X_structured),
    ("youtube_target_regression.parquet", y_reg.to_frame("views_per_subscriber")),
    ("youtube_target_classification.parquet", y_clf.to_frame("high_clickability")),
]

for file_name, frame in structured_outputs:
    frame.to_parquet(processed_path / file_name, index=False)

print("✅ Saved structured features and targets.")

✅ Saved structured features and targets.


# TF-IDF Text Features (Unigrams + Bigrams)

In [19]:
# TfidfVectorizer and TruncatedSVD are imported in the notebook's import cell
# at the top, together with the other dependencies.

# --- TF-IDF setup ---
# Unigrams and bigrams of the titles, English stop words removed, vocabulary
# capped at 1000 terms.
# NOTE(review): the vectorizer and the SVD below are fitted on every row of df;
# if a train/test split is made downstream, consider fitting on training rows only.
tfidf = TfidfVectorizer(
    max_features=1000,
    stop_words="english",
    ngram_range=(1, 2)
)

tfidf_matrix = tfidf.fit_transform(df["title"])
print("Raw TF-IDF shape:", tfidf_matrix.shape)

# --- Dimensionality reduction (TruncatedSVD) ---
# Project the sparse TF-IDF matrix onto 50 components; the seed is fixed so
# re-runs give the same components.
svd = TruncatedSVD(n_components=50, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

print("Reduced TF-IDF shape:", tfidf_reduced.shape)

# --- Convert to DataFrame ---
tfidf_cols = [f"tfidf_comp_{i+1}" for i in range(tfidf_reduced.shape[1])]
# Reuse df's index so the text features stay row-aligned with X_structured,
# y_reg and y_clf (all derived from df) if they are joined or concatenated.
X_tfidf = pd.DataFrame(tfidf_reduced, columns=tfidf_cols, index=df.index)

X_tfidf.head()

Raw TF-IDF shape: (5742, 1000)
Reduced TF-IDF shape: (5742, 50)


Unnamed: 0,tfidf_comp_1,tfidf_comp_2,tfidf_comp_3,tfidf_comp_4,tfidf_comp_5,tfidf_comp_6,tfidf_comp_7,tfidf_comp_8,tfidf_comp_9,tfidf_comp_10,...,tfidf_comp_41,tfidf_comp_42,tfidf_comp_43,tfidf_comp_44,tfidf_comp_45,tfidf_comp_46,tfidf_comp_47,tfidf_comp_48,tfidf_comp_49,tfidf_comp_50
0,0.008286,0.00029,0.02194,0.021443,0.011097,0.001467,0.0081,0.008212,0.01314,0.016067,...,0.028178,0.106299,-0.072293,-0.078548,-0.092431,-0.123884,0.009556,0.192797,-0.016007,0.082212
1,0.00091,0.000398,0.008644,0.013543,0.014892,0.000254,-0.003213,-0.007908,0.003859,0.005647,...,0.002663,0.013645,0.000232,0.009096,-0.008013,0.003116,-0.003087,-0.004629,-0.01096,-0.01066
2,0.010907,-0.010684,0.007772,0.025401,-0.013034,-0.005131,-0.026262,-0.001171,0.00284,0.002227,...,0.258431,0.344099,0.090052,-0.014552,0.123406,0.176561,-0.254786,-0.006088,-0.020199,-0.142419
3,0.003355,-0.002931,0.005549,0.005923,0.000934,0.000944,0.002368,0.005029,0.000446,0.012689,...,-0.004002,-0.010766,0.007974,-0.003027,-0.003961,-0.000667,-0.009677,0.010092,0.010907,0.010655
4,0.020561,-0.013957,0.170305,0.198557,0.333445,-0.046106,-0.023345,-0.174501,0.076804,0.113153,...,0.008076,-0.014739,-0.00845,0.035896,-0.016587,-0.008736,-0.024485,0.01297,-0.022617,-0.001041


In [20]:
# Show the 10 highest-weighted vocabulary terms for each of the first five
# SVD components, highest weight first.
terms = tfidf.get_feature_names_out()
n_components_shown = 5
n_top_terms = 10

for comp_no, weights in enumerate(svd.components_[:n_components_shown], start=1):
    # argsort is ascending: reverse it, then keep the leading entries.
    top_idx = weights.argsort()[::-1][:n_top_terms]
    top_terms = [terms[idx] for idx in top_idx]
    print(f"Component {comp_no}: {', '.join(top_terms)}")


Component 1: official, trailer, video, official trailer, official video, hd, trailer hd, netflix, hd netflix, music
Component 2: video, official video, music, music video, official music, official, ft, lyric, lyric video, video ft
Component 3: 2018, 2017, new, vs, day, game, live, best, 10, awards
Component 4: new, 2017, star, wars, star wars, jedi, vs, wars jedi, makeup, live
Component 5: 2017, vs, best, 10, highlights, awards, nfl, game, live, makeup


# Save TF-IDF Dataset

In [21]:
# --- Save reduced TF-IDF features ---
# The reduced component matrix is written without its index.
text_features_file = processed_path / "youtube_features_text.parquet"
X_tfidf.to_parquet(text_features_file, index=False)

print("✅ Saved text-based TF-IDF features to:", text_features_file)

✅ Saved text-based TF-IDF features to: /Users/jinbo/Downloads/YouTube_Clickability_Study/data/processed/youtube_features_text.parquet


# Summary Check

In [22]:
# Final sanity check: report the shape of every artefact built above.
summary_items = [
    ("Structured features", X_structured),
    ("Text features", X_tfidf),
    ("Regression target", y_reg),
    ("Classification target", y_clf),
]
for label, obj in summary_items:
    print(f"{label}:", obj.shape)

# Blank line, caption, then the output directory on its own line.
print("\nData saved to:", processed_path, sep="\n")

Structured features: (5742, 9)
Text features: (5742, 50)
Regression target: (5742,)
Classification target: (5742,)

Data saved to:
/Users/jinbo/Downloads/YouTube_Clickability_Study/data/processed
