In [None]:
# --- Setup ---
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load cleaned dataset ---
base = Path.cwd().parent
data_path = base / "data" / "processed" / "youtube_engagement_clean.parquet"

df = pd.read_parquet(data_path)
print("Loaded dataset with", len(df), "rows and", len(df.columns), "columns.")
# df.head()

Loaded dataset with 5905 rows and 17 columns.


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,subscriber
0,-0CMnp02rNY,18.11.06,Mindy Kaling's Daughter Had the Perfect Reacti...,TheEllenShow,24,2018-06-04 13:00:00+00:00,"ellen|""ellen degeneres""|""the ellen show""|""elle...",800359,9773,332,423,https://i.ytimg.com/vi/-0CMnp02rNY/default.jpg,False,False,False,Ocean's 8 star Mindy Kaling dished on bringing...,23760020.0
1,-0NYY8cqdiQ,18.01.02,Megan Mullally Didn't Notice the Interesting P...,TheEllenShow,24,2018-01-29 14:00:39+00:00,"megan mullally|""megan""|""mullally""|""will and gr...",563746,4429,54,94,https://i.ytimg.com/vi/-0NYY8cqdiQ/default.jpg,False,False,False,Ellen and Megan Mullally have known each other...,23760020.0
2,-1Hm41N0dUs,18.01.05,Cast of Avengers: Infinity War Draws Their Cha...,Jimmy Kimmel Live,23,2018-04-27 07:30:02+00:00,"jimmy|""jimmy kimmel""|""jimmy kimmel live""|""late...",2058516,41248,580,1484,https://i.ytimg.com/vi/-1Hm41N0dUs/default.jpg,False,False,False,"Benedict Cumberbatch, Don Cheadle, Elizabeth O...",11262900.0
3,-1yT-K3c6YI,17.02.12,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,Molly Burke,22,2017-11-28 18:30:43+00:00,"youtube quiz|""youtuber quiz""|""truth or dare""|""...",231341,7734,212,846,https://i.ytimg.com/vi/-1yT-K3c6YI/default.jpg,False,False,False,Check out the video we did on the Merrell Twin...,274004.0
4,-2RVw2_QyxQ,17.16.11,2017 Champions Showdown: Day 3,Saint Louis Chess Club,27,2017-11-12 02:39:01+00:00,"Chess|""Saint Louis""|""Club""",71089,460,27,20,https://i.ytimg.com/vi/-2RVw2_QyxQ/default.jpg,False,False,False,The Saint Louis Chess Club hosts a series of f...,147718.0


In [10]:
# Compute views per subscriber
df["views_per_subscriber"] = df["views"] / (df["subscriber"] + 1)

# Basic stats
# df["views_per_subscriber"].describe(percentiles=[0.25, 0.5, 0.75, 0.9])

stats = df["views_per_subscriber"].describe(percentiles=[0.25, 0.5, 0.75, 0.9])
stats = stats.rename({
    "count": "Count",
    "mean": "Mean",
    "std": "Std Dev",
    "min": "Min",
    "25%": "25th Percentile",
    "50%": "Median (50th)",
    "75%": "75th Percentile",
    "90%": "90th Percentile",
    "max": "Max"
})

# Print with nice formatting
for name, val in stats.items():
    print(f"{name:20s}: {val:,.3f}")


Count               : 5,905.000
Mean                : 15,140.266
Std Dev             : 335,762.263
Min                 : 0.001
25th Percentile     : 0.107
Median (50th)       : 0.321
75th Percentile     : 1.124
90th Percentile     : 8.881
Max                 : 19,669,098.000


In [13]:
# Define threshold (top 25%)
q75 = df["views_per_subscriber"].quantile(0.75)
df["high_clickability"] = (df["views_per_subscriber"] >= q75).astype(int)

# Check balance of classes
print(df["high_clickability"].value_counts(normalize=True))

high_clickability
0    0.749873
1    0.250127
Name: proportion, dtype: float64


In [11]:
df[["title", "views", "subscriber", "views_per_subscriber", "high_clickability"]].head(10)


Unnamed: 0,title,views,subscriber,views_per_subscriber,high_clickability
0,Mindy Kaling's Daughter Had the Perfect Reacti...,800359,23760020.0,0.033685,0
1,Megan Mullally Didn't Notice the Interesting P...,563746,23760020.0,0.023727,0
2,Cast of Avengers: Infinity War Draws Their Cha...,2058516,11262900.0,0.18277,0
3,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,231341,274004.0,0.844295,0
4,2017 Champions Showdown: Day 3,71089,147718.0,0.481245,0
5,Benedict Cumberbatch's Tom Holland impression ...,2390558,4551034.0,0.525278,0
6,Ex-UFO program chief: We may not be alone,291653,3095131.0,0.09423,0
7,Top 10 Moments of the NBA All-Star Celebrity Game,1036300,8707071.0,0.119018,0
8,Kygo - Stranger Things ft. OneRepublic (Alan W...,2425578,12672160.0,0.19141,0
9,Christmas Day 2000,3170,1014.0,3.123153,1


In [None]:
processed_path = base / "data" / "processed"
out_path = processed_path / "youtube_with_target.parquet"
df.to_parquet(out_path, index=False)

print("✅ Saved dataset with target column to:", out_path)

✅ Saved dataset with target column to: c:\Users\johnr\OneDrive\Desktop\ML Project\YouTube_Clickability_Study\data\processed\youtube_with_target.parquet


In [14]:
# --- TEXT-BASED TITLE FEATURES ---

# Basic text cleaning (lowercasing, ensure string type)
df["title"] = df["title"].astype(str)

# 1️⃣ Title length (number of characters)
df["title_length"] = df["title"].apply(len)

# 2️⃣ Word count
df["word_count"] = df["title"].apply(lambda x: len(x.split()))

# 3️⃣ Ratio of capital letters to total length
df["caps_ratio"] = df["title"].apply(
    lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0
)

# 4️⃣ Has question mark
df["has_question"] = df["title"].apply(lambda x: int("?" in x))

# 5️⃣ Has exclamation mark
df["has_exclamation"] = df["title"].apply(lambda x: int("!" in x))

# 6️⃣ Has a number
df["has_number"] = df["title"].apply(lambda x: int(any(ch.isdigit() for ch in x)))

# 7️⃣ Average word length
df["avg_word_len"] = df["title"].apply(
    lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0
)

print("✅ Added title-based features.")
df[
    ["title", "title_length", "word_count", "caps_ratio",
     "has_question", "has_exclamation", "has_number", "avg_word_len"]
].head(10)


✅ Added title-based features.


Unnamed: 0,title,title_length,word_count,caps_ratio,has_question,has_exclamation,has_number,avg_word_len
0,Mindy Kaling's Daughter Had the Perfect Reacti...,74,11,0.121622,0,0,0,5.818182
1,Megan Mullally Didn't Notice the Interesting P...,75,10,0.106667,0,0,0,6.6
2,Cast of Avengers: Infinity War Draws Their Cha...,53,8,0.132075,0,0,0,5.75
3,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,51,10,0.764706,0,1,0,4.2
4,2017 Champions Showdown: Day 3,30,5,0.1,0,0,1,5.2
5,Benedict Cumberbatch's Tom Holland impression ...,57,7,0.192982,0,0,0,7.285714
6,Ex-UFO program chief: We may not be alone,41,8,0.121951,0,0,0,4.25
7,Top 10 Moments of the NBA All-Star Celebrity Game,49,9,0.183673,0,0,1,4.555556
8,Kygo - Stranger Things ft. OneRepublic (Alan W...,58,9,0.137931,0,0,0,5.555556
9,Christmas Day 2000,18,3,0.111111,0,0,1,5.333333


In [15]:
df[["title_length", "word_count", "caps_ratio", "avg_word_len"]].describe()


Unnamed: 0,title_length,word_count,caps_ratio,avg_word_len
count,5905.0,5905.0,5905.0,5905.0
mean,49.732261,8.699915,0.20553,4.947815
std,19.55868,3.488372,0.173468,1.081067
min,4.0,1.0,0.0,2.0
25%,36.0,6.0,0.125,4.272727
50%,48.0,8.0,0.153846,4.833333
75%,62.0,11.0,0.2,5.5
max,100.0,23.0,0.947368,36.0


In [16]:
# --- TF-IDF TITLE FEATURES ---

from sklearn.feature_extraction.text import TfidfVectorizer

# Configure vectorizer
tfidf = TfidfVectorizer(
    max_features=1000,        # top 1000 most frequent words
    stop_words='english',     # remove English stopwords
    ngram_range=(1, 2)        # include unigrams and bigrams
)

# Fit + transform
tfidf_matrix = tfidf.fit_transform(df["title"])

# Convert to DataFrame
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out()
)

print("✅ TF-IDF shape:", tfidf_df.shape)
tfidf_df.head()


✅ TF-IDF shape: (5905, 1000)


Unnamed: 0,000,10,10 plays,100,11,12,13,14,15,16,...,year,year old,years,yiay,york,young,youtube,youtubers,youtubers react,방탄소년단
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Combine TF-IDF with numeric features
# Merge handcrafted features + label (high_clickability) + TF-IDF into one big matrix.

# Select numeric/manual features
manual_features = [
    "title_length", "word_count", "caps_ratio",
    "has_question", "has_exclamation", "has_number",
    "avg_word_len", "views_per_subscriber"  # optional contextual numeric
]

X_manual = df[manual_features].reset_index(drop=True)
y = df["high_clickability"].reset_index(drop=True)

# Combine manual + TF-IDF
X = pd.concat([X_manual, tfidf_df], axis=1)
print("✅ Combined feature matrix shape:", X.shape)

✅ Combined feature matrix shape: (5905, 1008)


In [18]:
print("Manual feature count:", len(manual_features))
print("TF-IDF feature count:", tfidf_df.shape[1])
print("Total:", X.shape[1])


Manual feature count: 8
TF-IDF feature count: 1000
Total: 1008


In [20]:
from textblob import TextBlob

df["sentiment"] = df["title"].apply(lambda x: TextBlob(x).sentiment.polarity)
print("✅ Added sentiment feature.")
df[["title", "sentiment"]].head(10)

✅ Added sentiment feature.


Unnamed: 0,title,sentiment
0,Mindy Kaling's Daughter Had the Perfect Reacti...,1.0
1,Megan Mullally Didn't Notice the Interesting P...,0.5
2,Cast of Avengers: Infinity War Draws Their Cha...,0.0
3,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,0.0
4,2017 Champions Showdown: Day 3,0.0
5,Benedict Cumberbatch's Tom Holland impression ...,1.0
6,Ex-UFO program chief: We may not be alone,0.0
7,Top 10 Moments of the NBA All-Star Celebrity Game,0.05
8,Kygo - Stranger Things ft. OneRepublic (Alan W...,0.0
9,Christmas Day 2000,0.0


In [21]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create analyzer
analyzer = SentimentIntensityAnalyzer()

# Apply to each title
df["sentiment_vader"] = df["title"].apply(lambda x: analyzer.polarity_scores(str(x))["compound"])

print("✅ Added VADER sentiment feature.")
df[["title", "sentiment_vader"]].head(10)


✅ Added VADER sentiment feature.


Unnamed: 0,title,sentiment_vader
0,Mindy Kaling's Daughter Had the Perfect Reacti...,0.5719
1,Megan Mullally Didn't Notice the Interesting P...,-0.3089
2,Cast of Avengers: Infinity War Draws Their Cha...,-0.5994
3,YOUTUBER QUIZ + TRUTH OR DARE W/ THE MERRELL T...,0.5147
4,2017 Champions Showdown: Day 3,0.5267
5,Benedict Cumberbatch's Tom Holland impression ...,0.7456
6,Ex-UFO program chief: We may not be alone,0.1877
7,Top 10 Moments of the NBA All-Star Celebrity Game,0.2023
8,Kygo - Stranger Things ft. OneRepublic (Alan W...,0.0
9,Christmas Day 2000,0.0


In [23]:
df["publish_hour"] = df["publish_time"].dt.hour
df["publish_dayofweek"] = df["publish_time"].dt.dayofweek
df["is_weekend"] = df["publish_dayofweek"].isin([5, 6]).astype(int)


In [24]:
from pathlib import Path

# Define save directory
processed_path = base / "data" / "processed"
processed_path.mkdir(parents=True, exist_ok=True)

# Save features (X) and labels (y)
X.to_parquet(processed_path / "youtube_features_text.parquet", index=False)
y.to_frame("high_clickability").to_parquet(processed_path / "youtube_labels.parquet", index=False)

print("✅ Saved feature and label files to:")
print("  -", processed_path / "youtube_features_text.parquet")
print("  -", processed_path / "youtube_labels.parquet")


✅ Saved feature and label files to:
  - c:\Users\johnr\OneDrive\Desktop\ML Project\YouTube_Clickability_Study\data\processed\youtube_features_text.parquet
  - c:\Users\johnr\OneDrive\Desktop\ML Project\YouTube_Clickability_Study\data\processed\youtube_labels.parquet
