In [1]:
import pandas as pd

df = pd.read_csv("dataset/USvideos.csv")
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


In [2]:
# Filtering out videos that have ratings disabled or have no likes
df = df[df["ratings_disabled"] == False]
df = df[df["likes"] > 0]

In [3]:
import string
import json

from sklearn.model_selection import train_test_split

with open("dataset/US_category_id.json") as f:
    data = json.load(f)
    categories = {int(item["id"]): item["snippet"]["title"] for item in data["items"]}


def remove_punc_and_lower(words: str):
    cleaned = "".join(c for c in words.lower() if c not in string.punctuation)
    return cleaned


df["category"] = df["category_id"].map(categories)

df["title"] = df["title"].apply(remove_punc_and_lower)

df["tags"] = df["tags"].apply(lambda x: " ".join(x.split("|")))
df["tags"] = df["tags"].apply(lambda x: "".join(x.split('"')))
df["tags"] = df["tags"].apply(remove_punc_and_lower)

df["description"] = df["description"].fillna("")
df["description"] = df["description"].apply(remove_punc_and_lower)

df["publish_time"] = pd.to_datetime(df["publish_time"]).dt.tz_convert("US/Pacific")

df["day_of_week"] = df["publish_time"].dt.day_name()

df["dislikes"] = df.apply(
    lambda row: row["dislikes"] if row["dislikes"] > 0 else 1, axis=1
)
df["likes_dislikes_ratio"] = df["likes"] / df["dislikes"]

feature_cols = ["title", "tags", "description", "category", "day_of_week"]
X = df[feature_cols]
y = df["likes_dislikes_ratio"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
X_train.head()

Unnamed: 0,title,tags,description,category,day_of_week
5180,diy pizza pouch wearable pizza,pizza pouch the pizza pouch the pizza bag pizz...,pizza pouches might be the most practical thin...,Entertainment,Sunday
19960,trace cyrus brenda official lyric video,trace cyrus brenda song metro station shake it...,happy valentines day 💔💕🖤💘❤️,People & Blogs,Wednesday
18736,why i left nickelodeon,nickelodeon nick butch hartman why i left buzz...,after 20 years of creating shows like fairly o...,Film & Animation,Thursday
1516,sia ho ho ho,sia ho ho ho holiday,“ho ho ho from everyday is christmas out ever...,Music,Thursday
1119,ranz and niana goes to la carpool around,ranz ranz kyle niana niana guerrero ranz niana...,so we were invited to watch the premiere and a...,People & Blogs,Thursday


In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF Vectorization
tfidf_title = TfidfVectorizer(
    max_features=500, stop_words="english", ngram_range=(1, 3)
)
tfidf_tags = TfidfVectorizer(max_features=300, stop_words="english", ngram_range=(1, 3))
tfidf_description = TfidfVectorizer(
    max_features=700, stop_words="english", ngram_range=(1, 3)
)

X_train_title = tfidf_title.fit_transform(X_train["title"]).toarray()
X_train_tags = tfidf_tags.fit_transform(X_train["tags"]).toarray()
X_train_description = tfidf_description.fit_transform(X_train["description"]).toarray()

X_test_title = tfidf_title.transform(X_test["title"]).toarray()
X_test_tags = tfidf_tags.transform(X_test["tags"]).toarray()
X_test_description = tfidf_description.transform(X_test["description"]).toarray()

# Categorical Features
category_encoder = OneHotEncoder(sparse_output=False)
category_train = category_encoder.fit_transform(X_train[["category"]])
category_test = category_encoder.transform(X_test[["category"]])

# Temporal Features
day_encoder = OneHotEncoder(sparse_output=False)
day_of_week_train = day_encoder.fit_transform(X_train[["day_of_week"]])
day_of_week_test = day_encoder.transform(X_test[["day_of_week"]])


X_train_combined = np.hstack(
    [
        X_train_title,
        X_train_tags,
        X_train_description,
        category_train,
        day_of_week_train,
    ]
)
X_test_combined = np.hstack(
    [
        X_test_title,
        X_test_tags,
        X_test_description,
        category_test,
        day_of_week_test,
    ]
)

In [6]:
import nltk
import gensim.downloader as api
from nltk.tokenize import word_tokenize

nltk.download("punkt_tab")

word2vec_model = api.load("word2vec-google-news-300")


def text_to_avg_w2v(text):
    tokens = word_tokenize(text)
    return get_average_word2vec(tokens, word2vec_model)


def get_average_word2vec(tokens, model, vector_size=300):
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)


X_train_title_w2v = np.array([text_to_avg_w2v(text) for text in X_train["title"]])
X_train_tags_w2v = np.array([text_to_avg_w2v(text) for text in X_train["tags"]])
X_train_description_w2v = np.array(
    [text_to_avg_w2v(text) for text in X_train["description"]]
)
X_train_w2v = np.hstack(
    [
        X_train_title_w2v,
        X_train_tags_w2v,
        X_train_description_w2v,
    ]
)

X_test_title_w2v = np.array([text_to_avg_w2v(text) for text in X_test["title"]])
X_test_tags_w2v = np.array([text_to_avg_w2v(text) for text in X_test["tags"]])
X_test_description_w2v = np.array(
    [text_to_avg_w2v(text) for text in X_test["description"]]
)
X_test_w2v = np.hstack(
    [
        X_test_title_w2v,
        X_test_tags_w2v,
        X_test_description_w2v,
    ]
)

X_train_combined = np.hstack(
    [
        X_train_combined,
        X_train_w2v,
    ]
)
X_test_combined = np.hstack(
    [
        X_test_combined,
        X_test_w2v,
    ]
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aaronang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_combined = scaler.fit_transform(X_train_combined)
X_test_combined = scaler.transform(X_test_combined)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

lr = LinearRegression()
lr.fit(X_train_combined, y_train)
predictions = lr.predict(X_test_combined)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

MSE: 1438.1497, MAE: 22.9766, R2: 0.5401


In [9]:
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

gbr = HistGradientBoostingRegressor(random_state=42)
gbr.fit(X_train_combined, y_train)
predictions = gbr.predict(X_test_combined)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

MSE: 672.9047, MAE: 14.0239, R2: 0.7848


In [10]:
from xgboost import XGBRegressor

bst = XGBRegressor(random_state=42)
bst.fit(X_train_combined, y_train)
predictions = bst.predict(X_test_combined)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

MSE: 353.2444, MAE: 10.4837, R2: 0.8870


In [11]:
rfr = RandomForestRegressor(random_state=42, n_jobs=-1)
rfr.fit(X_train_combined, y_train)
predictions = rfr.predict(X_test_combined)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

MSE: 227.2140, MAE: 5.2698, R2: 0.9273
