Now let's predict the views and likes based on the title:

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

df = pd.read_csv("../data/titles_dataset.csv")

# Keep only rows that have both a title and a view count.
# - The view filter is required, not optional: Ridge cannot be fit on NaN targets.
# - The title filter is required too: TfidfVectorizer raises on NaN documents.
df = df.dropna(subset=["Title", "Views"])

# Features (raw title text) and regression target (view count)
X = df["Title"]
y = df["Views"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode titles as TF-IDF vectors, capped at 300 vocabulary terms.
# The vocabulary and IDF weights are learned from the training titles only;
# the test titles are merely projected onto them.
vectorizer = TfidfVectorizer(max_features=300)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# L2-regularised linear regression on the TF-IDF features (fit returns the estimator itself)
model = Ridge(alpha=1.0).fit(X_train_vec, y_train)

# Predicted view counts for the held-out titles
y_pred = model.predict(X_test_vec)

# Evaluate on the held-out titles. RMSE is expressed in the same unit as the
# target (views); an R^2 below zero means the model does worse than always
# predicting the mean of y_test.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Side-by-side predictions vs actual values. Predictions are rounded to the
# nearest whole view (a bare astype(int) truncates toward zero) and stored as
# int64 so the integer width does not depend on the platform default.
results_df = pd.DataFrame({
    "Title": X_test,
    "Actual Views": y_test,
    "Predicted Views": np.rint(y_pred).astype(np.int64)
})
print(results_df.head())


RMSE: 4903481.63
R^2 Score: -2.10
                                               Title  Actual Views  \
8  A Reminder To Treat Yourself Better | Jordan P...         77516   
1  Jordan Peterson - 7 Harsh Truths To Take Contr...       5647156   

   Predicted Views  
8          6730193  
1          7604233  
