In [1]:
import pandas as pd
from pathlib import Path

# Resolve the processed-data folder relative to the parent of the current
# working directory, so the notebook does not hardcode an absolute path.
base = Path.cwd().parent
processed_path = base.joinpath("data", "processed")

# Read the feature matrix, then pull the target column out of the labels file.
X = pd.read_parquet(processed_path / "youtube_features_text.parquet")
labels_frame = pd.read_parquet(processed_path / "youtube_labels.parquet")
y = labels_frame["high_clickability"]

# Quick sanity report: feature dimensions and the relative class frequencies.
print("✅ Loaded data")
print("X shape:", X.shape)
class_share = y.value_counts(normalize=True)
print("y distribution:\n", class_share)


✅ Loaded data
X shape: (5905, 1008)
y distribution:
 high_clickability
0    0.749873
1    0.250127
Name: proportion, dtype: float64


In [None]:
# Read youtube_engagement_clean.parquet from the processed-data folder for
# ad-hoc inspection; the frame is kept under the name the next cell reads.
engagement_file = processed_path / "youtube_engagement_clean.parquet"
stuff = pd.read_parquet(engagement_file)

In [None]:
# Show one record, selected by integer position (iloc), not by index label.
# NOTE(review): 3416 is an ad-hoc position with no stated meaning — confirm why
# this particular row is of interest.
inspected_row = stuff.iloc[3416]
print(inspected_row)

video_id                                                        _JY-latu82Q
trending_date                                                      18.05.01
title                     Ryan Seacrest dishes on New Year's Eve, 'Ameri...
channel_title                                          Good Morning America
category_id                                                              24
publish_time                                      2017-12-29 15:02:02+00:00
tags                      Ryan|"Seacrest"|"Dick"|"Clark's"|"New"|"Year's...
views                                                                  7662
likes                                                                    52
dislikes                                                                  4
comment_count                                                             3
thumbnail_link               https://i.ytimg.com/vi/_JY-latu82Q/default.jpg
comments_disabled                                                     False
ratings_disa

: 

In [2]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# partition repeatable.
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the resulting partition sizes.
print("✅ Split complete")
for label, frame in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(label, frame.shape)

✅ Split complete
Train shape: (4724, 1008)
Test shape: (1181, 1008)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay

# Build and fit the baseline classifier in one step (fit returns the estimator).
# NOTE(review): X is used exactly as loaded here; near-perfect scores from this
# fit should be read as a possible leakage warning, not as real performance.
log_reg = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train, y_train)

# Hard class predictions, plus the probability of the positive class
# (second column of predict_proba), which ROC-AUC needs.
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)[:, 1]

# Label-based metrics are computed from y_pred; ROC-AUC from y_prob.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_prob)

# Print every score rounded to three decimals.
print("✅ Logistic Regression Evaluation:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("ROC-AUC:", roc_auc),
):
    print(label, round(value, 3))


✅ Logistic Regression Evaluation:
Accuracy: 0.993
Precision: 1.0
Recall: 0.973
F1-score: 0.986
ROC-AUC: 1.0


In [4]:
# Retrain the baseline after removing columns treated as target leakage.
# Imports are grouped at the top of the cell instead of appearing mid-cell.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Columns treated as leakage: engagement counts/ratios and the label itself.
leak_cols = ["views", "likes", "dislikes", "comment_count",
             "like_ratio", "comment_ratio", "views_per_subscriber", "high_clickability"]

# Make the drop explicit instead of silently skipping absent names: in the
# recorded run the feature count only went from 1008 to 1007, so most of the
# listed names were not in X.
# NOTE(review): confirm no renamed or derived variants of these columns remain
# in X under different names.
present_leaks = [col for col in leak_cols if col in X.columns]
missing_leaks = [col for col in leak_cols if col not in X.columns]
print("Leakage columns removed:", present_leaks)
print("Leakage columns not found in X:", missing_leaks)

# Keep only safe columns (every name in present_leaks exists, so drop cannot fail)
X_safe = X.drop(columns=present_leaks)

print("Safe feature shape:", X_safe.shape)

# Redo train/test split with the same size, seed and stratification as before
X_train, X_test, y_train, y_test = train_test_split(
    X_safe, y, test_size=0.2, random_state=42, stratify=y
)

# Retrain Logistic Regression on non-leaky data
log_reg = LogisticRegression(max_iter=1000, n_jobs=-1)
log_reg.fit(X_train, y_train)

# Class predictions for the label-based metrics; positive-class probabilities
# (second column of predict_proba) for ROC-AUC.
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)[:, 1]

print("✅ Logistic Regression (No Leakage) Evaluation:")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Precision:", round(precision_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("F1-score:", round(f1_score(y_test, y_pred), 3))
print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 3))


Safe feature shape: (5905, 1007)
✅ Logistic Regression (No Leakage) Evaluation:
Accuracy: 0.782
Precision: 0.761
Recall: 0.183
F1-score: 0.295
ROC-AUC: 0.723
