# Setup & Paths

In [None]:
# --- Setup ---
import pandas as pd
import numpy as np
from pathlib import Path

from _init_path import *

# --- Paths ---
# The project root is taken to be the parent of the current working directory,
# which assumes the notebook is run from a first-level folder such as /notebooks.
base = Path.cwd().parent
raw_path = base.joinpath("data", "raw")
processed_path = base.joinpath("data", "processed")

# Make sure the output folder exists; this is a no-op when it is already there.
processed_path.mkdir(parents=True, exist_ok=True)

# Load Raw Data

In [None]:
# --- Load raw data ---
# Two CSV files are read from raw_path: the main video table and a second
# table from which the subscriber column is taken later on.
main_csv = raw_path / "USvideos.csv"
subs_csv = raw_path / "USvideos_with_subscribers.csv"

df_raw = pd.read_csv(main_csv)
df_subs = pd.read_csv(subs_csv)

# Print (rows, columns) of each frame as a quick load check.
print("Raw main shape:", df_raw.shape)
print("Raw subs shape:", df_subs.shape)

# Basic Cleaning & Types

In [None]:
# --- Clean numeric types, timestamps, and drop incomplete rows ---
# Count columns are coerced to numbers and publish_time to datetimes; values
# that cannot be parsed become NaN/NaT instead of raising.
numeric_cols = ["views", "likes", "dislikes", "comment_count"]

# Rows missing any of these fields are discarded after coercion.
core_cols = ["video_id", "title", "channel_title", "views", "likes", "comment_count", "publish_time"]

df_raw = (
    df_raw
    .assign(**{name: pd.to_numeric(df_raw[name], errors="coerce") for name in numeric_cols})
    .assign(publish_time=pd.to_datetime(df_raw["publish_time"], errors="coerce"))
    .dropna(subset=core_cols)
)

print("After core cleaning:", df_raw.shape)

# Merge Subscribers

In [None]:
# --- Keep only needed cols from subscribers file ---
df_subs_small = df_subs[["video_id", "subscriber"]].copy()

# Coerce the subscriber counts the same way the other count columns are
# coerced, so later arithmetic (means, ratios) works on a numeric column.
df_subs_small["subscriber"] = pd.to_numeric(df_subs_small["subscriber"], errors="coerce")

# Reduce to ONE row per video_id before joining. Without this, any video_id
# that repeats in both tables turns the left join into a many-to-many match
# and multiplies rows. Rows without a usable subscriber value are dropped
# first so that a known value is preferred over a missing one.
df_subs_small = (
    df_subs_small
    .dropna(subset=["video_id", "subscriber"])
    .drop_duplicates(subset="video_id", keep="first")
)

# --- Merge on video_id ---
# validate="many_to_one" makes pandas raise MergeError if the right-hand keys
# are not unique, so the join can never silently change the row count of df_raw.
df = pd.merge(
    df_raw,
    df_subs_small,
    on="video_id",
    how="left",
    validate="many_to_one",
)

print("After merge shape:", df.shape)
print("Missing subscribers %:",
      round(df["subscriber"].isna().mean() * 100, 2))

# Rename column for clarity
df = df.rename(columns={"subscriber": "subscribers"})


# Deduplicate by Video & Impute Subscribers

In [None]:
# --- Deduplicate: keep row with max views per video_id ---
before = len(df)
# Index label of the highest-view row within each video_id group.
top_view_labels = df.groupby("video_id")["views"].idxmax()
df = df.loc[top_view_labels].copy()
after = len(df)

print("Deduped by video_id: {} -> {}".format(before, after))


# --- Impute subscribers with channel-level mean ---
def _fill_with_group_mean(values):
    """Fill missing entries of one group with the mean of its known entries."""
    return values.fillna(values.mean())


subs_by_channel = df.groupby("channel_title")["subscribers"]
df["subscribers"] = subs_by_channel.transform(_fill_with_group_mean)

# --- Drop rows where subscribers still missing ---
# These are channels with no known subscriber value at all, so the group
# mean itself was NaN and nothing could be filled in.
df = df.dropna(subset=["subscribers"])
print("After subscriber imputation/drop:", df.shape)

# Compute Views-per-Subscriber & Filter Outliers

In [None]:
# --- Compute views_per_subscriber ---
# The denominator is subscribers + 1, so a zero-subscriber row does not
# divide by zero.
subs_plus_one = df["subscribers"] + 1
df["views_per_subscriber"] = df["views"].div(subs_plus_one)

# --- Basic stats before filtering ---
print("VPS before filtering:")
vps_summary = df["views_per_subscriber"].describe(percentiles=[0.5, 0.75, 0.9, 0.99])
print(vps_summary)

# --- Filter: remove tiny channels and extreme VPS outliers ---
min_subscribers = 50          # lower bound on channel size; tweak if needed
max_vps = 1000                # upper bound on the views/subscriber ratio

has_enough_subs = df["subscribers"].ge(min_subscribers)
has_sane_ratio = df["views_per_subscriber"].le(max_vps)
mask = has_enough_subs & has_sane_ratio
df = df.loc[mask].copy()

print("After filters (subs >= {}, VPS <= {}):".format(min_subscribers, max_vps), df.shape)

# --- Log-transform for future regression target ---
# log1p(x) = log(1 + x), which is defined at x = 0.
df["views_per_subscriber_log"] = np.log1p(df["views_per_subscriber"])

print("Final VPS log stats:")
vps_log_summary = df["views_per_subscriber_log"].describe()
print(vps_log_summary)


# Save Clean Final Dataset

In [None]:
# Write the cleaned frame to the processed-data folder as Parquet, without
# the DataFrame index.
out_path = processed_path / "youtube_clean_final.parquet"
df.to_parquet(out_path, index=False)

# "\u2705" is the check-mark emoji. It is written as an escape so the message
# survives the file being saved or re-read with the wrong text encoding
# (the previous literal had been garbled into mojibake).
print("\u2705 Saved cleaned dataset to:", out_path)
print("Rows:", len(df), "Columns:", len(df.columns))

# Quick sanity check: read the file back and confirm nothing was lost.
df_check = pd.read_parquet(out_path)
assert df_check.shape == df.shape, "Parquet round-trip changed the dataset shape"
print("Loaded back:", df_check.shape)
df_check.head()