In [None]:
!pip install gdown
!pip install isodate
# Use the file ID to download
!gdown --id 1np_zI9ll5MX73YBIpYwTHCGYCBl5MqNV


In [None]:
import zipfile
import os
import numpy as np

# Locations of the downloaded archive and of the folder it is unpacked into.
zip_path = "/content/dataset.zip"
extract_path = "/content/dataset"

# Ensure the destination exists, then unpack the whole archive into it.
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path=extract_path)

print("Extracted to:", extract_path)

In [None]:
# pandas is only imported in a later cell, so import it here as well; otherwise this
# cell raises NameError on a fresh kernel (Restart & Run All).
import pandas as pd

# Load the raw video metadata; 'publishedAt' is parsed as datetimes at read time.
videos = pd.read_csv("/content/dataset/videos.csv", parse_dates=["publishedAt"])

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from kneed import KneeLocator
import re
import numpy as np
from tqdm import tqdm

def cluster_videos_by_title(videos):
    """
    Clusters video titles using embeddings + KMeans (k chosen via the elbow method).

    The input frame is copied, so the caller's DataFrame is left untouched.

    Args:
        videos (pd.DataFrame): DataFrame containing at least a 'title' column.

    Returns:
        videos (pd.DataFrame): Copy of the input with new 'clean_title' and
            'cluster_label' columns.
        videos_cluster_summary (pd.DataFrame): One row per cluster label with
            'num_videos' and 'video_list' (the titles in that cluster).
    """
    # Work on a copy so repeated runs of the calling cell start from the same input.
    videos = videos.copy()

    # --- 1. Preprocess titles (strip '#' characters and surrounding whitespace) ---
    videos["clean_title"] = (
        videos["title"].astype(str).str.replace("#", "", regex=False).str.strip()
    )

    n_titles = len(videos)
    if n_titles < 3:
        # Too few rows to search for an elbow or fit k >= 2 sensibly:
        # keep everything in a single cluster instead of crashing in KMeans.
        videos["cluster_label"] = 0
    else:
        # --- 2. Embed titles ---
        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(
            videos["clean_title"].tolist(), show_progress_bar=True, batch_size=64
        )

        # --- 3. Standardize embeddings before clustering ---
        embeddings_scaled = StandardScaler().fit_transform(embeddings)

        # --- 4. Find k using the elbow method ---
        # Candidate k values are 2..14, capped so k never exceeds the sample count.
        K_range = range(2, min(15, n_titles))
        best_k = None
        if len(K_range) >= 3:
            inertias = []
            for k in tqdm(K_range, desc="Finding optimal k"):
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                kmeans.fit(embeddings_scaled)
                inertias.append(kmeans.inertia_)

            # KneeLocator reports the "elbow" of the inertia curve, or None if it finds none.
            kl = KneeLocator(list(K_range), inertias, curve="convex", direction="decreasing")
            best_k = kl.knee

        if not best_k:
            # No elbow detected (or too few candidates): fall back to 5 clusters,
            # capped at the number of samples, rather than passing None to KMeans.
            best_k = min(5, n_titles)
        best_k = int(best_k)

        print(f"Optimal number of clusters (k): {best_k}")

        # --- 5. Final clustering with best_k ---
        final_kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
        videos["cluster_label"] = final_kmeans.fit_predict(embeddings_scaled)

    # --- 6. Create summary DataFrame ---
    videos_cluster_summary = (
        videos.groupby("cluster_label")
        .agg(
            num_videos=("title", "count"),
            video_list=("title", lambda x: list(x))
        )
        .reset_index()
    )

    return videos, videos_cluster_summary

# Run the title clustering: `videos` is rebound to the frame carrying 'clean_title'
# and 'cluster_label', and `videos_cluster_summary` holds one row per cluster label.
videos, videos_cluster_summary = cluster_videos_by_title(videos)

# Bare last expression so the notebook renders the labelled frame.
videos


In [None]:
# --- 1. Define the mapping from cluster label to cluster name ---
# Based on a manual reading of the video titles in each cluster.
# NOTE: this mapping covers labels 0-4 only. The number of clusters is chosen
# from the data in cluster_videos_by_title, so any label outside 0-4 maps to NaN.
cluster_name_map = {
    0: 'Educational Skincare & Wellness',
    1: 'Viral, Entertainment & Pop Culture',
    2: 'Hair Styling & Transformations',
    3: 'General Beauty for the Shorts Feed',
    4: 'Makeup Tutorials & Challenges'
}

# --- 2. Create the new 'cluster_name' column ---
# .map() looks up each 'cluster_label' value in the dictionary; labels that are
# not keys of the dictionary become NaN.
videos['cluster_name'] = videos['cluster_label'].map(cluster_name_map)


# --- 3. (Optional) Verify the result ---
# Show the first few rows with the new column.
print("DataFrame with the new 'cluster_name' column:")
print(videos[['title', 'cluster_label', 'cluster_name']].head())

print("\n--------------------------------------------------\n")

# Show the counts per cluster name (this cell previously dumped the whole frame here).
# dropna=False keeps a NaN row visible when some labels were not covered by the mapping.
print("Value counts for the new cluster names:")
videos['cluster_name'].value_counts(dropna=False)

In [None]:
# Drop the off-topic cluster and any row without a cluster name, then renumber
# the index so the labels used below are unique.
videos = videos[videos["cluster_name"] != "Viral, Entertainment & Pop Culture"]
videos = videos.dropna(subset=["cluster_name"]).reset_index(drop=True)

# De-duplicate titles within the skincare cluster only, keeping the first occurrence.
# The duplicates are identified explicitly and dropped. The previous approach assigned
# a shorter frame back through .loc, which relied on index alignment to blank the
# duplicates into all-NaN rows (upcasting column dtypes along the way) before a
# later dropna removed them.
mask = videos["cluster_name"] == "Educational Skincare & Wellness"
duplicate_titles = videos[mask].duplicated(subset=["title"], keep="first")
videos = videos.drop(index=duplicate_titles.index[duplicate_titles]).reset_index(drop=True)

In [None]:
# Sanity check: list the distinct cluster names that remain after filtering.
videos["cluster_name"].unique()

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from kneed import KneeLocator
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def add_subclusters(videos: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a 'subcluster' column to a copy of the videos dataframe.

    For every non-null 'cluster_name', the titles of that cluster are embedded
    with a cluster-specific sentence-transformer model and split with KMeans,
    where k is picked with the elbow method (fallback: 3). Each subcluster is
    named '{cluster_name}_{subcluster_num}'.

    Args:
        videos: DataFrame with at least 'cluster_name' and 'title' columns.
            The index must be unique, because labels are written back by index.

    Returns:
        A copy of `videos` with the 'subcluster' column filled for every row
        that has a cluster name (rows without one keep None).
    """

    # Make a copy so we don't modify in place
    videos = videos.copy()
    videos["subcluster"] = None

    # Mapping: cluster_name -> embedding model used for that cluster
    model_map = {
        "Hair Styling & Transformations": "sentence-transformers/all-MiniLM-L6-v2",
        "Makeup Tutorials & Challenges": "BAAI/bge-base-en-v1.5",
        "Educational Skincare & Wellness": "intfloat/e5-base-v2",
        "General Beauty for the Shorts Feed": "intfloat/e5-base-v2",
    }
    default_model = "sentence-transformers/all-MiniLM-L6-v2"

    # Several clusters share a model; load each one only once.
    loaded_models = {}

    for cluster_name in videos["cluster_name"].dropna().unique():
        cluster_data = videos[videos["cluster_name"] == cluster_name]
        n_rows = len(cluster_data)

        if n_rows < 3:
            # Not enough rows for a k >= 2 search: keep the cluster as a single subcluster
            # instead of failing on an empty candidate range.
            videos.loc[cluster_data.index, "subcluster"] = f"{cluster_name}_0"
            print(f"📊 Cluster '{cluster_name}' has only {n_rows} videos; kept as a single subcluster")
            continue

        # Pick the right model
        model_name = model_map.get(cluster_name, default_model)
        if model_name not in loaded_models:
            loaded_models[model_name] = SentenceTransformer(model_name)
        model = loaded_models[model_name]

        # Embed
        embeddings = model.encode(cluster_data["title"].tolist(), show_progress_bar=True)

        # --- Elbow method to find optimal k ---
        # Candidates are 2..14, capped below the number of rows in the cluster.
        K = range(2, min(15, n_rows))
        knee = None
        if len(K) >= 3:
            inertia = []
            for k in K:
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                kmeans.fit(embeddings)
                inertia.append(kmeans.inertia_)
            knee = KneeLocator(list(K), inertia, curve="convex", direction="decreasing").knee

        # Fall back to 3 subclusters when no elbow is found (n_rows >= 3 is guaranteed here).
        optimal_k = int(knee) if knee else 3

        # --- Final KMeans ---
        final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        cluster_labels = final_kmeans.fit_predict(embeddings)

        # Assign back into main dataframe
        subcluster_labels = [f"{cluster_name}_{i}" for i in cluster_labels]
        videos.loc[cluster_data.index, "subcluster"] = subcluster_labels

        print(f"📊 Cluster '{cluster_name}' split into {optimal_k} subclusters")

    return videos

# Rebind `videos` to the copy returned by add_subclusters, which carries the new
# 'subcluster' column, and display it.
videos = add_subclusters(videos)
videos

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm  # <-- add tqdm

def reassign_general_beauty(videos: pd.DataFrame) -> pd.DataFrame:
    """
    Reassign misclassified 'General Beauty for the Shorts Feed' subclusters
    into correct parent clusters and subclusters.

    For each listed subcluster, every video in it is moved to the target parent
    cluster and given the subcluster of its most similar title (cosine similarity
    of title embeddings) among the videos already in that parent cluster.

    Args:
        videos: DataFrame with 'title', 'cluster_name' and 'subcluster' columns
            and a unique index (rows are updated by index label).

    Returns:
        A copy of `videos` with 'cluster_name' and 'subcluster' updated for the
        reassigned rows; the input frame is not modified.
    """
    # Work on a copy, consistent with add_subclusters, so the caller's frame is untouched.
    videos = videos.copy()

    model = SentenceTransformer("intfloat/e5-base-v2")

    # mapping of misclustered subclusters -> target parent cluster
    mapping = {
        "General Beauty for the Shorts Feed_1": "Makeup Tutorials & Challenges",
        "General Beauty for the Shorts Feed_4": "Makeup Tutorials & Challenges",
        "General Beauty for the Shorts Feed_2": "Hair Styling & Transformations",
        "General Beauty for the Shorts Feed_5": "Educational Skincare & Wellness"
    }

    # loop over each mapping
    for bad_sub, correct_cluster in mapping.items():
        target_rows = videos[videos["subcluster"] == bad_sub]
        if target_rows.empty:
            continue

        # Reference rows from the correct cluster; rows without a subcluster cannot
        # donate a label, so they are excluded.
        ref_rows = videos[
            (videos["cluster_name"] == correct_cluster) & videos["subcluster"].notna()
        ]
        if ref_rows.empty:
            # Nothing to compare against: leave these rows as they are rather than
            # failing on an argmax over an empty similarity vector.
            print(f"⚠️ No reference videos in '{correct_cluster}'; leaving {bad_sub} unchanged")
            continue

        # embed titles
        ref_embeddings = model.encode(ref_rows["title"].tolist(), convert_to_tensor=True)
        target_embeddings = model.encode(target_rows["title"].tolist(), convert_to_tensor=True)

        # Positional lookup table for the reference subclusters (hoisted out of the loop).
        ref_subclusters = ref_rows["subcluster"].to_numpy()

        # assign based on cosine similarity (with tqdm progress bar)
        for i, idx in enumerate(tqdm(target_rows.index,
                                     total=len(target_rows),
                                     desc=f"Reassigning {bad_sub} → {correct_cluster}")):
            sims = util.cos_sim(target_embeddings[i], ref_embeddings)[0].cpu().numpy()
            best_match_idx = int(np.argmax(sims))

            # update dataframe
            videos.at[idx, "subcluster"] = ref_subclusters[best_match_idx]
            videos.at[idx, "cluster_name"] = correct_cluster

    return videos

# Apply the reassignment of the selected 'General Beauty' subclusters and display the result.
videos = reassign_general_beauty(videos)
videos

In [None]:
# Subclusters kept for the forecasting stage. The numeric suffixes come from the
# KMeans labels assigned in add_subclusters.
good_subclusters = [
    # Hair
    "Hair Styling & Transformations_0",
    "Hair Styling & Transformations_1",
    "Hair Styling & Transformations_3",
    "Hair Styling & Transformations_4",
    "Hair Styling & Transformations_6",
    # Makeup
    "Makeup Tutorials & Challenges_1",
    "Makeup Tutorials & Challenges_2",
    "Makeup Tutorials & Challenges_3",
    "Makeup Tutorials & Challenges_4",
    # Skincare
    "Educational Skincare & Wellness_0",
    "Educational Skincare & Wellness_1",
    "Educational Skincare & Wellness_2",
]

# Make sure the publish date is a datetime, then keep only videos published before
# the cutoff date that belong to one of the whitelisted subclusters.
videos["publishedAt"] = pd.to_datetime(videos["publishedAt"])
videos = (
    videos
    .loc[lambda frame: frame["publishedAt"] < "2025-07-01"]
    .loc[lambda frame: frame["subcluster"].isin(good_subclusters)]
    .reset_index(drop=True)
)

In [None]:
def top_videos_by_cluster(videos: pd.DataFrame, cluster_name: str, top_n: int = 100):
    """
    For a given cluster_name, returns the top N videos (by view_velocity)
    for each subcluster inside that cluster.

    Args:
        videos: DataFrame with 'cluster_name', 'subcluster', 'videoId',
            'clean_title' and 'view_velocity' columns.
        cluster_name: Parent cluster to report on.
        top_n: Maximum number of rows returned per subcluster.

    Returns:
        dict mapping each subcluster name (in sorted order) to a DataFrame with
        the columns 'videoId', 'clean_title', 'view_velocity' and 'subcluster',
        sorted by 'view_velocity' descending. Returns an empty dict when the
        cluster has no rows. Rows whose subcluster is missing are ignored.
    """
    # Filter videos for this cluster only
    cluster_data = videos[videos["cluster_name"] == cluster_name]

    if cluster_data.empty:
        print(f"⚠️ No videos found for cluster '{cluster_name}'")
        return {}

    output_columns = ["videoId", "clean_title", "view_velocity", "subcluster"]
    results = {}

    # groupby iterates the groups in sorted key order and skips missing keys, so a
    # None/NaN subcluster no longer breaks the sort of mixed str/None values.
    for sub, sub_data in cluster_data.groupby("subcluster", sort=True):
        # Sort by view_velocity (descending)
        top_videos = (
            sub_data.sort_values("view_velocity", ascending=False)
            .head(top_n)[output_columns]
        )

        print(f"\n🔥 Top {top_n} Videos in {sub}:")
        print(top_videos.to_string(index=False))

        results[sub] = top_videos

    return results


# The bare string below looks like a pasted copy of an earlier
# `videos["cluster_name"].unique()` output; as a string expression it has no effect.
'''
array(['Educational Skincare & Wellness', 'Makeup Tutorials & Challenges',
       'General Beauty for the Shorts Feed', nan,
       'Hair Styling & Transformations'], dtype=object)
'''

# Requires the 'subcluster' column, i.e. add_subclusters(videos) must have been run.
# Prints up to 100 rows per subcluster and keeps the per-subcluster frames in `top_videos`.
top_videos = top_videos_by_cluster(videos, "Educational Skincare & Wellness", top_n=100)



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import itertools

import warnings
# Suppress every UserWarning and RuntimeWarning from here on (process-wide, not just
# for this cell) - presumably to quiet the model-fitting output in the grid search below.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

def calculate_mape(y_true, y_pred):
    """
    Mean absolute percentage error (in percent) over the non-zero actuals.

    Positions where the actual value is zero are left out of the average.
    If every actual is zero, the result is 0.0 when all predictions are zero
    as well, and infinity otherwise.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    usable = actual != 0
    if np.any(usable):
        kept_actual = actual[usable]
        relative_error = np.abs((kept_actual - predicted[usable]) / kept_actual)
        return np.mean(relative_error) * 100

    # No non-zero actuals: only an all-zero forecast counts as a perfect score.
    if np.all(predicted == 0):
        return 0.0
    return np.inf

def handle_anomalies(series, trend, seasonal, seasonal_periods, sigma=3.0):
    """
    Clip outliers in a time series to a Holt-Winters confidence band.

    An exponential-smoothing model with the given trend / seasonal settings is
    fitted to `series`. The band is the fitted values plus or minus `sigma`
    times the standard deviation of the residuals. Observations outside the
    band are replaced by the band edge they crossed.

    Returns the corrected series as floats; the input series is not modified.
    """
    # Baseline: in-sample fitted values of the smoothing model.
    baseline = ExponentialSmoothing(
        series,
        trend=trend,
        seasonal=seasonal,
        seasonal_periods=seasonal_periods,
    ).fit(optimized=True).fittedvalues

    # Half-width of the band, derived from the residual spread.
    band_half_width = sigma * np.std(series - baseline)
    upper_bound = baseline + band_half_width
    lower_bound = baseline - band_half_width

    # Flag points that fall outside the band on either side.
    above = series > upper_bound
    below = series < lower_bound

    # Replace flagged points with the corresponding boundary value.
    corrected_series = series.copy().astype(float)
    corrected_series.loc[above] = upper_bound.loc[above]
    corrected_series.loc[below] = lower_bound.loc[below]
    return corrected_series


def holt_winters_rollingcv(videos, subcluster, min_train="2020-01", seasonal_range=(2, 31), forecast_horizon=6, anomaly_sigma=3.0):
    """
    Grid-search Holt-Winters settings for one subcluster using rolling cross-validation.

    The monthly number of published videos in `subcluster` is forecast
    `forecast_horizon` months ahead from an expanding training window, and each
    parameter combination is scored by its average MAPE over all windows.

    Anomaly handling:
      - Before fitting in each fold, the training window is passed through
        handle_anomalies, using the same trend / seasonal / seasonal_periods
        that are being evaluated.
      - Forecasts are scored against the original (uncorrected) test values.

    Args:
        videos: DataFrame with 'subcluster', 'publishedAt' and 'videoId' columns.
        subcluster: Value of the 'subcluster' column to model.
        min_train: Start of the series; earlier months are dropped (label slice on the
            monthly index).
        seasonal_range: Inclusive (low, high) range of seasonal_periods to try.
        forecast_horizon: Number of months forecast in each fold (default=6).
        anomaly_sigma: Band width, in residual standard deviations, passed to
            handle_anomalies (default=3.0).

    Searched grid (not function arguments):
      - trend: ['add', 'mul']
      - seasonal: ['add', 'mul']
      - seasonal_periods: every integer in seasonal_range

    Returns:
        DataFrame with one row per parameter combination that produced at least one
        score, with columns 'subcluster', 'trend', 'seasonal', 'seasonal_periods',
        'avg_mape'. Returns None when the subcluster has no monthly data, when fewer
        than 2 * seasonal_range[1] months are available, or when no combination
        could be scored.
    """

    # --- 1. Data Preparation ---
    subcluster_data = videos[videos["subcluster"] == subcluster].copy()
    subcluster_data["publishedAt"] = pd.to_datetime(subcluster_data["publishedAt"])

    # Count videos per calendar month (month-end bins).
    # NOTE(review): newer pandas releases deprecate the "M" alias in favour of "ME" -
    # confirm against the installed version.
    monthly_counts = (
        subcluster_data
        .groupby(pd.Grouper(key="publishedAt", freq="M"))["videoId"]
        .count()
        .reset_index()
        .rename(columns={"videoId": "video_count"})
    )

    if monthly_counts.empty:
        return None

    # Regular monthly index, with any missing month filled as 0 videos, cut at min_train.
    monthly_counts = monthly_counts.set_index("publishedAt").asfreq("M", fill_value=0)
    monthly_counts = monthly_counts.loc[min_train:]

    # Require two full cycles of the longest seasonal period in the grid.
    if len(monthly_counts) < seasonal_range[1] * 2:
        print(f"Not enough data for subcluster {subcluster}. Skipping.")
        return None

    # --- 2. Parameter grid ---
    param_grid = {
        'trend': ['add', 'mul'],
        'seasonal': ['add', 'mul'],
        'seasonal_periods': list(range(seasonal_range[0], seasonal_range[1] + 1))
    }
    # Cartesian product of the grid, as a list of {'trend', 'seasonal', 'seasonal_periods'} dicts.
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

    results = []

    # --- 3. Rolling CV loop ---
    for params in all_params:
        mape_scores = []

        # Skip combinations whose seasonal period needs more history than is available.
        if len(monthly_counts) < params['seasonal_periods'] * 2:
            continue

        # Expanding window: the first fold trains on two seasonal cycles, and the last
        # fold still leaves a full forecast_horizon of test months.
        for t in range(params['seasonal_periods'] * 2, len(monthly_counts) - forecast_horizon + 1):
            train = monthly_counts.iloc[:t]
            test = monthly_counts.iloc[t:t+forecast_horizon]

            try:
                # --- ANOMALY HANDLING STEP ---
                # Clean the training data before fitting the forecasting model
                cleaned_train_series = handle_anomalies(
                    train["video_count"],
                    trend=params['trend'],
                    seasonal=params['seasonal'],
                    seasonal_periods=params['seasonal_periods'],
                    sigma=anomaly_sigma
                )

                # Fit the model on the CLEANED data
                model = ExponentialSmoothing(
                    cleaned_train_series, # Use the cleaned series here
                    trend=params['trend'],
                    seasonal=params['seasonal'],
                    seasonal_periods=params['seasonal_periods']
                ).fit(optimized=True)

                # Forecast and evaluate against the original, untouched test data
                forecast = model.forecast(forecast_horizon)
                mape = calculate_mape(test["video_count"], forecast)
                mape_scores.append(mape)

            except Exception:
                # Best effort: any failure in this fold is silently skipped, so the fold
                # simply contributes no score to the average.
                continue

        # Only combinations with at least one scored fold are reported.
        if mape_scores:
            avg_mape = np.mean(mape_scores)
            results.append({
                "subcluster": subcluster,
                "trend": params['trend'],
                "seasonal": params['seasonal'],
                "seasonal_periods": params['seasonal_periods'],
                "avg_mape": avg_mape
            })

    if not results:
        return None

    return pd.DataFrame(results)


# --- EXAMPLE USAGE ---
# Requires the `videos` DataFrame with its 'subcluster' column to be in memory.
# Collects, for every subcluster, the parameter row with the lowest average MAPE.
results_list = []

print("Starting rolling CV with ANOMALY HANDLING for all subclusters (6-month horizon)...")
for i in videos["subcluster"].dropna().unique():
    print(f"\n--- Processing Subcluster: {i} ---")
    # anomaly_sigma can be lowered (e.g. 2.5) to flag more points as anomalies
    df_results = holt_winters_rollingcv(
        videos,
        i,
        seasonal_range=(2, 31),
        forecast_horizon=6,
        anomaly_sigma=3.0,
    )

    # Guard clause: nothing to record when the search produced no result.
    if df_results is None:
        print(f"❌ Skipped {i} (not enough data or model failure)")
        continue

    best_row = df_results.sort_values("avg_mape").iloc[0]
    results_list.append(best_row)
    print(f"✅ Best 6M MAPE {best_row['avg_mape']:.2f}% with params "
          f"trend={best_row['trend']}, seasonal={best_row['seasonal']}, "
          f"period={best_row['seasonal_periods']}")

# One row per subcluster that could be modelled.
forecasting_summary_df = pd.DataFrame(results_list).reset_index(drop=True)
print("\n\n--- Forecasting Summary with Anomaly Handling (Best per Subcluster, 6-month horizon) ---")
if forecasting_summary_df.empty:
    print("No models could be successfully trained.")
else:
    display(forecasting_summary_df.sort_values(by="avg_mape"))