In [6]:
# 1) Imports & load your artifacts
import pytz
from datetime import datetime
import pandas as pd
import numpy as np
import os
import logging
import joblib
import re
import plotly.express as px
import plotly.graph_objects as go
import shap
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator
from sklearn.preprocessing import OneHotEncoder



from predictor import (
    get_predictor_artifacts,
    _infer_grid_for_game,
    _get_last_row_for_stream,
)
_ARTIFACT_PATH = os.path.join(os.getcwd(), "predictor_artifacts.joblib")
DEFAULT_START_TIMES = list(range(24))  # candidate start hours 0..23
# Fallback used when the artifact bundle carries no "stream_duration_opts".
# This cell used to build range(2, 8), scale it by 60 and then overwrite both
# results with [4]; only the last assignment ever took effect, so it is the one kept.
# NOTE(review): the name says hours while the removed scaling step produced
# minutes -- confirm which unit the models expect.
DEFAULT_DURATIONS_HRS = [4]

# Pre-declare everything the rest of the notebook reads, so a missing or
# unreadable artifact file ends in one clear error below instead of a NameError
# on whichever later line first touches an undefined name.
data = {}
df_inf = None
df = None
pipes = []
features = None
cat_opts = None
start_opts = DEFAULT_START_TIMES
dur_opts = DEFAULT_DURATIONS_HRS
metrics_list = []

if os.path.exists(_ARTIFACT_PATH):
    try:
        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # artifact files that you produced yourself.
        data = joblib.load(_ARTIFACT_PATH)
        df_inf = data.get("df_for_inf")
        if isinstance(df_inf, pd.DataFrame):
            # Normalise in place. `df` below is the very same object, so both
            # names see lower-cased categories and string column labels.
            df_inf['game_category'] = df_inf['game_category'].str.lower()
            df_inf.columns = df_inf.columns.map(str)
        pipes = data.get("pipelines") or []
        df = df_inf
        features = data.get("features")
        cat_opts = data.get("stream_category_options_inf")
        start_opts = data.get("optional_start_times", DEFAULT_START_TIMES)
        dur_opts = data.get("stream_duration_opts", DEFAULT_DURATIONS_HRS)
        metrics_list = data.get("metrics_list") or []
        logging.info("Loaded predictor artifacts from %s", _ARTIFACT_PATH)
    except Exception as e:
        logging.exception("Failed to load artifacts; will train on‐dyno when invoked: %s", e)
else:
    logging.info("No predictor_artifacts.joblib found; on‐dyno training available when called.")

# The bundle is usable only when there is at least one pipeline, none of them is
# None, and both the inference frame and the feature list are present.
# (`all()` alone is True for an empty list, hence the explicit `bool(pipes)`.)
ready = (
    bool(pipes)
    and all(p is not None for p in pipes)
    and df is not None
    and features is not None
)
if not ready:
    raise RuntimeError(
        f"Predictor artifacts are missing or incomplete (looked for {_ARTIFACT_PATH}); "
        "the rest of this notebook needs fitted pipelines, the inference frame and the feature list."
    )

print(features)
for f in features:
    print(f)

# extract the full tag vocabulary from the first pipeline
# (assumes a "pre" ColumnTransformer with a "tags" pipeline containing a "vectorize" step)
pre = pipes[0].named_steps["pre"]
vectorizer = pre.named_transformers_["tags"].named_steps["vectorize"]
all_tags = vectorizer.get_feature_names_out().tolist()


# 2) User‐adjustable parameters
stream_name         = "thelegendyagami"
# stream_name         = "barbarousking"
# NOTE(review): game categories in the inference frame are lower-cased above,
# while this value is upper-case -- lower-case it before comparing against them.
selected_game       = "ELDEN RING"  # e.g. "Fortnite"
selected_start_time = 19                  # hour in 0–23
selected_tags       = ["Veteran", "AIArt", "English", "HardestDifficulty"]  # list of tags you want to test


# 3) Helper: build a feature‐row for a given stream/game/start/tags
def make_feature_row(baseline, game, hour, tags, features, now=None, tag_vocab=None):
    """
    Build a single-row DataFrame of model inputs for one what-if scenario.

    Parameters
    ----------
    baseline : mapping-like row (e.g. a pandas Series) providing every feature
        that is not overridden here. It is copied, never mutated.
    game : value written to ``game_category``.
    hour : start hour (0-23); also encoded as sine/cosine of the 24h cycle.
    tags : iterable of tag strings, a single tag string, or None.
    features : ordered list of columns to return.
    now : optional datetime used to derive ``day_of_week`` / ``is_weekend``.
        Defaults to the current time in US/Eastern. Pass a fixed value to make
        results reproducible; it is used as given (no timezone conversion).
    tag_vocab : optional iterable of known tags; defaults to the notebook-level
        ``all_tags``.

    Returns
    -------
    pandas.DataFrame with exactly one row and the columns listed in ``features``.

    Notes
    -----
    Tags are matched case-insensitively: the vocabulary comes from a
    CountVectorizer, which lower-cases tokens by default, whereas user-supplied
    tags are usually mixed case.
    NOTE(review): the ``tag_<token>`` columns only influence predictions if they
    appear in ``features``; if the models consume a raw tag column instead, that
    column is taken from ``baseline`` unchanged -- confirm against the pipeline.
    """
    # A bare string would otherwise turn `t in tags` into a substring test.
    if tags is None:
        tags = []
    elif isinstance(tags, str):
        tags = [tags]
    wanted = {str(t).lower() for t in tags}

    vocab = all_tags if tag_vocab is None else tag_vocab

    if now is None:
        now = datetime.now(pytz.timezone("US/Eastern"))

    r = baseline.copy()
    # 1) set categorical & time features
    r["game_category"]   = game
    r["start_time_hour"] = hour
    r["day_of_week"]     = now.strftime("%A")
    r["start_hour_sin"]  = np.sin(2 * np.pi * hour / 24)
    r["start_hour_cos"]  = np.cos(2 * np.pi * hour / 24)
    # weekday(): Monday == 0 ... Sunday == 6; independent of the process locale
    r["is_weekend"]      = now.weekday() >= 5

    # 2) set tags one-hot
    for t in vocab:
        r[f"tag_{t}"] = int(str(t).lower() in wanted)

    # 3) build a single-row DataFrame and select only the model's features
    return pd.DataFrame([r])[features]


# grab the “last row” for your stream as baseline
baseline = _get_last_row_for_stream(df, stream_name)

# Candidate start hours: every hour of the day, 0 through 23.
times = list(range(24))

# Candidate games: every distinct, non-null category in the whole inference
# frame. To restrict the scan to games this streamer has actually played,
# filter `df` on `stream_name` before taking the unique values.
_all_categories = df["game_category"].dropna()
game_cats = _all_categories.unique().tolist()




['day_of_week', 'start_time_hour', 'is_weekend', 'days_since_previous_stream', 'game_category', 'stream_duration', 'avg_total_subscriptions_last_1', 'avg_total_subscriptions_last_3', 'avg_total_subscriptions_last_7', 'avg_total_subscriptions_last_14', 'avg_net_follower_change_last_1', 'avg_net_follower_change_last_3', 'avg_net_follower_change_last_7', 'avg_net_follower_change_last_14', 'avg_unique_viewers_last_1', 'avg_unique_viewers_last_3', 'avg_unique_viewers_last_7', 'avg_unique_viewers_last_14', 'avg_peak_concurrent_viewers_last_1', 'avg_peak_concurrent_viewers_last_3', 'avg_peak_concurrent_viewers_last_7', 'avg_peak_concurrent_viewers_last_14', 'avg_stream_duration_last_1', 'avg_stream_duration_last_3', 'avg_stream_duration_last_7', 'avg_stream_duration_last_14', 'avg_total_num_chats_last_1', 'avg_total_num_chats_last_3', 'avg_total_num_chats_last_7', 'avg_total_num_chats_last_14', 'avg_total_emotes_used_last_1', 'avg_total_emotes_used_last_3', 'avg_total_emotes_used_last_7', 'av

In [2]:
# 4) Make predictions for each model in `pipes`
p = 0  # position in `pipes` of the model whose predictions drive the max/min search


def _scan_candidates(candidates, build_row, pipelines, metrics, tracked):
    """
    Predict every candidate with every pipeline and track one model's extremes.

    Parameters
    ----------
    candidates : iterable of values to try (game categories, start hours, ...).
    build_row : callable mapping a candidate to the single-row feature frame.
    pipelines : sequence of fitted estimators exposing ``predict``.
    metrics : sequence of per-pipeline metric info copied into each record;
        positions beyond its length are recorded as None.
    tracked : index into ``pipelines`` whose predictions decide max/min.

    Returns
    -------
    (max_value, max_candidate, min_value, min_candidate, records)
        The values start at -inf / +inf, so predictions outside any fixed range
        are still picked up; the candidates stay None when nothing was scanned.
    """
    best_val, best_cand = -np.inf, None
    worst_val, worst_cand = np.inf, None
    records = []
    for cand in candidates:
        # The feature row does not depend on the pipeline: build it once per candidate.
        row = build_row(cand)
        for pos, model in enumerate(pipelines):
            pred = model.predict(row)[0]
            if pos == tracked:
                if pred > best_val:
                    best_val, best_cand = pred, cand
                if pred < worst_val:
                    worst_val, worst_cand = pred, cand
            records.append({
                "model":   f"pipe{pos + 1}",
                "y_pred":  round(pred, 2),
                "metrics": metrics[pos] if pos < len(metrics) else None,
            })
    return best_val, best_cand, worst_val, worst_cand, records


# --- 4a) Which game category gives the best / worst prediction at the selected hour?
max_pred, max_game_cat, min_pred, min_game_cat, total_results = _scan_candidates(
    game_cats,
    lambda g: make_feature_row(baseline, g, selected_start_time, selected_tags, features),
    pipes,
    metrics_list,
    p,
)

print('Max Prediction:', max_pred)
print("Game Category:", max_game_cat)
print('Min Prediction:', min_pred)
print("Game Category:", min_game_cat)

# --- 4b) For the best game, which start hour gives the best / worst prediction?
game = max_game_cat
max_pred, max_time, min_pred, min_time, time_results = _scan_candidates(
    times,
    lambda hr: make_feature_row(baseline, game, hr, selected_tags, features),
    pipes,
    metrics_list,
    p,
)

print('Max Prediction:', max_pred)
print("Time for Max Pred:", max_time)
print('Min Prediction:', min_pred)
print("Time for Min Pred:", min_time)

# A per-tag scan follows the same pattern: scan `selected_tags` with
#   lambda tag: make_feature_row(baseline, game, max_time, [tag], features)
# Wrap the tag in a list so it is treated as one tag rather than a character sequence.

Max Prediction: 1.0195401797757997
Game Category: streamer games
Min Prediction: 0.5242553426377153
Game Category: super mario world
Max Prediction: 1.1375001118377308
Time for Max Pred: 1
Min Prediction: 0.8555774988377397
Time for Min Pred: 23


In [7]:
def _robust_feature_names(pre: ColumnTransformer, X_in: pd.DataFrame) -> np.ndarray:
    """
    Build readable output-column names for a fitted ColumnTransformer.

    Names are produced block by block, in the order of ``pre.transformers_``:

    * the ``"tags"`` block is labelled ``tag::<token>`` from the vocabulary of
      its ``"vectorize"`` step, provided the block really emits one column per
      token (checked with a one-row transform);
    * other blocks use the last step's ``get_feature_names_out`` when available;
    * otherwise names fall back to ``<block>__<input column>`` when the block
      keeps its width, or ``<block>__<i>`` when it does not;
    * passthrough remainder columns are appended as ``remainder__<column>``.

    Parameters
    ----------
    pre : fitted ColumnTransformer, or None.
    X_in : the frame that is fed to ``pre`` (used to resolve column specs and
        to probe block widths).

    Returns
    -------
    numpy array of str. With ``pre`` None this is simply ``X_in``'s column names.
    """
    if pre is None:
        return np.asarray(list(map(str, X_in.columns)))

    names = []
    used_input = set()

    def _cols_to_list(cols):
        """Resolve a ColumnTransformer column spec to a list of column names."""
        if isinstance(cols, slice):
            return list(X_in.columns[cols])
        # Scalar positional index (bool is excluded: it subclasses int).
        if isinstance(cols, (int, np.integer)) and not isinstance(cols, (bool, np.bool_)):
            return [X_in.columns[cols]]
        if isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
            # Test for a boolean mask first: Python's True/False are ints too
            # and would otherwise be misread as positions 1/0.
            if len(cols) and isinstance(cols[0], (bool, np.bool_)):
                return list(X_in.columns[np.array(cols)])
            if len(cols) and isinstance(cols[0], (int, np.integer)):
                return [X_in.columns[i] for i in cols]
            return list(map(str, cols))
        return [str(cols)]

    def _probe_width(trans, in_cols):
        """Number of columns ``trans`` emits for one input row, or None if probing fails."""
        try:
            return trans.transform(X_in[in_cols].iloc[[0]]).shape[1]
        except Exception:
            return None

    for block_name, trans, cols in pre.transformers_:
        if block_name == "remainder":
            continue
        in_cols = _cols_to_list(cols)
        # Dropped columns count as "used" too, so they never show up as remainder.
        used_input.update(in_cols)

        if trans == "drop":
            continue

        # Unwrap pipeline if present
        last = trans.steps[-1][1] if isinstance(trans, Pipeline) else trans

        # --- Special case: tags pipeline -> use CountVectorizer tokens ---
        if block_name == "tags" and isinstance(trans, Pipeline):
            vec = trans.named_steps.get("vectorize")
            toks = None
            if vec is not None and hasattr(vec, "get_feature_names_out"):
                try:
                    toks = vec.get_feature_names_out()
                except Exception:
                    toks = None  # fall through to generic handling
            if toks is not None:
                # A later step (e.g. a dimensionality reduction) may change the
                # width; token names are only valid with one column per token.
                # If the probe itself fails, trust the vocabulary.
                width = _probe_width(trans, in_cols)
                if width is None or width == len(toks):
                    names.extend(f"tag::{t}" for t in toks)  # nice readable prefix
                    continue

        # Prefer transformer-provided names if available
        out = None
        if hasattr(last, "get_feature_names_out"):
            try:
                out = last.get_feature_names_out(in_cols)
            except Exception:
                try:
                    out = last.get_feature_names_out()
                except Exception:
                    out = None

        if out is None:
            # Fallback: infer width on a single-row transform; assume the block
            # keeps its width when even that is not possible (e.g. "passthrough").
            width = _probe_width(trans, in_cols)
            if width is None:
                width = len(in_cols)
            if width == len(in_cols):
                out = [f"{block_name}__{c}" for c in in_cols]
            else:
                out = [f"{block_name}__{i}" for i in range(width)]

        names.extend(map(str, out))

    # Remainder passthrough
    if getattr(pre, "remainder", "drop") == "passthrough":
        rem = [c for c in X_in.columns if c not in used_input]
        names.extend([f"remainder__{c}" for c in rem])

    return np.asarray(names)


In [8]:
def _fix_tag_names(pre: ColumnTransformer, X_in: pd.DataFrame, feat_names: np.ndarray) -> np.ndarray:
    """
    Overwrite the names of the 'tags' block inside an existing name array.

    The transformer blocks are walked in order; each block's output width is
    measured by transforming the first row of ``X_in`` and a running offset
    locates the slice of ``feat_names`` that belongs to it. Only the slice of
    the block called 'tags' is rewritten:

    * ``tag_svd_<i>`` when the tag pipeline contains a step named "svd";
    * ``tag::<token>`` when the "vectorize" vocabulary has exactly one token
      per output column;
    * ``tag::<i>`` in every other case.

    A block whose probe transform fails is counted as zero columns wide.
    With ``pre`` None the input array is returned unchanged.
    """
    if pre is None:
        return feat_names

    # Single-row frame used for measuring each block's output width
    probe_row = X_in.iloc[[0]]

    def _select(cols):
        # Turn a column spec (slice / positions / mask / labels / scalar) into a sub-frame
        if isinstance(cols, slice):
            return probe_row.iloc[:, cols]
        if not isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
            return probe_row[[str(cols)]]
        if len(cols) and isinstance(cols[0], (int, np.integer)):
            return probe_row.iloc[:, cols]
        if len(cols) and isinstance(cols[0], (bool, np.bool_)):
            return probe_row.loc[:, np.array(cols)]
        return probe_row.loc[:, list(cols)]

    def _tag_labels(trans, width):
        # Labels for a tags block that is `width` columns wide
        numbered = [f"tag::{i}" for i in range(width)]
        try:
            pipe = trans if isinstance(trans, Pipeline) else None
            vec = pipe.named_steps.get("vectorize") if pipe else None
            if vec is None or not hasattr(vec, "get_feature_names_out"):
                return numbered
            toks = vec.get_feature_names_out()
            if "svd" in pipe.named_steps:
                # After a reduction step there are no per-token columns any more
                return [f"tag_svd_{i}" for i in range(width)]
            if len(toks) == width:
                return [f"tag::{t}" for t in toks]
            return numbered
        except Exception:
            return numbered

    renamed = list(feat_names)
    offset = 0
    for block_name, trans, cols in pre.transformers_:
        if block_name == "remainder" or trans == "drop":
            continue

        sub = _select(cols)
        try:
            width = trans.transform(sub).shape[1]
        except Exception:
            # probing failed on the single row: treat the block as empty
            width = 0

        stop = offset + width
        if block_name == "tags":
            renamed[offset:stop] = _tag_labels(trans, width)
        offset = stop

    return np.asarray(renamed)


In [9]:


def shap_effect_plots(
    pipe: Pipeline,
    X_df: pd.DataFrame,
    nsample_background: int = 512,
    nsample_explain: int = 2000,
    random_state: int = 42,
    model_name: str = None,
    top_categories: int = 25,
):
    """
    Dependence-style SHAP charts focused on:
      1) start_time_hour   (scatter: value vs SHAP, plus mean per hour)
      2) stream_duration   (scatter: value vs SHAP, plus mean per duration bin)
      3) game_category     (bar: mean SHAP per category)

    Parameters
    ----------
    pipe : fitted pipeline; its regressor and optional "pre" step are located
        with ``_get_ttr_and_tree_estimator`` / ``_get_preprocessor``.
    X_df : raw feature frame (same columns the pipeline was trained on).
    nsample_background : max rows used as the SHAP background (masker).
    nsample_explain : max rows that are explained and plotted.
    random_state : seed for both subsamples.
    model_name : optional prefix for the figure titles.
    top_categories : number of game categories shown in the bar chart.

    Returns
    -------
    dict mapping "start_time_hour" / "stream_duration" / "game_category" to the
    Plotly figure, for each chart that could be built. Every figure is also
    shown immediately.

    Notes
    -----
    SHAP values are computed for the regressor wrapped by the
    TransformedTargetRegressor, so they live on that regressor's output scale,
    which is the transformed target if the TTR applies a target transform.
    """
    rng = np.random.default_rng(random_state)
    _, tree_est = _get_ttr_and_tree_estimator(pipe)
    pre = _get_preprocessor(pipe)

    # Make sure columns align with training
    X_in = X_df.copy()
    if hasattr(pipe, "feature_names_in_"):
        X_in = X_in[pipe.feature_names_in_]

    # Transform like training
    if pre is not None:
        X_trans = pre.transform(X_in)
        feat_names = _robust_feature_names(pre, X_in)
    else:
        X_trans = X_in.values
        feat_names = np.array(list(map(str, X_in.columns)))

    # Dense float for SHAP
    if hasattr(X_trans, "toarray"):
        X_trans = X_trans.toarray()
    X_trans = np.asarray(X_trans, dtype=np.float32)

    # Subsample to keep plots snappy
    n = X_trans.shape[0]
    bg_idx = rng.choice(n, size=min(nsample_background, n), replace=False)
    ex_idx = rng.choice(n, size=min(nsample_explain, n), replace=False)
    X_bg, X_ex = X_trans[bg_idx], X_trans[ex_idx]
    X_ex_df = X_in.iloc[ex_idx].copy()  # original feature values for x-axes

    # SHAP explain (disable additivity check to avoid HGB quirks)
    masker = shap.maskers.Independent(X_bg)
    explainer = shap.Explainer(tree_est, masker, feature_names=feat_names, algorithm="tree")
    explanation = explainer(X_ex, check_additivity=False)
    vals = np.asarray(explanation.values)  # [n_samples, n_features]

    name_strs = [str(fn) for fn in feat_names]
    title_prefix = f"{model_name} — " if model_name else ""

    # --- helper to find the transformed-column index for a raw feature name
    def find_idx(name=None):
        # Try the most specific rule first, so that a loose "_<name>" suffix on
        # some other column can never shadow the column that matches exactly.
        rules = (
            lambda s: s == name,
            lambda s: s.endswith(f"__{name}"),
            lambda s: s.endswith(f"_{name}"),
        )
        for rule in rules:
            for i, s in enumerate(name_strs):
                if rule(s):
                    return i
        return None

    figs = {}

    # 1) start_time_hour dependence (scatter + per-hour mean line)
    start_idx = find_idx("start_time_hour")
    if start_idx is not None and "start_time_hour" in X_ex_df:
        x = X_ex_df["start_time_hour"].astype(float)
        y = vals[:, start_idx]

        dep_df = pd.DataFrame({"start_time_hour": x, "shap": y})
        # per-hour mean (0..23) line
        per_hour = dep_df.groupby(dep_df["start_time_hour"].round().clip(0, 23))["shap"].mean().reset_index()
        per_hour = per_hour.rename(columns={"shap": "shap_mean", "start_time_hour": "hour"})

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=dep_df["start_time_hour"], y=dep_df["shap"], mode="markers", opacity=0.35,
            name="samples"
        ))
        fig.add_trace(go.Scatter(
            x=per_hour["hour"], y=per_hour["shap_mean"], mode="lines+markers",
            name="mean by hour"
        ))
        fig.update_layout(
            title=title_prefix + "SHAP Dependence: start_time_hour",
            xaxis_title="Start hour (0–23)",
            yaxis_title="SHAP value (impact on prediction)",
            template="simple_white"
        )
        fig.show()
        figs["start_time_hour"] = fig

    # 2) stream_duration dependence (minutes → hours for readability)
    dur_idx = find_idx("stream_duration")
    if dur_idx is not None and "stream_duration" in X_ex_df:
        x_min = X_ex_df["stream_duration"].astype(float)
        x_hr = x_min / 60.0
        y = vals[:, dur_idx]

        dep_df = pd.DataFrame({"duration_hr": x_hr, "shap": y})
        # bin means: 16 equally spaced edges between the observed min and max
        bins = np.linspace(dep_df["duration_hr"].min(), dep_df["duration_hr"].max(), 16)
        dep_df["bin"] = np.digitize(dep_df["duration_hr"], bins)
        per_bin = dep_df.groupby("bin").agg(x=("duration_hr", "mean"), shap_mean=("shap", "mean")).reset_index(drop=True)

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=dep_df["duration_hr"], y=dep_df["shap"], mode="markers", opacity=0.35,
            name="samples"
        ))
        fig.add_trace(go.Scatter(
            x=per_bin["x"], y=per_bin["shap_mean"], mode="lines+markers",
            name="mean by duration bin"
        ))
        fig.update_layout(
            title=title_prefix + "SHAP Dependence: stream_duration",
            xaxis_title="Duration (hours)",
            yaxis_title="SHAP value (impact on prediction)",
            template="simple_white"
        )
        fig.show()
        figs["stream_duration"] = fig

    # 3) game_category effect bar (handles OHE and OrdinalEncoder)
    name = "game_category"

    # One-hot columns look like 'game_category_DOTA 2' or 'cat__game_category_Apex'
    ohe_pattern = re.compile(rf"(?:^|__){re.escape(name)}(?:[_=:.])")
    game_ohe_cols = [i for i, s in enumerate(name_strs) if ohe_pattern.search(s)]

    if game_ohe_cols:
        # --- OHE path: take SHAP of the ACTIVE column per row, then average by category ---
        X_game = X_ex[:, game_ohe_cols]
        # Rows whose one-hot block is all zeros (unknown/dropped category) are skipped
        rows = np.flatnonzero(X_game.max(axis=1) != 0)

        if rows.size:
            active_cols = np.asarray(game_ohe_cols)[X_game[rows].argmax(axis=1)]
            cat_df = pd.DataFrame({
                "feature": [name_strs[c] for c in active_cols],
                "shap": vals[rows, active_cols],
            })
            cat_df["category"] = (
                cat_df["feature"]
                .str.replace(r".*game_category[_=:]*", "", regex=True)
                .replace("", "game_category")
            )
            agg = cat_df.groupby("category")["shap"].mean().sort_values(ascending=False)
            top = agg.head(top_categories).sort_values(ascending=True)

            fig = px.bar(
                top.reset_index(),
                x="shap", y="category", orientation="h",
                title=title_prefix + "Game Category Effect (mean SHAP of active OHE category)"
            )
            # A category y-axis is laid out bottom-to-top, so ascending order
            # is what puts the largest bar at the top.
            fig.update_yaxes(categoryorder="total ascending")
            fig.update_layout(xaxis_title="Mean SHAP (category effect on prediction)", template="simple_white")
            fig.show()
            figs["game_category"] = fig

    else:
        # --- Ordinal path: single encoded column; aggregate its SHAP by original category label ---
        game_idx = next(
            (i for i, s in enumerate(name_strs) if s == name or s.endswith(f"__{name}")),
            None
        )
        if game_idx is not None and name in X_ex_df.columns:
            labels = X_ex_df[name].astype(str).replace({"-1": "unknown", "-1.0": "unknown"})  # handle unknown_value=-1
            shap_vals = vals[:, game_idx]
            agg = (
                pd.DataFrame({"category": labels, "shap": shap_vals})
                .groupby("category")["shap"].mean()
                .sort_values(ascending=True)
            )
            top = agg.tail(top_categories)  # the K categories with the highest mean SHAP
            fig = px.bar(
                top.reset_index(),
                x="shap", y="category", orientation="h",
                title=title_prefix + "Game Category Effect (mean SHAP by category)"
            )
            # Bottom-to-top axis: ascending order shows the largest bar on top.
            fig.update_yaxes(categoryorder="total ascending")
            fig.update_layout(xaxis_title="Mean SHAP (category effect on prediction)", template="simple_white")
            fig.show()
            figs["game_category"] = fig
        else:
            print("No game_category feature found (neither OHE nor ordinal).")

    return figs


In [10]:



def _get_ttr_and_tree_estimator(pipe: Pipeline):
    """
    Locate the TransformedTargetRegressor and the regressor it wraps.

    For a Pipeline, the first step that is a TransformedTargetRegressor is
    used, whatever its step name. Otherwise ``pipe`` itself is accepted when it
    carries a ``regressor_`` attribute (i.e. it already is a fitted TTR).

    Returns
    -------
    (ttr, tree_est) where ``tree_est`` is ``ttr.regressor_`` when present and
    ``ttr.regressor`` otherwise.
    NOTE(review): callers treat ``tree_est`` as a fitted tree model (e.g.
    HistGradientBoostingRegressor); this function does not check that.

    Raises
    ------
    RuntimeError if no TTR can be found or it exposes no regressor.
    """
    from sklearn.compose import TransformedTargetRegressor

    ttr = None
    if isinstance(pipe, Pipeline):
        ttr = next(
            (step for step in pipe.named_steps.values()
             if isinstance(step, TransformedTargetRegressor)),
            None,
        )

    # Not found among the steps: maybe the caller handed over the TTR directly
    if ttr is None and hasattr(pipe, "regressor_"):
        ttr = pipe

    if ttr is None:
        raise RuntimeError("Could not find TransformedTargetRegressor in pipeline.")

    # Prefer the fitted clone; fall back to the constructor argument
    unfitted = getattr(ttr, "regressor", None)
    tree_est = getattr(ttr, "regressor_", unfitted)
    if tree_est is None:
        raise RuntimeError("Could not access underlying regressor from TTR.")

    return ttr, tree_est

def _get_preprocessor(pipe: Pipeline):
    """Look up the step named "pre" on a Pipeline; yields None if it is absent or ``pipe`` is not a Pipeline."""
    if not isinstance(pipe, Pipeline):
        return None
    return pipe.named_steps.get("pre", None)

def _safe_feature_names(pre: ColumnTransformer, X_cols):
    """
    Best-effort output feature names for a preprocessor.

    Order of attempts:
      1. ``pre.get_feature_names_out(X_cols)``, then the same call without arguments;
      2. otherwise measure the output width by transforming a one-row frame of
         zeros built from ``X_cols`` (with ``pre`` None the width is ``len(X_cols)``);
         a width different from ``len(X_cols)`` yields generic ``f0..fN`` names;
      3. in every remaining case the input column names, as strings.

    Always returns a numpy array.
    """
    resolved = None
    if pre is not None and hasattr(pre, "get_feature_names_out"):
        # Some sklearn versions accept the input names, others take no argument
        for call_args in ((X_cols,), ()):
            try:
                resolved = pre.get_feature_names_out(*call_args)
                break
            except Exception:
                resolved = None
    if resolved is not None:
        return np.array(resolved)

    # Could not ask the transformer: probe how many columns it emits
    try:
        if pre is None:
            n_out = len(X_cols)
        else:
            zero_row = pd.DataFrame({c: [0] for c in X_cols})
            n_out = pre.transform(zero_row).shape[1]
    except Exception:
        n_out = None

    if n_out is not None and len(X_cols) != n_out:
        # Width changed, so input names no longer line up with the outputs
        return np.array(np.array([f"f{i}" for i in range(n_out)]))
    return np.array(np.array(list(map(str, X_cols))))

def shap_global_importance_for_pipeline(
    pipe: Pipeline,
    X_df: pd.DataFrame,
    top_n: int = 30,
    nsample_background: int = 512,
    nsample_explain: int = 2000,
    random_state: int = 42,
    title: str = None,
    save_path: str = None,
):
    """
    Compute & plot global SHAP feature importances for a *fitted* pipeline:
      [preprocessor 'pre'] -> [TransformedTargetRegressor(tree regressor)]

    Parameters
    ----------
    pipe : fitted pipeline (see ``_get_ttr_and_tree_estimator``).
    X_df : raw feature frame with the columns the pipeline was trained on.
    top_n : number of features shown in the bar chart.
    nsample_background : max rows used as the SHAP background (masker).
    nsample_explain : max rows that are explained.
    random_state : seed for both subsamples.
    title : chart title; defaults to "Global Feature Importance".
    save_path : accepted for interface compatibility; this function does not
        write the figure anywhere.

    Returns
    -------
    DataFrame with columns ``feature`` and ``mean_abs_shap`` for *all* features,
    sorted by importance (descending). The bar chart is shown as a side effect.
    """
    rng = np.random.default_rng(random_state)

    # 1) Extract fitted pieces
    _, tree_est = _get_ttr_and_tree_estimator(pipe)
    pre = _get_preprocessor(pipe)

    # 2) Transform X through the preprocessor the same way as training
    X_in = X_df.copy()
    # make sure column order matches what pipeline expects
    if hasattr(pipe, "feature_names_in_"):
        X_in = X_in[pipe.feature_names_in_]

    if pre is not None:
        X_trans = pre.transform(X_in)
        feat_names = _robust_feature_names(pre, X_in)
    else:
        # No preprocessor; assume numeric array already
        X_trans = X_in.values
        feat_names = np.array(list(map(str, X_in.columns)))

    # SHAP prefers dense float; ensure type/shape
    if hasattr(X_trans, "toarray"):
        X_trans = X_trans.toarray()
    X_trans = np.asarray(X_trans, dtype=np.float32)

    # 3) Subsample for background (masker) and explanation for speed
    n = X_trans.shape[0]
    bg_idx = rng.choice(n, size=min(nsample_background, n), replace=False)
    ex_idx = rng.choice(n, size=min(nsample_explain, n), replace=False)
    X_bg = X_trans[bg_idx]
    X_ex = X_trans[ex_idx]

    # 4) Build SHAP explainer
    masker = shap.maskers.Independent(X_bg)
    explainer = shap.Explainer(tree_est, masker, feature_names=feat_names, algorithm="tree")
    explanation = explainer(X_ex, check_additivity=False)

    # 5) Compute mean(|SHAP|) per feature (global importance)
    vals = np.asarray(explanation.values)          # [n_samples, n_features]
    n_feats = vals.shape[1]

    # Force feature names to align with SHAP output width. Every branch leaves
    # exactly n_feats names, so no further length checks are needed afterwards.
    if len(feat_names) != n_feats:
        ef = getattr(explanation, "feature_names", None)
        if ef is not None and len(ef) == n_feats:
            feat_names = ef
        elif len(feat_names) > n_feats:
            feat_names = feat_names[:n_feats]
        else:
            feat_names = list(feat_names) + [f"f{i}" for i in range(len(feat_names), n_feats)]
    feat_names = np.asarray(feat_names)

    mean_abs = np.mean(np.abs(vals), axis=0)
    imp_df = (
        pd.DataFrame({"feature": feat_names, "mean_abs_shap": mean_abs})
        .sort_values("mean_abs_shap", ascending=False)
        .reset_index(drop=True)
    )

    # 6) Plot top-N as a horizontal bar chart
    max_display = int(min(top_n, n_feats))
    df_bar = imp_df.head(max_display).rename(columns={"mean_abs_shap": "mean_abs"})

    bar_fig = px.bar(
        df_bar.sort_values("mean_abs", ascending=True),
        x="mean_abs",
        y="feature",
        orientation="h",
        title=title or "Global Feature Importance"
    )
    # A category y-axis is laid out bottom-to-top, so ascending order is what
    # puts the largest bar at the top.
    bar_fig.update_yaxes(categoryorder="total ascending")
    bar_fig.show()  # or remove if you don't want it to auto-render

    return imp_df

# ----------------- usage with your loaded artifacts -----------------
# Assumes you already loaded:
# - pipes: List[Pipeline]  (fitted)
# - df_inf: DataFrame used for inference
# - features: List[str]    (feature columns expected by the pipelines)
# - metrics_list: e.g. ["subs", "viewers", "followers"] (optional)

# Run only when the artifacts are really there: the names must exist AND hold a value.
_ns = globals()
if all(_ns.get(_key) is not None for _key in ("pipes", "df_inf", "features")):
    # One label per pipeline. Missing or non-string entries in `metrics_list`
    # fall back to "model_<i>", so zip() below can never silently drop a
    # pipeline because the metric list is shorter than `pipes`.
    _declared = list(_ns.get("metrics_list") or [])
    metric_names = [
        _declared[i] if i < len(_declared) and isinstance(_declared[i], str) else f"model_{i}"
        for i in range(len(pipes))
    ]

    # Same feature frame for every model; both SHAP helpers copy it before use.
    X = df_inf[features].copy()

    all_importances = []
    for pipe, mname in zip(pipes, metric_names):
        imp_df = shap_global_importance_for_pipeline(
            pipe=pipe,
            X_df=X,
            top_n=30,                  # change to taste
            nsample_background=512,    # speed/accuracy tradeoff
            nsample_explain=2000,      # more -> smoother estimates
            random_state=42,
            title=f"Global SHAP Importance — {mname}",
            save_path=None
        )

        _ = shap_effect_plots(pipe, X, model_name=mname, top_categories=25)

        imp_df["model"] = mname
        all_importances.append(imp_df)

    # Combined table of importances across models
    if all_importances:
        comb = pd.concat(all_importances, ignore_index=True)
        # average rank across models to find features important everywhere
        comb["rank"] = comb.groupby("model")["mean_abs_shap"].rank(ascending=False, method="min")
        cross_model = (
            comb.groupby("feature")
                .agg(
                    mean_rank=("rank", "mean"),
                    mean_importance=("mean_abs_shap", "mean"),
                    models_seen=("model", "nunique"),
                )
                .sort_values(["models_seen", "mean_rank", "mean_importance"], ascending=[False, True, False])
        )
        # Show top 50 consensus features
        print("\nTop cross-model features (consensus):")
        print(cross_model.head(50))


Top cross-model features (consensus):
                                     mean_rank  mean_importance  models_seen
feature                                                                     
stream_duration                       4.000000         0.234647            3
avg_peak_concurrent_viewers_last_1   10.666667         0.165091            3
avg_peak_concurrent_viewers_last_7   12.000000         0.115428            3
avg_chats_per_hour_last_3            15.666667         0.040652            3
avg_unique_viewers_last_7            17.000000         0.051829            3
avg_subs_per_hour_last_14            17.333333         0.071575            3
avg_chats_per_viewer_last_3          18.000000         0.060203            3
avg_chats_per_viewer_last_7          18.000000         0.043903            3
avg_peak_concurrent_viewers_last_3   20.333333         0.286809            3
game_category                        21.000000         0.035958            3
avg_chats_per_viewer_last_14         

In [7]:
def plot_three_factor_overlap(
    pipe: Pipeline,
    X_df: pd.DataFrame,
    nsample_background: int = 512,
    nsample_explain: int = 2000,
    random_state: int = 42,
    top_categories: int = 6,
    model_name: str = None,
):
    """Scatter start hour against duration, colored by the summed SHAP of three factors.

    SHAP values are computed for the regressor inside ``pipe`` on the
    preprocessed feature matrix, then the contributions of
    ``start_time_hour``, ``stream_duration`` and ``game_category`` are added
    per sample ("combined") and drawn as one plotly scatter.

    Parameters
    ----------
    pipe : Pipeline
        Fitted pipeline. A step named ``"pre"`` is used as the preprocessor
        when present. The explained model is the inner regressor of a
        ``TransformedTargetRegressor`` step if there is one (so SHAP values
        refer to that inner regressor's output), otherwise the pipeline's
        final step.
    X_df : pd.DataFrame
        Raw feature rows; reordered to ``pipe.feature_names_in_`` when the
        pipeline exposes it. Must contain ``start_time_hour`` and
        ``stream_duration``.
    nsample_background : int
        Maximum number of rows sampled for the SHAP background masker.
    nsample_explain : int
        Maximum number of rows sampled for explanation / plotting.
    random_state : int
        Seed for the row sampling.
    top_categories : int
        Number of most frequent categories (within the explained sample) that
        keep their own marker symbol; the rest are shown as ``"other"``.
    model_name : str, optional
        Prefix for the figure title.

    Returns
    -------
    plotly.graph_objects.Figure or None
        The displayed figure, or ``None`` (after printing the reason) when the
        input is empty, no regressor can be located, or a required feature is
        missing.
    """
    from sklearn.compose import TransformedTargetRegressor

    rng = np.random.default_rng(random_state)

    # ---- pieces
    ttr = next(
        (step for step in pipe.named_steps.values()
         if isinstance(step, TransformedTargetRegressor)),
        None,
    )
    pre: ColumnTransformer = pipe.named_steps.get("pre")
    if ttr is not None:
        # Prefer the fitted inner regressor; fall back to the unfitted template
        reg = getattr(ttr, "regressor_", getattr(ttr, "regressor", None))
    else:
        # No target-transform wrapper: explain the pipeline's final step directly
        reg = pipe.steps[-1][1] if pipe.steps else None
    if reg is None or reg is pre:
        print("No regressor found in this pipeline to explain.")
        return None

    # ---- align inputs like training
    X_in = X_df.copy()
    if hasattr(pipe, "feature_names_in_"):
        X_in = X_in[pipe.feature_names_in_]
    if X_in.shape[0] == 0:
        print("No rows to explain: X_df is empty.")
        return None

    if pre is not None:
        X_trans = pre.transform(X_in)
        feat_names = _robust_feature_names(pre, X_in)
    else:
        X_trans = X_in.values
        feat_names = np.array(list(map(str, X_in.columns)))

    # Densify sparse preprocessor output before handing it to SHAP
    if hasattr(X_trans, "toarray"):
        X_trans = X_trans.toarray()
    X_trans = np.asarray(X_trans, dtype=np.float32)

    # subsample for speed
    n = X_trans.shape[0]
    bg_idx = rng.choice(n, size=min(nsample_background, n), replace=False)
    ex_idx = rng.choice(n, size=min(nsample_explain, n), replace=False)
    X_bg, X_ex = X_trans[bg_idx], X_trans[ex_idx]
    # Raw (untransformed) rows in the same order as X_ex, for axis values and labels
    X_ex_df = X_in.iloc[ex_idx].copy()

    # ---- SHAP
    masker = shap.maskers.Independent(X_bg)
    explainer = shap.Explainer(reg, masker, feature_names=feat_names, algorithm="tree")
    explanation = explainer(X_ex, check_additivity=False)
    vals = np.asarray(explanation.values)  # [n_samples, n_features]

    # ---- helpers
    def find_idx(name):
        """Index of the transformed column named `name` or ending in `__name`, else None."""
        for i, fn in enumerate(feat_names):
            s = str(fn)
            if s == name or s.endswith(f"__{name}"):
                return i
        return None

    # hour + duration SHAP
    start_idx = find_idx("start_time_hour")
    dur_idx = find_idx("stream_duration")

    if start_idx is None or dur_idx is None or "start_time_hour" not in X_ex_df or "stream_duration" not in X_ex_df:
        print("Missing start_time_hour or stream_duration in this model. Retrain with hour passthrough to enable this plot.")
        return None

    start_hour = X_ex_df["start_time_hour"].astype(float).to_numpy()
    # Divided by 60 for the "hours" axis (assumes stream_duration is in minutes — TODO confirm)
    duration_hr = X_ex_df["stream_duration"].astype(float).to_numpy() / 60.0
    shap_hour = vals[:, start_idx]
    shap_dur = vals[:, dur_idx]

    # game category SHAP (handles both OHE and OrdinalEncoder)
    name = "game_category"
    # OHE columns (e.g., 'cat__game_category_Apex Legends' or 'game_category_Apex Legends')
    ohe_pattern = re.compile(rf"(?:^|__){re.escape(name)}(?:[_=:.])")
    game_ohe_cols = [i for i, fn in enumerate(feat_names) if ohe_pattern.search(str(fn))]

    if game_ohe_cols:
        ohe_cols = np.asarray(game_ohe_cols)
        X_game = X_ex[:, ohe_cols]
        active_pos = X_game.argmax(axis=1)
        # A row whose one-hot block is all zeros has no active category
        has_active = X_game.max(axis=1) != 0

        # Clean each column name once: keep the suffix after 'game_category'
        col_labels = [
            re.sub(r".*game_category[_=:]*", "", str(feat_names[c])).strip() or "unknown"
            for c in game_ohe_cols
        ]
        cat_labels = [
            col_labels[pos] if ok else "unknown"
            for pos, ok in zip(active_pos, has_active)
        ]

        # SHAP of the active one-hot column per row; 0.0 where none is active
        shap_game = np.zeros(X_ex.shape[0], dtype=float)
        rows = np.flatnonzero(has_active)
        shap_game[rows] = vals[rows, ohe_cols[active_pos[rows]]]
    else:
        # Ordinal path: use original string labels + single feature SHAP
        game_idx = find_idx(name)
        if game_idx is None or name not in X_ex_df:
            print("No game_category feature found (neither OHE nor ordinal).")
            return None
        shap_game = vals[:, game_idx]
        cat_labels = X_ex_df[name].astype(str).replace({"-1": "unknown", "-1.0": "unknown"}).tolist()

    # ---- assemble plot frame
    dfp = pd.DataFrame({
        "start_time_hour": start_hour,
        "duration_hr": duration_hr,
        "category": cat_labels,
        "shap_hour": shap_hour,
        "shap_duration": shap_dur,
        "shap_game": shap_game,
    })
    dfp["combined"] = dfp["shap_hour"] + dfp["shap_duration"] + dfp["shap_game"]
    dfp["abs_combined"] = dfp["combined"].abs()

    # bucket categories: top by frequency in this sample
    top = dfp["category"].value_counts().head(top_categories).index.tolist()
    dfp["category_simple"] = np.where(dfp["category"].isin(top), dfp["category"], "other")

    # scale sizes (nice range): 90th percentile of |combined| maps to 20, clipped to [6, 26]
    s_ref = dfp["abs_combined"].quantile(0.9) if (dfp["abs_combined"] > 0).any() else 1.0
    dfp["size"] = (dfp["abs_combined"] / (s_ref + 1e-9) * 20).clip(6, 26)

    # ---- one final overlapping plot
    title = (f"{model_name} — " if model_name else "") + "Overlap of Effects (start hour × duration, colored by net SHAP)"
    fig = px.scatter(
        dfp,
        x="start_time_hour", y="duration_hr",
        color="combined", color_continuous_scale="RdBu", color_continuous_midpoint=0,
        size="size", size_max=16,
        symbol="category_simple",
        hover_data={
            "category": True,
            "shap_hour": ':.3f',
            "shap_duration": ':.3f',
            "shap_game": ':.3f',
            "combined": ':.3f',
            "start_time_hour": True,
            "duration_hr": ':.2f'
        },
        title=title,
        height=600,
    )
    fig.update_layout(
        xaxis_title="Start hour (0–23)",
        yaxis_title="Duration (hours)",
        template="simple_white",
        # Extra bottom margin leaves room for the horizontal legend under the plot
        margin=dict(l=80, r=30, t=60, b=180),
        legend=dict(
            orientation="h",
            yanchor="top", y=-0.20,
            xanchor="left", x=0,
            font=dict(size=11)
        )
    )
    fig.show()
    return fig

In [8]:
# The feature slice is the same for every model, so build it once instead of
# re-copying the frame on each iteration (plot_three_factor_overlap copies its
# input before touching it, so sharing one frame across calls is safe).
X = df_inf[features].copy()

# One overlap scatter per fitted pipeline.
# NOTE(review): `metric_names` is not defined in the cells visible here — it is
# expected to come from an earlier cell; confirm it lines up with `pipes`.
for pipe, mname in zip(pipes, metric_names):
    _ = plot_three_factor_overlap(pipe, X, model_name=mname, top_categories=6)

In [None]:
from itertools import chain

def best_tag_combinations(
    pipe,
    baseline,
    game,
    hour,
    features,
    candidate_tags,
    max_tags=None
):
    """
    Greedily build up a tag set one tag at a time, always adding the tag
    that yields the highest ``pipe.predict`` value.

    Parameters
    ----------
    pipe : object exposing ``predict(X)``; element ``[0]`` of its result is the score.
    baseline, game, hour, features : forwarded unchanged to ``make_feature_row``.
    candidate_tags : iterable of tags to choose from (``None`` means no candidates).
        Duplicates are ignored; the given order decides ties, so results are
        reproducible from run to run.
    max_tags : optional cap on the number of selected tags (``None`` = no cap).

    Returns
    -------
    list of ``(tag_tuple, score)``: the first entry is the empty tuple with the
    zero-tag prediction, followed by one entry per accepted tag. The search
    stops as soon as no remaining tag strictly improves the score.
    """
    selected = []

    # baseline prediction with zero tags
    best_score = pipe.predict(make_feature_row(baseline, game, hour, [], features))[0]
    history = [(tuple(selected), best_score)]

    # Ordered, de-duplicated candidates: a plain set would make tie-breaking
    # depend on hash order, which changes between interpreter sessions.
    remaining = list(dict.fromkeys(candidate_tags)) if candidate_tags is not None else []

    while remaining and (max_tags is None or len(selected) < max_tags):
        # score every remaining tag when added to the current selection
        scores = {
            tag: pipe.predict(
                make_feature_row(baseline, game, hour, selected + [tag], features)
            )[0]
            for tag in remaining
        }

        # highest score wins; on ties the earliest candidate is kept
        best_tag, score = max(scores.items(), key=lambda kv: kv[1])

        # stop unless there is a strict improvement (also stops on NaN scores)
        if not score > best_score:
            break

        selected.append(best_tag)
        remaining.remove(best_tag)
        best_score = score
        history.append((tuple(selected), best_score))

    return history




In [None]:
import itertools
import pandas as pd

stream_name = "thelegendyagami"

# Baseline row for this stream: whatever `_get_last_row_for_stream` returns
# (presumably the streamer's most recent row — confirm in predictor.py).
baseline = _get_last_row_for_stream(df, stream_name)

# All rows of the inference frame that belong to this streamer
legend_rows = df.loc[df["stream_name"] == stream_name]

# 1) Games this streamer has in their own history (no predictions needed)
legend_games = legend_rows["game_category"].unique().tolist()

# 2) Every distinct tag across the streamer's non-null `raw_tags` entries, sorted
legend_tags = sorted(
    set(itertools.chain.from_iterable(legend_rows["raw_tags"].dropna()))
)

# 3) Restricted combo grid: own games × candidate start hours × candidate durations
grid = pd.DataFrame(
    [combo for combo in itertools.product(legend_games, start_opts, dur_opts)],
    columns=["game_category", "start_time_hour", "stream_duration"],
)


# 3) Predict all three metrics for each combo
def predict_all_metrics(row):
    """Return the rounded predictions of the first three pipelines for one grid row.

    A feature row is built with ``make_feature_row`` from the notebook-level
    ``baseline``, ``selected_tags`` and ``features`` plus the row's
    ``game_category`` and ``start_time_hour``. ``pipes[0]``, ``pipes[1]`` and
    ``pipes[2]`` are reported as ``subs``, ``followers`` and ``viewers``
    respectively, each rounded to 2 decimals.

    NOTE(review): ``row.stream_duration`` is never read, so rows that differ
    only in duration get the same prediction — confirm whether
    ``make_feature_row`` should receive it.
    NOTE(review): ``selected_tags`` is not defined in this cell; it must exist
    in the kernel before this function is called.
    """
    feature_row = make_feature_row(
        baseline, row.game_category, row.start_time_hour, selected_tags, features
    )

    # Position in `pipes` -> output label
    metric_labels = ("subs", "followers", "viewers")
    predictions = {}
    for position, label in enumerate(metric_labels):
        predictions[label] = round(pipes[position].predict(feature_row)[0], 2)
    return pd.Series(predictions)

# Attach the three predicted metrics to every grid row (same row index on both sides)
df_metrics = pd.concat([grid, grid.apply(predict_all_metrics, axis=1)], axis=1)


# 4) Top-3 combos by each metric
top3_subs, top3_followers, top3_viewers = (
    df_metrics.nlargest(3, metric_col)
    for metric_col in ("subs", "followers", "viewers")
)

for banner, top3_table in (
    ("🔸 Top 3 game/time/duration for MAX subs 🔸", top3_subs),
    ("🔸 Top 3 game/time/duration for MAX follower growth 🔸", top3_followers),
    ("🔸 Top 3 game/time/duration for MAX viewers 🔸", top3_viewers),
):
    print(banner)
    display(top3_table)


# 5) Greedy tag combinations per model, restricted to legend_tags
# NOTE(review): `selected_game` and `selected_start_time` are not defined in
# this cell; they must already exist in the kernel.

print("legend_tags:", legend_tags)

combo_histories = []
for idx, pipe in enumerate(pipes, start=1):
    # max_tags=len(legend_tags) lets the greedy search use every tag if each keeps helping
    hist = best_tag_combinations(
        pipe, baseline, selected_game, selected_start_time,
        features, legend_tags, max_tags=len(legend_tags),
    )
    combo_histories.append(hist)

    # three highest-scoring steps of the greedy history
    top3 = sorted(hist, key=lambda entry: entry[1], reverse=True)[:3]
    print(f"\n🔸 Model {idx} top‐3 tag combos (score) 🔸")
    for combo, score in top3:
        print(f"  {combo} → {score:.2f}")



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# NOTE(review): `model`, `X_test` and `y_test` are not defined in any cell
# visible here — this cell only runs if an earlier cell left them in the kernel.
y_true = y_test
y_pred = model.best_estimator_.predict(X_test)

# Explicit figure/axes so the plot never lands on a leftover "current" figure
parity_fig, parity_ax = plt.subplots()
parity_ax.scatter(y_true, y_pred, alpha=0.3)

# Dashed y = x reference line spanning the observed range of the actual values
y_lo, y_hi = y_true.min(), y_true.max()
parity_ax.plot([y_lo, y_hi], [y_lo, y_hi], 'k--', lw=2)

parity_ax.set_xlabel("Actual subscriptions")
parity_ax.set_ylabel("Predicted subscriptions")
parity_ax.set_title(f"R² = {r2_score(y_true, y_pred):.2f}")
plt.show()