In [4]:
# NFL Game Outcome Interaction Model with Rolling Window Training and Voila Visualization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML, Javascript
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from ipywidgets import widgets, VBox, HBox, Dropdown, Label, Output, Layout
from IPython.display import display, clear_output
from sklearn.metrics.pairwise import cosine_similarity
import shap
import plotly.express as px

In [5]:
# ---------------------------
# Helper Functions
# ---------------------------
def get_previous_week(current_week, all_weeks_sorted, last_week_of_season=18):
    """
    Return the seasonweek that immediately precedes ``current_week``.

    Seasonweeks are integers encoded as ``season * 100 + week`` (e.g. 202409).
    Week 1 rolls back to ``last_week_of_season`` of the prior season.

    Parameters
    ----------
    current_week : int
        Seasonweek to step back from.
    all_weeks_sorted : collection of int
        Weeks that are considered available; only membership is tested.
    last_week_of_season : int, optional
        Week number used as the final week of the previous season when
        ``current_week`` is week 1. Defaults to 18.

    Returns
    -------
    int or None
        The previous seasonweek, or None when that week is not present in
        ``all_weeks_sorted``.
    """
    season, week = divmod(current_week, 100)
    if week == 1:
        prev_week = (season - 1) * 100 + last_week_of_season
    else:
        prev_week = season * 100 + (week - 1)
    return prev_week if prev_week in all_weeks_sorted else None

def build_interaction_model(input_dim):
    """
    Create and compile the feed-forward binary classifier.

    Architecture: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu)
    -> Dropout(0.2) -> Dense(1, sigmoid), compiled with Adam (lr=1e-4),
    binary cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    input_dim : int
        Width of the input feature vector.

    Returns
    -------
    The compiled Sequential model.
    """
    network = Sequential()
    network.add(Dense(64, activation='relu', input_shape=(input_dim,)))
    network.add(Dropout(0.2))
    network.add(Dense(32, activation='relu'))
    network.add(Dropout(0.2))
    # Single sigmoid unit: the output is one probability per input row.
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.0001),
        metrics=['accuracy'],
    )
    return network

In [6]:
# ---------------------------
# Load Schedule and Embeddings
# ---------------------------
gru_embedding_df = pd.read_csv('gru_team_embeddings_by_week.csv')
mlp_embedding_df = pd.read_csv('mlp_2023_2024_embeddings.csv')
# 'team' intentionally stays a regular column in both frames: every lookup below
# filters with df['team'] == ..., so it must not be moved into the index.

# Embedding column separation (columns are selected purely by name prefix)
gru_embedding_cols = [col for col in gru_embedding_df.columns if col.startswith('emb_')]
mlp_embedding_cols = [col for col in mlp_embedding_df.columns if col.startswith('mlp_emb_')]

# Week list: weeks that have GRU embeddings, plus a few upcoming weeks for the selector
all_weeks = sorted(gru_embedding_df['seasonweek'].unique())
upcoming_weeks = [202501, 202502, 202503]
full_week_list = sorted(set(all_weeks + upcoming_weeks))

# Only offer weeks from 202401 onward in the week selector
week_choices = [w for w in full_week_list if w >= 202401]
if not week_choices:
    week_choices = full_week_list[:]  # fallback to everything if filter empty

In [7]:
# =========================
# Load Trained Model
# =========================
# Load the saved Keras model from the working directory. The resulting object is
# what the button handler passes to predict_game() and plot_shap_waterfall().
interaction_model = load_model('interaction_model_rolling_window.keras')

In [8]:
# =========================
# Feature Vector Construction
# =========================

# Only the header row is needed, so read zero data rows instead of the whole file.
raw_cols = pd.read_csv("final_hist_data.csv", nrows=0).columns
# Human-readable labels used for the GRU part of the feature vector
# (identifier/target columns are excluded).
gru_feature_names = [c for c in raw_cols if c not in ("Seasonweek", "Result", "Team")]

def build_feature_vector(team_a, team_b, week,
                         mlp_df, gru_df, gru_cols, mlp_cols, all_weeks_sorted):
    """
    Build the single-row model input for one matchup.

    GRU embeddings are taken from the week *before* ``week`` (resolved with
    ``get_previous_week``); MLP embeddings are looked up by team only.

    Layout = [A_GRU..., A_MLP..., B_GRU..., B_MLP...]
    (team_a is labelled "Away", team_b is labelled "Home").

    Returns
    -------
    x : np.ndarray
        float32 array of shape (1, n_features).
    fnames : list[str]
        One label per column of ``x``.

    Raises
    ------
    ValueError
        If there is no previous week, an embedding is missing, or a lookup
        matches more than one row.
    """
    prev_week = get_previous_week(week, all_weeks_sorted)
    if prev_week is None:
        raise ValueError(f"No previous week exists for {week} in the available weeks list.")

    # GRU @ prev_week
    at_prev_week = gru_df['seasonweek'] == prev_week
    t1_gru = gru_df.loc[(gru_df['team'] == team_a) & at_prev_week, gru_cols].values.flatten()
    t2_gru = gru_df.loc[(gru_df['team'] == team_b) & at_prev_week, gru_cols].values.flatten()
    if t1_gru.size == 0 or t2_gru.size == 0:
        raise ValueError(f"Missing GRU embedding at prev_week={prev_week} for {team_a} or {team_b}")

    # MLP (static)
    t1_mlp = mlp_df.loc[mlp_df['team'] == team_a, mlp_cols].values.flatten()
    t2_mlp = mlp_df.loc[mlp_df['team'] == team_b, mlp_cols].values.flatten()
    if t1_mlp.size == 0 or t2_mlp.size == 0:
        raise ValueError(f"Missing MLP embedding for {team_a} or {team_b}")

    # A lookup that matches several rows flattens into an over-long vector, which
    # would otherwise surface later as an opaque shape error inside the model.
    lookups = (
        ("GRU", team_a, t1_gru, len(gru_cols)),
        ("GRU", team_b, t2_gru, len(gru_cols)),
        ("MLP", team_a, t1_mlp, len(mlp_cols)),
        ("MLP", team_b, t2_mlp, len(mlp_cols)),
    )
    for kind, team, vec, width in lookups:
        if vec.size != width:
            raise ValueError(
                f"Expected exactly one {kind} embedding row for {team}, "
                f"found {vec.size // width}."
            )

    x = np.concatenate([t1_gru, t1_mlp, t2_gru, t2_mlp]).astype(np.float32).reshape(1, -1)

    # Keep GRU feature names explicit; collapse MLPs into one generic label.
    # The readable names are only usable when there is exactly one per GRU
    # column; otherwise fall back to the embedding column names so that
    # len(fnames) always equals x.shape[1].
    if len(gru_feature_names) == len(gru_cols):
        gru_labels = list(gru_feature_names)
    else:
        gru_labels = list(gru_cols)

    a_gru_names = [f"Away_GRU_{c}" for c in gru_labels]
    a_mlp_names = ["Away_MLP_LatentOffseasonChange"] * len(mlp_cols)
    b_gru_names = [f"Home_GRU_{c}" for c in gru_labels]
    b_mlp_names = ["Home_MLP_LatentOffseasonChange"] * len(mlp_cols)

    fnames = a_gru_names + a_mlp_names + b_gru_names + b_mlp_names
    return x, fnames

In [9]:
# ---------------------------
# Prediction & Comparison
# ---------------------------

def predict_game(team_a, team_b, week, model, mlp_df, gru_df, gru_cols, mlp_cols,
                 all_weeks_sorted=None):
    """
    Predict the away team's win probability for one matchup.

    Nothing is printed here; the caller decides how to show the message.

    Parameters
    ----------
    team_a, team_b
        Away and home team identifiers, respectively.
    week : int
        Seasonweek (YYYYWW) of the game.
    model
        Object with a ``predict(x, verbose=0)`` method returning a 2-D array.
    mlp_df, gru_df, gru_cols, mlp_cols
        Forwarded unchanged to ``build_feature_vector``.
    all_weeks_sorted : collection of int, optional
        Weeks used to resolve the previous week. When omitted, the
        notebook-level ``all_weeks`` list is used.

    Returns
    -------
    (str, float or None)
        A display message and the away-team win probability; the probability
        is None when the feature vector or the prediction failed.
    """
    try:
        # Resolved inside the try so a missing global is reported like any other
        # input problem instead of escaping to the caller.
        weeks = all_weeks if all_weeks_sorted is None else all_weeks_sorted
        x, _ = build_feature_vector(team_a, team_b, week, mlp_df, gru_df, gru_cols, mlp_cols, weeks)
    except Exception as e:
        return f"❌ Embedding error: {e}", None

    try:
        prob = float(model.predict(x, verbose=0)[0][0])
        # The model output is the away team's win probability; 0.5 is the decision threshold.
        winner = team_a if prob >= 0.5 else team_b
        msg = f"✅ Away Team ({team_a}) win probability: {prob:.4f}\n🏆 Predicted winner: {winner}"
        return msg, prob
    except Exception as e:
        return f"❌ Prediction error: {e}", None

def compare_embeddings(team_a, team_b, mlp_df, gru_df):
    """
    Plot the cosine similarity between two teams' embeddings.

    Two bars are drawn: one for the static MLP embeddings and one for each
    team's most recent GRU embedding (highest ``seasonweek``). Column
    selection uses the notebook-level ``mlp_embedding_cols`` and
    ``gru_embedding_cols`` lists.

    Any failure is reported with a printed message instead of being raised.
    """
    try:
        # MLP similarity (static)
        mlp_a = mlp_df.loc[mlp_df['team'] == team_a, mlp_embedding_cols].values.flatten()
        mlp_b = mlp_df.loc[mlp_df['team'] == team_b, mlp_embedding_cols].values.flatten()

        def _latest_gru(team):
            # Sort by week so "latest" does not depend on the row order of the CSV;
            # the stable sort keeps file order among rows with equal seasonweek.
            rows = gru_df.loc[gru_df['team'] == team].sort_values('seasonweek', kind='stable')
            return rows[gru_embedding_cols].iloc[-1].values.flatten()

        # GRU similarity (latest available for each team)
        gru_a = _latest_gru(team_a)
        gru_b = _latest_gru(team_b)

        sim_mlp = cosine_similarity([mlp_a], [mlp_b])[0][0]
        sim_gru = cosine_similarity([gru_a], [gru_b])[0][0]

        plt.figure(figsize=(6, 2.8))
        plt.barh(['GRU Similarity', 'MLP Similarity'], [sim_gru, sim_mlp], color=['#2ca02c', '#ff7f0e'])
        plt.title("Team Embedding Similarity (Cosine)")
        # Cosine similarity lies in [-1, 1]; widen the axis when a value is
        # negative so the bar is not clipped out of view.
        plt.xlim(-1 if min(sim_gru, sim_mlp) < 0 else 0, 1)
        plt.xlabel("Similarity")
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Similarity comparison failed: {e}")

In [10]:
# =========================
# SHAP (robust KernelExplainer)
# =========================
# Module-level cache: get_shap_explainer() fills both names on its first call and
# returns them unchanged on later calls. Re-running this cell resets the cache.
_shap_explainer = None
_shap_background = None

def _build_shap_background(gru_df, mlp_df, gru_cols, mlp_cols, all_weeks_sorted, n=30, seed=42):
    """
    Build up to ``n`` background rows for Kernel SHAP.

    Rows are produced by randomly sampling (away_team, home_team, week)
    combinations and keeping those for which ``build_feature_vector``
    succeeds. At most ``n * 20`` combinations are tried.

    Parameters
    ----------
    gru_df, mlp_df, gru_cols, mlp_cols, all_weeks_sorted
        Forwarded to ``build_feature_vector``.
    n : int
        Target number of background rows.
    seed : int
        Seed for the random generator, so the sample is repeatable.

    Returns
    -------
    np.ndarray
        float32 array of shape (k, n_features) with 1 <= k <= n. When no
        valid combination is found a single all-zero row is returned and a
        warning is emitted.
    """
    import warnings

    rng = np.random.default_rng(seed)
    expected_dim = len(gru_cols) * 2 + len(mlp_cols) * 2

    # weeks that have a valid previous week
    weeks = [w for w in sorted(gru_df['seasonweek'].unique()) if get_previous_week(w, all_weeks_sorted)]
    # only teams present in both embedding tables can form a full feature vector
    teams = sorted(set(gru_df['team'].unique()) & set(mlp_df['team'].unique()))
    rows = []

    # keep sampling until we have n rows (or we run out of attempts)
    tries = 0
    while len(rows) < n and tries < n * 20 and weeks and len(teams) > 1:
        tries += 1
        w = rng.choice(weeks)
        a, b = rng.choice(teams), rng.choice(teams)
        if a == b:
            continue
        try:
            x, _ = build_feature_vector(a, b, w, mlp_df, gru_df, gru_cols, mlp_cols, all_weeks_sorted)
        except Exception:
            # some combos might be missing prev_week embeddings; just skip
            continue
        if x.shape[1] != expected_dim:
            # a malformed row would make np.vstack fail for the whole background
            continue
        rows.append(x[0])

    if not rows:
        # Last-resort zero row with correct dimensionality. Make it visible,
        # because attributions against an all-zero baseline are hard to interpret.
        warnings.warn(
            "No valid SHAP background rows could be sampled; using a single all-zero row."
        )
        rows = [np.zeros(expected_dim, dtype=np.float32)]
    return np.vstack(rows).astype(np.float32)


def get_shap_explainer(model, gru_df, mlp_df, gru_cols, mlp_cols, all_weeks_sorted):
    """
    Return ``(explainer, background)`` for Kernel SHAP, building them on first use.

    The pair is stored in the module-level ``_shap_explainer`` /
    ``_shap_background`` names. Once an explainer exists it is returned
    as-is: the arguments of later calls are not compared against the ones
    used to build it.
    """
    global _shap_explainer, _shap_background

    if _shap_explainer is None:
        background = _build_shap_background(
            gru_df, mlp_df, gru_cols, mlp_cols, all_weeks_sorted, n=30
        )

        def predict_flat(batch):
            # KernelExplainer calls this with a batch of rows; coerce to a float32
            # array and return one value per row as a 1-D array.
            batch = np.array(batch, dtype=np.float32)
            return model.predict(batch, verbose=0).flatten()

        _shap_explainer = shap.KernelExplainer(predict_flat, background)
        _shap_background = background

    return _shap_explainer, _shap_background


def plot_shap_waterfall(team_a, team_b, week,
                        model, mlp_df, gru_df, gru_cols, mlp_cols, all_weeks_sorted,
                        max_display=12, out_path=None):
    """
    Compute Kernel SHAP values for one (away, home, week) input and draw a
    waterfall plot.

    Parameters
    ----------
    team_a, team_b, week
        Matchup to explain (team_a is the away team, team_b the home team).
    model, mlp_df, gru_df, gru_cols, mlp_cols, all_weeks_sorted
        Forwarded to ``build_feature_vector`` and ``get_shap_explainer``.
    max_display : int
        Maximum number of features shown in the plot.
    out_path : str or None
        When given, the figure is also saved to this path.

    Returns
    -------
    shap.Explanation
        The explanation object that was plotted.

    Raises
    ------
    ValueError
        Propagated from ``build_feature_vector`` when the input cannot be built.
    """
    # Build instance to explain
    x, fnames = build_feature_vector(team_a, team_b, week,
                                     mlp_df, gru_df, gru_cols, mlp_cols, all_weeks_sorted)

    explainer, _ = get_shap_explainer(model, gru_df, mlp_df, gru_cols, mlp_cols, all_weeks_sorted)

    # nsamples trades fidelity for speed: increase for fidelity (slower)
    shap_vals = explainer.shap_values(x, nsamples=200)
    if isinstance(shap_vals, list):
        shap_vals = shap_vals[0]   # safety if older SHAP returns lists
    shap_vals = shap_vals[0]       # first (only) explained row -> (n_features,)

    # The waterfall plot needs a scalar base value; collapse a 0-d or length-1
    # array defensively.
    base_val = float(np.ravel(explainer.expected_value)[0])

    # Labels must line up one-to-one with the SHAP values; use generic names
    # rather than failing if they do not.
    if len(fnames) != len(shap_vals):
        fnames = [f"feature_{i}" for i in range(len(shap_vals))]

    exp = shap.Explanation(
        values=shap_vals,
        base_values=base_val,
        data=x[0],
        feature_names=fnames
    )

    plt.figure(figsize=(8, 6), dpi=150)
    # show=False so the title can be added and the figure optionally saved first
    shap.plots.waterfall(exp, max_display=max_display, show=False)
    plt.title(f"SHAP Waterfall — Away: {team_a} vs Home: {team_b} — {week//100} W{week%100}")
    plt.tight_layout()
    if out_path:
        plt.savefig(out_path, bbox_inches='tight')
    plt.show()

    return exp

In [11]:
# =========================
# Widgets (Away/Home + Week)
# =========================
# NOTE: Using MLP teams list as canonical team list
team_list = mlp_embedding_df['team'].tolist()

away_team_widget = widgets.Dropdown(
    options=team_list,
    description="Away Team:"
)
# Start the home selector on a different team than the away selector (which
# defaults to the first option), so the initial state is a real matchup.
if len(team_list) > 1:
    default_home_team = team_list[1]
elif team_list:
    default_home_team = team_list[0]
else:
    default_home_team = None

home_team_widget = widgets.Dropdown(
    options=team_list,
    value=default_home_team,
    description="Home Team:"
)

# Options are (label, value) pairs: the label is human-readable, the value is
# the integer seasonweek handed to the prediction code.
week_widget = widgets.Dropdown(
    options=[(f"{w // 100} Week {w % 100}", w) for w in week_choices],
    value=week_choices[0] if week_choices else None,
    description="Week:"
)

# All prediction results (text and plots) are rendered into this output area.
output = widgets.Output()

In [12]:
# =========================
# Button Handler
# =========================
def on_button_click(b):
    """
    Click handler for the predict button.

    Reads the current widget selections and renders, inside ``output``:
    the prediction message, the cosine-similarity chart, and the SHAP
    waterfall. ``b`` is the button instance supplied by ipywidgets and is
    not used.
    """
    with output:
        clear_output()

        away = away_team_widget.value
        home = home_team_widget.value
        week = week_widget.value

        # A team cannot play itself; stop before running the model.
        if away == home:
            print("⚠️ Away and Home teams must be different.")
            return

        # 1) Prediction text
        msg, prob = predict_game(away, home, week, interaction_model,
                                 mlp_embedding_df, gru_embedding_df,
                                 gru_embedding_cols, mlp_embedding_cols)
        print(msg)

        # 2) Cosine similarity chart
        compare_embeddings(away, home, mlp_embedding_df, gru_embedding_df)

        # 3) SHAP waterfall
        if prob is None:
            # The prediction already failed and its error is shown above; SHAP
            # needs the same inputs and would only repeat that failure.
            return

        print("\nSHAP Explanation (feature contributions to the prediction):")
        try:
            plot_shap_waterfall(away, home, week,
                                interaction_model,
                                mlp_embedding_df, gru_embedding_df,
                                gru_embedding_cols, mlp_embedding_cols,
                                all_weeks,
                                max_display=12)
        except Exception as e:
            print(f"SHAP explanation failed: {e}")


# =========================
# Predict Button
# =========================
# Create the button and register on_button_click as its click handler.
# Nothing is displayed here; the button is placed into the tab layout in a later cell.
predict_button = widgets.Button(description="Predict Game")
predict_button.on_click(on_button_click)

In [13]:
# ---------------------------
# Data Explorer (Tab 2): Load historical data
# ---------------------------
try:
    nfl_hist = pd.read_csv('hist_data_TAB.csv')
except Exception as e:
    # Fall back to an empty frame so the failure is reported in this cell
    # instead of raising.
    print(f"Could not load hist_data_TAB.csv: {e}")
    nfl_hist = pd.DataFrame()

# Keep a clean copy; ensure Seasonweek & Team exist (used for plotting/filtering)
if not nfl_hist.empty:
    if "Seasonweek" not in nfl_hist.columns or "Team" not in nfl_hist.columns:
        print("⚠️ 'hist_data_TAB.csv' must contain 'Seasonweek' and 'Team' columns for the explorer.")
    # Identify numeric feature columns (drop things you don’t want in feature list)
    drop_cols = {"Seasonweek", "Result", "Team"}
    numeric_cols = [
        c for c in nfl_hist.columns
        if c not in drop_cols and pd.api.types.is_numeric_dtype(nfl_hist[c])
    ]
else:
    numeric_cols = []

In [14]:
# Ensure Seasonweek is consistently formatted as YYYYWW with two digits
def normalize_seasonweek(val):
    """
    Return ``val`` as a 'YYYYWW' string with a two-digit week.

    A five-character value such as 20249 is read as year 2024, week 9 and
    becomes '202409'. Values of any other length are returned as their
    plain string form. Integral floats (e.g. 20249.0, which is what a
    numeric column containing missing values yields) are treated like the
    corresponding integer, so the result never carries a trailing '.0'.
    """
    if isinstance(val, float) and float(val).is_integer():
        val = int(val)
    s = str(val)
    if len(s) == 5:  # e.g. '20249'
        return s[:4] + s[4:].zfill(2)  # '202409'
    return s

# The load cell falls back to an empty DataFrame when the CSV cannot be read;
# in that case there is no 'Seasonweek' column to normalize.
if "Seasonweek" in nfl_hist.columns:
    nfl_hist["Seasonweek"] = nfl_hist["Seasonweek"].apply(normalize_seasonweek).astype(int)

In [15]:
# ==============================
# Data Explorer with Week Range
# ==============================

# ---------- Helpers ----------
def format_seasonweek(sw: int) -> str:
    """Render a YYYYWW seasonweek for display, e.g. 202409 -> '2024 W09'."""
    season, week = divmod(sw, 100)
    return "{} W{:02d}".format(season, week)

def valid_week_ints(df, min_week=202201):
    """
    Return the sorted unique ``Seasonweek`` values of ``df`` that are
    at or after ``min_week``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Seasonweek' column of YYYYWW integers.
    min_week : int, optional
        Earliest seasonweek to keep (inclusive). Defaults to 202201.

    Returns
    -------
    list
        Ascending list of the qualifying seasonweek values.
    """
    return [w for w in sorted(df['Seasonweek'].unique()) if w >= min_week]

# Ensure a display-friendly x-axis label for Seasonweek
nfl_hist = nfl_hist.copy()
# The frame has no columns when the CSV failed to load; every column access
# below is guarded so this cell still builds (empty) widgets in that case.
if 'Seasonweek' in nfl_hist.columns:
    nfl_hist['Seasonweek_str'] = nfl_hist['Seasonweek'].apply(format_seasonweek)

# ---------- Widgets ----------
# Feature dropdown: all numeric columns except excluded
excluded_cols = {'Seasonweek', 'Seasonweek_str', 'Team', 'Result'}
numeric_cols = [c for c in nfl_hist.columns
                if c not in excluded_cols and pd.api.types.is_numeric_dtype(nfl_hist[c])]

feature_select = widgets.Dropdown(
    options=numeric_cols,
    value=numeric_cols[0] if numeric_cols else None,
    description='Feature:'
)

team_options = ['All'] + (
    sorted(nfl_hist['Team'].unique().tolist()) if 'Team' in nfl_hist.columns else []
)
team_select = widgets.Dropdown(
    options=team_options,
    value='All',
    description='Team:'
)

# Week range dropdowns. week_values can be empty (no data, or no week at or
# after the cutoff used by valid_week_ints); fall back to no selection, which
# render_data_explorer reports to the user.
week_values = valid_week_ints(nfl_hist) if 'Seasonweek' in nfl_hist.columns else []
start_week_widget = widgets.Dropdown(
    options=[(format_seasonweek(w), w) for w in week_values],
    value=week_values[0] if week_values else None,
    description='Start Week:'
)
end_week_widget = widgets.Dropdown(
    options=[(format_seasonweek(w), w) for w in week_values],
    value=week_values[-1] if week_values else None,
    description='End Week:'
)

# Output area
data_output = widgets.Output()

# ---------- Renderer ----------
def render_data_explorer(*_):
    """
    Redraw the Data Explorer output for the current widget selections.

    Accepts and ignores any positional arguments so it can be used directly
    as a widget observer. Renders, inside ``data_output``, a line chart of
    the selected feature over the selected week range (one line per team
    when 'All' is selected) followed by descriptive statistics of the same
    filtered rows. Invalid selections produce a short message instead.
    """
    with data_output:
        clear_output()
        feat = feature_select.value
        team = team_select.value
        w_start = start_week_widget.value
        w_end   = end_week_widget.value

        # Basic guardrails
        if feat is None:
            print("No numeric feature available to plot.")
            return
        if w_start is None or w_end is None:
            print("Please select both Start Week and End Week.")
            return
        if w_start > w_end:
            print("⚠️ Start Week must be <= End Week.")
            return

        # Filter by week range (and team, optionally)
        df_range = nfl_hist[(nfl_hist['Seasonweek'] >= w_start) & (nfl_hist['Seasonweek'] <= w_end)].copy()
        if df_range.empty:
            print("No data in the selected week range.")
            return

        if team != 'All':
            df_range = df_range[df_range['Team'] == team].copy()
            color_arg = None  # single line
            title_team = team
            # The team filter can remove every remaining row; avoid an empty chart.
            if df_range.empty:
                print("No data for the selected team in the selected week range.")
                return
        else:
            color_arg = 'Team'
            title_team = 'All Teams'

        # Sort by Seasonweek for a clean line plot
        df_range = df_range.sort_values('Seasonweek')

        # The x-axis uses string labels, so pin their order explicitly to the
        # chronological order established by the sort above.
        week_order = df_range['Seasonweek_str'].drop_duplicates().tolist()

        # Line chart
        fig = px.line(
            df_range,
            x='Seasonweek_str',
            y=feat,
            color=color_arg,
            title=f"{feat} — {title_team} — {format_seasonweek(w_start)} to {format_seasonweek(w_end)}",
            markers=True,
            category_orders={'Seasonweek_str': week_order}
        )
        fig.update_layout(
            xaxis_title="Seasonweek",
            yaxis_title=feat,
            legend_title="Team" if color_arg else None,
            height=480
        )
        display(fig)

        # Descriptive statistics (within filtered range)
        # If 'All' teams: we show stats for all rows; if specific team, stats for that team segment
        stats = df_range[feat].describe().to_frame().rename(columns={feat: 'Value'})
        print("Descriptive Statistics (filtered):")
        display(stats.T.style.format("{:.3f}"))

# ---------- Wire up events ----------
# render_data_explorer accepts and ignores the change payload, so it can be
# registered directly. names='value' already restricts the callback to value
# changes, which makes an extra check on change['name'] unnecessary.
for w in (feature_select, team_select, start_week_widget, end_week_widget):
    w.observe(render_data_explorer, names='value')

# ---------- Layout and initial render ----------
# Two rows of controls stacked above the explorer's output area.
controls_row1 = HBox(children=[feature_select, team_select])
controls_row2 = HBox(children=[start_week_widget, end_week_widget])
data_explorer_ui = VBox(children=[controls_row1, controls_row2, data_output])

# If you are using tabs, add this `data_explorer_ui` in your 'Data Explorer' tab content
#display(data_explorer_ui)
#render_data_explorer()


In [16]:
# ---------------------------
# Tab 1 (existing app) container
# ---------------------------
tab1_box = VBox([
    Label("Prediction & Explanations"),
    HBox([away_team_widget, home_team_widget, week_widget]),
    predict_button,
    output
])

# ---------------------------
# Tab control
# ---------------------------
tabs = widgets.Tab(children=[tab1_box, data_explorer_ui])
tabs.set_title(0, "Predict & Explain")
tabs.set_title(1, "Data Explorer")

# Draw the explorer once up front; otherwise its tab stays blank until the
# user changes one of its dropdowns (the observers only fire on changes).
render_data_explorer()

display(tabs)

Tab(children=(VBox(children=(Label(value='Prediction & Explanations'), HBox(children=(Dropdown(description='Aw…