In [None]:
# 🚀 Auto-setup: installs deps + configures CFBD access
# (all of that work is delegated to the sibling script executed below)
%run ./_auto_setup.py


In [None]:
# 05_matchup_predictor.ipynb
# 🏈 Simple Game Outcome Predictor Using EPA & Success Rate

# 🛠 Requirements:
# - pandas, scikit-learn, matplotlib, seaborn (install via `pip install pandas scikit-learn matplotlib seaborn`)

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Import starter pack configuration system.
# The `config` package is looked up relative to the current working directory,
# so that directory has to be on sys.path before the import further down.
import sys
from pathlib import Path
_config_dir = Path().resolve() / "config"
# `_config_dir.parent` is the resolved working directory itself; the membership
# check keeps sys.path from growing each time this cell is re-run.
if str(_config_dir.parent) not in sys.path:
    sys.path.insert(0, str(_config_dir.parent))
from config.data_config import get_starter_pack_config

# Get configuration
config = get_starter_pack_config()
# Stored as a plain string because later cells build file paths with os.path.join.
DATA_DIR = str(config.data_dir)

# Notebook-wide plotting defaults (style sheet and default figure size).
plt.style.use('fivethirtyeight')
plt.rcParams["figure.figsize"] = [15,8]


In [None]:
# 📂 Read the full game log and this season's advanced team stats

games = pd.read_csv(os.path.join(DATA_DIR, "games.csv"))
stats = pd.read_csv(
    os.path.join(DATA_DIR, "advanced_season_stats", f"{config.current_year}.csv")
)

# Keep only current-season games in which both sides are classified as FBS
games_current = games.loc[
    (games["season"] == config.current_year)
    & (games["home_classification"] == "fbs")
    & (games["away_classification"] == "fbs")
]
print(f"{games_current.shape[0]} FBS vs. FBS games in {config.current_year}")

In [None]:
# 🔗 Attach each team's season stats to every game (home side first, then away)

# Keep only the join key and the four efficiency metrics used as model inputs
stats = stats[["team", "offense_ppa", "defense_ppa", "offense_successRate", "defense_successRate"]].copy()


def _with_team_stats(games_df, team_stats, side):
    """Return `games_df` with the `side` team's stats added as `{side}_*` columns.

    `side` is "home" or "away". The join is an inner merge of `{side}_team`
    against `team`, so games whose team has no stats row are dropped. The
    helper `team` key column is removed again after the merge.
    """
    prefixed = {col: f"{side}_{col}" for col in team_stats.columns if col != "team"}
    merged = games_df.merge(team_stats, left_on=f"{side}_team", right_on="team", how="inner")
    return merged.rename(columns=prefixed).drop("team", axis=1)


games_stats = _with_team_stats(games_current, stats, "home")
games_stats = _with_team_stats(games_stats, stats, "away")

print(f"{len(games_stats)} games with full stat coverage")


In [None]:
# 🧠 Feature engineering: create stat differentials

# Only games with a final score can be labelled. Without this filter, a row whose
# points are missing (e.g. a game that has not been played yet) would be tagged
# as a home loss, because any comparison against NaN evaluates to False.
games_stats = games_stats.dropna(subset=["home_points", "away_points"])

# Each feature pits one team's unit against the opposing unit it lines up against:
# home offense vs. away defense, and home defense vs. away offense.
games_stats["ppa_diff"] = games_stats["home_offense_ppa"] - games_stats["away_defense_ppa"]
games_stats["ppa_allowed_diff"] = games_stats["home_defense_ppa"] - games_stats["away_offense_ppa"]
games_stats["successRate_diff"] = games_stats["home_offense_successRate"] - games_stats["away_defense_successRate"]
games_stats["successRate_allowed_diff"] = games_stats["home_defense_successRate"] - games_stats["away_offense_successRate"]

# Target: did home team win? (1 = home win, 0 = otherwise)
games_stats["home_win"] = (games_stats["home_points"] > games_stats["away_points"]).astype(int)

# Column order here is the order the model is trained on
feature_cols = ["ppa_diff", "ppa_allowed_diff", "successRate_diff", "successRate_allowed_diff"]
X = games_stats[feature_cols]
y = games_stats["home_win"]


In [None]:
# 🤖 Hold out 20% of the games for evaluation and fit on the remainder

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

# fit() hands back the estimator itself, so construction and training chain
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out games
y_pred = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Model Accuracy: {acc:.2%}")


In [None]:
# 📊 Confusion matrix

# Pin the class order (0 = home loss, 1 = home win) so the matrix is always 2x2.
# Without `labels`, a test split that happens to contain a single class yields a
# smaller matrix and the hard-coded "Loss"/"Win" tick labels no longer line up.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

outcome_names = ["Loss", "Win"]  # same order as labels=[0, 1]
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=outcome_names, yticklabels=outcome_names)
plt.title("Confusion Matrix: Home Team Win Prediction")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# 🔍 Predict a specific matchup (example)

def predict_matchup(home_team, away_team):
    """Print the model's estimated probability that `home_team` beats `away_team`.

    Builds the four differential features from the season-level `stats` table
    and scores them with the fitted `model`; both are notebook-level variables
    that must already exist when this function is called.

    Args:
        home_team: Team name exactly as it appears in the `team` column of `stats`.
        away_team: Team name exactly as it appears in the `team` column of `stats`.

    Raises:
        IndexError: If either team has no row in `stats`. The message names the
            team that could not be found.
    """
    def team_row(team_name):
        # Fetch one team's stats row, failing with a readable message instead of
        # the bare "indexer is out-of-bounds" that .iloc[0] gives on no match.
        rows = stats[stats["team"] == team_name]
        if rows.empty:
            raise IndexError(f"No season stats found for team {team_name!r}")
        return rows.iloc[0]

    home = team_row(home_team)
    away = team_row(away_team)

    # Same feature names, in the same order, as the differentials built for training.
    data = {
        "ppa_diff": home["offense_ppa"] - away["defense_ppa"],
        "ppa_allowed_diff": home["defense_ppa"] - away["offense_ppa"],
        "successRate_diff": home["offense_successRate"] - away["defense_successRate"],
        "successRate_allowed_diff": home["defense_successRate"] - away["offense_successRate"]
    }

    input_df = pd.DataFrame([data])
    # One input row -> one output row; index 1 is the second class, which is
    # home_win == 1 provided both outcomes were present in the training data.
    prob = model.predict_proba(input_df)[0][1]
    print(f"Probability {home_team} beats {away_team}: {prob:.2%}")

# Example (team names are matched by exact equality against the `team` column of `stats`)
predict_matchup("Michigan", "Ohio State")


In [None]:
# ✅ Summary

# In this notebook, you:
# - Joined team stats with game outcomes
# - Built a basic logistic regression model using stat differentials
# - Predicted outcomes and evaluated model accuracy
# - Created a reusable function to simulate future matchups

# 🧪 Try This:
# - Add features like talent composite, pass/rush rate, tempo
# - Use different models (RandomForest, XGBoost)
# - Predict score differential instead of binary win/loss

# 🔗 Bridge to Model Pack

## 🚀 Next Steps: From Basic Prediction to Machine Learning

Congratulations! You've built a basic prediction model. Here's how to take it to the next level:

### What You Just Learned → ML Features

Your starter pack model uses:
- `ppa_diff` → In model pack, this becomes `home_adjusted_epa - away_adjusted_epa_allowed`
- `successRate_diff` → In model pack, this maps to `home_adjusted_success` and `away_adjusted_success`
- Basic stats → Model pack uses **86 opponent-adjusted features** for better accuracy

### Explore Weekly Training Data

See how your concepts become ML features:

```python
# Load weekly training data to see the 86-feature format
import pandas as pd
weekly = pd.read_csv('../training_data_2025_week01.csv')
print(f"Week 1: {len(weekly)} games, {len(weekly.columns)} features")

# Compare your features to ML features
print("\nYour features → ML equivalent:")
print("  ppa_diff → home_adjusted_epa, away_adjusted_epa_allowed")
print("  successRate_diff → home_adjusted_success, away_adjusted_success")

# See the actual ML features
print("\nSample ML features:")
print(weekly[['home_adjusted_epa', 'away_adjusted_epa', 
              'home_adjusted_success', 'away_adjusted_success',
              'home_elo', 'away_elo', 'spread']].head())
```

### Try Model Pack

1. **Linear Regression Model**: `../model_pack/01_linear_regression_margin.ipynb`
   - Uses your concepts but with opponent-adjusted features
   - Predicts score margin instead of just win/loss
   - Better accuracy with 86 features!

2. **XGBoost Model**: `../model_pack/03_xgboost_win_probability.ipynb`
   - Advanced ML model using all 86 features
   - Provides win probabilities and confidence intervals

### Get Agent Guidance

Use the Learning Navigator Agent for personalized help:

```python
from agents.analytics_orchestrator import AnalyticsOrchestrator, AnalyticsRequest

orchestrator = AnalyticsOrchestrator()
request = AnalyticsRequest(
    user_id='your_id',
    query='Bridge me from matchup predictor to model pack',
    query_type='learning',
    parameters={'current_notebook': '05_matchup_predictor.ipynb'},
    context_hints={'role': 'data_scientist'}
)
response = orchestrator.process_analytics_request(request)
print(response.insights)
```

### Key Differences

| Starter Pack (This Notebook) | Model Pack |
|------------------------------|------------|
| 4 basic features | 86 opponent-adjusted features |
| 55% accuracy | 65-70% accuracy |
| Basic stats | Schedule-adjusted metrics |
| Win/loss prediction | Margin + win probability |
| Current season only | 2016-2025 (Week 5+) |

### Recommended Learning Path

1. ✅ **You Are Here**: Basic prediction with simple features
2. **Next**: Explore `../training_data_2025_week01.csv` to see feature format
3. **Then**: Try `../model_pack/01_linear_regression_margin.ipynb`
4. **Finally**: Understand feature importance in `../model_pack/06_shap_interpretability.ipynb`

---

💡 **Tip**: The weekly training data files (`training_data_2025_week*.csv`) show exactly how
starter pack metrics become the 86 features used in ML models. Week 5+ data is used for
temporal validation (no future data leakage).
