# A Bayesian Analysis of Team Performance Metrics in Major League Baseball

by Mecchia Alessandro and Sergio Fernandez Diz

## 1. Imports

In [None]:
import os
# Set the flag before pymc is imported further down in this cell, so it is already in the
# environment when the modelling stack loads.
# NOTE(review): an empty `cxx` value presumably tells PyTensor not to use a C++ compiler
# (pure-Python fallback) — confirm against the PyTensor configuration docs.
os.environ["PYTENSOR_FLAGS"] = "cxx="

from pybaseball import statcast
import pandas as pd
from pybaseball import cache
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pybaseball import playerid_reverse_lookup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import plotly.graph_objects as go
import seaborn as sns
import base64
import codecs
import pymc as pm
import arviz as az
import statsmodels.api as sm
from IPython.display import display
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Interactive DataFrame rendering via itables, enabled for the whole notebook.
from itables import init_notebook_mode,show
init_notebook_mode(all_interactive=True)


# Default Plotly renderer used by every figure shown in this notebook.
import plotly.io as pio
pio.renderers.default = "notebook_connected"

# Apply the "arviz-doc" plotting style through ArviZ.
az.style.use("arviz-doc")


## 2. Introduction

Baseball is a team sport in which two teams alternate between offense and defense: the goal is to hit the ball with a bat and score points by running between four bases. It is a game heavily based on strategy, individual skill, and statistics, especially in the pitcher-batter matchup. It is one of the most data-driven sports, in which every action generates detailed statistics used to evaluate performance, strategies, and probabilities of success.

### Moneyball and the Rise of Sabermetrics

In 2003, the book *Moneyball: The Art of Winning an Unfair Game* was published about the Oakland Athletics baseball team and its general manager Billy Beane. It describes the team's sabermetric approach to assembling a competitive baseball team on a small budget. The central premise of Moneyball is that the collective wisdom of baseball insiders (including players, managers, coaches, scouts, and executives) over the past century is outdated, subjective, and often wrong, and that the statistics traditionally used to evaluate players, such as stolen bases, runs batted in, and batting average, are relics of a 19th-century view of the game. Sabermetrics and statistical analysis have shown, for example, that on-base percentage and slugging percentage are better measures of hitting. The Oakland A's began looking for players who were “undervalued by the market,” meaning that their salaries were low relative to their ability to contribute to wins, as measured by these advanced statistics. This approach proved to be highly effective, as the Athletics achieved sustained competitive success despite operating with one of the lowest payrolls in Major League Baseball.

Wikipedia contributors. (2024). Moneyball. [In Wikipedia, The Free Encyclopedia](https://en.wikipedia.org/wiki/Moneyball)

In this project, we use a Bayesian inference approach to analyze metrics related to the Moneyball philosophy, both at the team and individual player level. We compare traditional statistics with advanced metrics such as OBP, SLG, and OPS, evaluating their ability to explain and predict success in terms of wins. The use of Bayesian models allows us to quantify uncertainty and update performance estimates in a manner consistent with the dynamic nature of the game.

## 3. Data Collection

To obtain the datasets used in this project, we relied primarily on pybaseball, a Python library that automates data collection from major public baseball databases such as Baseball Reference, Baseball Savant, and FanGraphs. The library provides access to a wide range of data, including Statcast pitch-level data, batting and pitching statistics, team standings, awards, and other advanced metrics, both at the individual pitch level and in aggregated form over seasons or custom time periods.
In addition, team-level data for the 2020–2024 seasons were manually collected from [Baseball-Reference](https://www.baseball-reference.com) to complement and validate the datasets obtained through the library.

### 3.1 Event Statcast

The `statcast` function retrieves pitch-level Statcast data for a given date or range of dates.

In [None]:
# Turn on pybaseball's cache (the `cache` object imported from pybaseball above).
cache.enable()

In [None]:
# The Statcast pitch-level data is read from a local Parquet snapshot.
# The commented-out call below is the original download through pybaseball's `statcast`
# (2025-03-01 to 2025-10-30); uncomment it to rebuild the snapshot.
#df = statcast(start_dt='2025-03-01', end_dt='2025-10-30')
df = pd.read_parquet("data/statcast_2025.parquet")
# One-off export of the downloaded frame.
# NOTE(review): this writes to the working directory, while the read above expects the
# file under "data/" — move the file or adjust the path after re-exporting.
#df.to_parquet("statcast_2025.parquet", compression="snappy")

### 3.2 Players

In [None]:
# Season-level player statistics for 2024, read from local CSV snapshots.
# Priors: we use the pitcher_stats and batting_stats functions to obtain performance data for pitchers and batters.
# We also use them for EDA. Later, we will use Statcast to obtain more detailed data on specific games.
# The commented-out calls below appear to be how the snapshots were originally obtained.
# NOTE(review): they use a `pb` alias that the import cell above does not define;
# `import pybaseball as pb` would be needed to re-run them.
#batter_stats = pb.batting_stats_bref(2024)
#pitcher_stats = pb.pitching_stats_bref(2024)
batter_stats = pd.read_csv('data/batter_stats_2024.csv')
pitcher_stats = pd.read_csv('data/pitcher_stats_2024.csv')
#batter_stats.head()

## 4. Data Exploration

### 4.1 Data Description

#### Statcast dataset

<style>
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        margin-top: 10px;
    }
    .pitch-table th, .pitch-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .pitch-table th {
        background-color: #f8f9fa;
        font-weight: bold;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px; border: 1px solid #dee2e6;">
        Pitch Characteristics
    </summary>
    <div style="overflow-x: auto;">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Category</th>
                    <th>Variable</th>
                    <th>Description</th>
                    <th>Level</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Pitch</td>
                    <td><code>pitch_type</code></td>
                    <td>Type of pitch thrown (e.g. FF, SL, CH)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Pitch</td>
                    <td><code>release_speed</code></td>
                    <td>Pitch velocity at release (mph)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Pitch</td>
                    <td><code>effective_speed</code></td>
                    <td>Effective pitch velocity</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Pitch</td>
                    <td><code>release_spin_rate</code></td>
                    <td>Spin rate at release (rpm)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Pitch</td>
                    <td><code>spin_axis</code></td>
                    <td>Axis of spin of the pitch</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Pitch</td>
                    <td><code>release_extension</code></td>
                    <td>Distance from rubber at release (ft)</td>
                    <td>Pitch</td>
                </tr>
            </tbody>
        </table>
    </div>
</details>

<style>
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        margin-top: 10px;
    }
    .pitch-table th, .pitch-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .pitch-table th {
        background-color: #f8f9fa;
        font-weight: bold;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px; border: 1px solid #dee2e6;">
        Ball Movement and Location
    </summary>
    <div style="overflow-x: auto;">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Category</th>
                    <th>Variable</th>
                    <th>Description</th>
                    <th>Level</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Movement</td>
                    <td><code>pfx_x</code></td>
                    <td>Horizontal pitch movement (inches)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Movement</td>
                    <td><code>pfx_z</code></td>
                    <td>Vertical pitch movement (inches)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Location</td>
                    <td><code>plate_x</code></td>
                    <td>Horizontal location at plate</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Location</td>
                    <td><code>plate_z</code></td>
                    <td>Vertical location at plate</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Zone</td>
                    <td><code>zone</code></td>
                    <td>Strike zone location (1–14)</td>
                    <td>Pitch</td>
                </tr>
            </tbody>
        </table>
    </div>
</details>


<style>
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        margin-top: 10px;
    }
    .pitch-table th, .pitch-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .pitch-table th {
        background-color: #f8f9fa;
        font-weight: bold;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px; border: 1px solid #dee2e6;">
        Batted Ball and Contact
    </summary>
    <div style="overflow-x: auto;">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Category</th>
                    <th>Variable</th>
                    <th>Description</th>
                    <th>Level</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Contact</td>
                    <td><code>launch_speed</code></td>
                    <td>Exit velocity of batted ball (mph)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Contact</td>
                    <td><code>launch_angle</code></td>
                    <td>Launch angle (degrees)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Contact</td>
                    <td><code>hit_distance_sc</code></td>
                    <td>Estimated hit distance (feet)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Contact</td>
                    <td><code>bb_type</code></td>
                    <td>Batted ball type (groundball, flyball, etc.)</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Contact</td>
                    <td><code>launch_speed_angle</code></td>
                    <td>Launch speed/angle classification</td>
                    <td>Pitch</td>
                </tr>
            </tbody>
        </table>
    </div>
</details>


<style>
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        margin-top: 10px;
    }
    .pitch-table th, .pitch-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .pitch-table th {
        background-color: #f8f9fa;
        font-weight: bold;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px; border: 1px solid #dee2e6;">
        Expected and Value Metrics
    </summary>
    <div style="overflow-x: auto;">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Category</th>
                    <th>Variable</th>
                    <th>Description</th>
                    <th>Level</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Expected</td>
                    <td><code>estimated_ba_using_speedangle</code></td>
                    <td>Expected batting average</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Expected</td>
                    <td><code>estimated_woba_using_speedangle</code></td>
                    <td>Expected wOBA</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Expected</td>
                    <td><code>estimated_slg_using_speedangle</code></td>
                    <td>Expected slugging percentage</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Value</td>
                    <td><code>woba_value</code></td>
                    <td>Actual wOBA value</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Value</td>
                    <td><code>iso_value</code></td>
                    <td>Isolated power (ISO) value</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Value</td>
                    <td><code>babip_value</code></td>
                    <td>Batting average on balls in play</td>
                    <td>Pitch</td>
                </tr>
            </tbody>
        </table>
    </div>
</details>


<style>
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        margin-top: 10px;
    }
    .pitch-table th, .pitch-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .pitch-table th {
        background-color: #f8f9fa;
        font-weight: bold;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px; border: 1px solid #dee2e6;">
        Game Context
    </summary>
    <div style="overflow-x: auto;">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Category</th>
                    <th>Variable</th>
                    <th>Description</th>
                    <th>Level</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Game</td>
                    <td><code>game_date</code></td>
                    <td>Date of the game</td>
                    <td>Game</td>
                </tr>
                <tr>
                    <td>Game</td>
                    <td><code>inning</code></td>
                    <td>Inning number</td>
                    <td>Game</td>
                </tr>
                <tr>
                    <td>Game</td>
                    <td><code>inning_topbot</code></td>
                    <td>Top or bottom of the inning</td>
                    <td>Game</td>
                </tr>
                <tr>
                    <td>Game</td>
                    <td><code>balls</code></td>
                    <td>Balls count</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Game</td>
                    <td><code>strikes</code></td>
                    <td>Strikes count</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Game</td>
                    <td><code>outs_when_up</code></td>
                    <td>Number of outs</td>
                    <td>Pitch</td>
                </tr>
            </tbody>
        </table>
    </div>
</details>


<style>
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        margin-top: 10px;
    }
    .pitch-table th, .pitch-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .pitch-table th {
        background-color: #f8f9fa;
        font-weight: bold;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px; border: 1px solid #dee2e6;">
        Player Information
    </summary>
    <div style="overflow-x: auto;">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Category</th>
                    <th>Variable</th>
                    <th>Description</th>
                    <th>Level</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Player</td>
                    <td><code>batter</code></td>
                    <td>Batter ID</td>
                    <td>Player</td>
                </tr>
                <tr>
                    <td>Player</td>
                    <td><code>pitcher</code></td>
                    <td>Pitcher ID</td>
                    <td>Player</td>
                </tr>
                <tr>
                    <td>Player</td>
                    <td><code>stand</code></td>
                    <td>Batter handedness</td>
                    <td>Player</td>
                </tr>
                <tr>
                    <td>Player</td>
                    <td><code>p_throws</code></td>
                    <td>Pitcher handedness</td>
                    <td>Player</td>
                </tr>
                <tr>
                    <td>Player</td>
                    <td><code>age_bat</code></td>
                    <td>Batter age</td>
                    <td>Player</td>
                </tr>
                <tr>
                    <td>Player</td>
                    <td><code>age_pit</code></td>
                    <td>Pitcher age</td>
                    <td>Player</td>
                </tr>
            </tbody>
        </table>
    </div>
</details>



<style>
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        margin-top: 10px;
    }
    .pitch-table th, .pitch-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .pitch-table th {
        background-color: #f8f9fa;
        font-weight: bold;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px; border: 1px solid #dee2e6;">
        Win and Run Expectancy
    </summary>
    <div style="overflow-x: auto;">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Category</th>
                    <th>Variable</th>
                    <th>Description</th>
                    <th>Level</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Expectancy</td>
                    <td><code>delta_run_exp</code></td>
                    <td>Change in run expectancy</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Expectancy</td>
                    <td><code>delta_home_win_exp</code></td>
                    <td>Change in home win probability</td>
                    <td>Pitch</td>
                </tr>
                <tr>
                    <td>Expectancy</td>
                    <td><code>home_win_exp</code></td>
                    <td>Home team win probability</td>
                    <td>Game</td>
                </tr>
                <tr>
                    <td>Expectancy</td>
                    <td><code>bat_win_exp</code></td>
                    <td>Batting team win probability</td>
                    <td>Game</td>
                </tr>
            </tbody>
        </table>
    </div>
</details>


#### 4.1.1 Event Statcast

In [None]:
# Print the column dtypes, non-null counts and memory usage of the Statcast frame.
df.info()

In [None]:
# List every column name available in the Statcast frame.
print(df.columns.to_list())

*Note (to be expanded): explain the main variables we will use. Batters and pitchers are identified by numeric IDs rather than by name, so the cells below map the batter IDs to player names.*

In [None]:
# Distinct batter IDs present in the Statcast frame: missing values are dropped
# and the remaining IDs are cast to int before de-duplication.
batter_ids = df['batter'].dropna().astype(int).unique()

In [None]:
# Resolve the MLBAM batter IDs to player names with pybaseball's reverse lookup.
batters = playerid_reverse_lookup(batter_ids, key_type='mlbam')

batters = batters[['key_mlbam', 'name_first', 'name_last']].copy()
# Build the display name as "Last, First" with BOTH parts title-cased.
# The previous version title-cased only the first name and put it first, giving
# names such as "Aaron, judge". "Last, First" is also the format of Statcast's
# `player_name` column (renamed to `pitcher_name` in the next cell), so batter
# and pitcher names now share one convention.
batters['batter_name'] = (
    batters['name_last'].str.title() + ', ' + batters['name_first'].str.title()
)

# Keep only the ID and the display name, using the ID column name expected downstream.
batters = batters[['key_mlbam', 'batter_name']].rename(columns={'key_mlbam': 'batter_id'})
display(batters.head(2))

In [None]:
# Lookup table from batter ID to display name, built from the (batter_id, batter_name) row pairs.
id_to_name = dict(
    batters[['batter_id', 'batter_name']].itertuples(index=False, name=None)
)

# Attach the batter's name to every row through the ID lookup.
df['batter_name'] = df['batter'].map(id_to_name)
# The name column shipped with the data is relabelled as the pitcher's name.
df = df.rename(columns={"player_name": "pitcher_name"})

df.head(2)

In [None]:
# Registries of column names per dataset, keyed by dataset name.
# The Statcast frame is stored under the key "df".
categorical_cols = {
    "df": df.select_dtypes(include=["object", "category", "bool"]).columns.tolist(),
}
numerical_cols = {
    "df": df.select_dtypes(include=["int64", "float64"]).columns.tolist(),
}

#### 4.1.2 Players

##### 4.1.2.1 Pitchers

<style>
    .pitch-container {
        max-height: 350px;
        overflow-y: auto;
        overflow-x: auto;
        border: 1px solid #dee2e6;
        border-radius: 5px;
    }
    .pitch-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        font-size: 0.9em;
    }
    .pitch-table th {
        position: sticky;
        top: 0;
        background-color: #f8f9fa;
        z-index: 10;
        border-bottom: 2px solid #dee2e6;
        padding: 10px;
        text-align: left;
    }
    .pitch-table td {
        padding: 8px;
        border-bottom: 1px solid #eee;
    }
    .pitch-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px;">
        Pitching Statistics
    </summary>
    <div class="pitch-container">
        <table class="pitch-table">
            <thead>
                <tr>
                    <th>Variable</th>
                    <th>Description</th>
                </tr>
            </thead>
            <tbody>
                <tr><td><code>Name</code></td><td>Player name</td></tr>
                <tr><td><code>Lev</code></td><td>Competition level (MLB)</td></tr>
                <tr><td><code>Tm</code></td><td>Team</td></tr>
                <tr><td><code>Age</code></td><td>Player age</td></tr>
                <tr><td><code>#days</code></td><td>Days on active roster</td></tr>
                <tr><td><code>G</code></td><td>Games pitched</td></tr>
                <tr><td><code>GS</code></td><td>Games started</td></tr>
                <tr><td><code>W</code></td><td>Wins</td></tr>
                <tr><td><code>L</code></td><td>Losses</td></tr>
                <tr><td><code>SV</code></td><td>Saves</td></tr>
                <tr><td><code>IP</code></td><td>Innings pitched</td></tr>
                <tr><td><code>H</code></td><td>Hits allowed</td></tr>
                <tr><td><code>R</code></td><td>Runs allowed</td></tr>
                <tr><td><code>ER</code></td><td>Earned runs</td></tr>
                <tr><td><code>BB</code></td><td>Walks allowed</td></tr>
                <tr><td><code>SO</code></td><td>Strikeouts</td></tr>
                <tr><td><code>HR</code></td><td>Home runs allowed</td></tr>
                <tr><td><code>HBP</code></td><td>Hit batters</td></tr>
                <tr><td><code>ERA</code></td><td>Earned run average</td></tr>
                <tr><td><code>AB</code></td><td>At-bats against</td></tr>
                <tr><td><code>2B</code></td><td>Doubles allowed</td></tr>
                <tr><td><code>3B</code></td><td>Triples allowed</td></tr>
                <tr><td><code>IBB</code></td><td>Intentional walks</td></tr>
                <tr><td><code>GDP</code></td><td>Grounded into double plays</td></tr>
                <tr><td><code>SF</code></td><td>Sacrifice flies allowed</td></tr>
                <tr><td><code>SB</code></td><td>Stolen bases allowed</td></tr>
                <tr><td><code>CS</code></td><td>Caught stealing</td></tr>
                <tr><td><code>PO</code></td><td>Pickoffs</td></tr>
                <tr><td><code>BF</code></td><td>Batters faced</td></tr>
                <tr><td><code>Pit</code></td><td>Total pitches</td></tr>
                <tr><td><code>Str</code></td><td>Total strikes</td></tr>
                <tr><td><code>StL</code></td><td>Looking strikes</td></tr>
                <tr><td><code>StS</code></td><td>Swinging strikes</td></tr>
                <tr><td><code>GB/FB</code></td><td>Ground ball to fly ball ratio</td></tr>
                <tr><td><code>LD</code></td><td>Line drives</td></tr>
                <tr><td><code>PU</code></td><td>Pop-ups</td></tr>
                <tr><td><code>WHIP</code></td><td>Walks plus hits per inning pitched</td></tr>
                <tr><td><code>BAbip</code></td><td>Batting average on balls in play</td></tr>
                <tr><td><code>SO9</code></td><td>Strikeouts per 9 innings</td></tr>
                <tr><td><code>SO/W</code></td><td>Strikeout-to-walk ratio</td></tr>
                <tr><td><code>mlbID</code></td><td>Unique MLB player identifier</td></tr>
            </tbody>
        </table>
    </div>
</details>

In [None]:
# Label the preview, then show the first pitcher row as the cell output.
print("Pitcher stats:")
pitcher_stats.iloc[:1]

##### 4.1.2.2 Batters

<style>
    .batter-container {
        max-height: 350px;
        overflow-y: auto;
        overflow-x: auto;
        border: 1px solid #dee2e6;
        border-radius: 5px;
        margin-top: 10px;
    }
    .batter-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        font-size: 0.9em;
    }
    .batter-table th {
        position: sticky;
        top: 0;
        background-color: #f8f9fa;
        z-index: 10;
        border-bottom: 2px solid #dee2e6;
        padding: 10px;
        text-align: left;
    }
    .batter-table td {
        padding: 8px;
        border-bottom: 1px solid #eee;
    }
    .batter-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px;">
        Batter Statistics
    </summary>
    <div class="batter-container">
        <table class="batter-table">
            <thead>
                <tr>
                    <th>Variable</th>
                    <th>Description</th>
                </tr>
            </thead>
            <tbody>
                <tr><td><code>Name</code></td><td>Player name</td></tr>
                <tr><td><code>Lev</code></td><td>Competition level (MLB)</td></tr>
                <tr><td><code>Tm</code></td><td>Team</td></tr>
                <tr><td><code>Age</code></td><td>Player age</td></tr>
                <tr><td><code>#days</code></td><td>Days on active roster</td></tr>
                <tr><td><code>G</code></td><td>Games played</td></tr>
                <tr><td><code>PA</code></td><td>Plate appearances</td></tr>
                <tr><td><code>AB</code></td><td>At-bats</td></tr>
                <tr><td><code>R</code></td><td>Runs scored</td></tr>
                <tr><td><code>H</code></td><td>Hits</td></tr>
                <tr><td><code>2B</code></td><td>Doubles</td></tr>
                <tr><td><code>3B</code></td><td>Triples</td></tr>
                <tr><td><code>HR</code></td><td>Home runs</td></tr>
                <tr><td><code>RBI</code></td><td>Runs batted in</td></tr>
                <tr><td><code>BB</code></td><td>Walks</td></tr>
                <tr><td><code>IBB</code></td><td>Intentional walks</td></tr>
                <tr><td><code>SO</code></td><td>Strikeouts</td></tr>
                <tr><td><code>HBP</code></td><td>Hit by pitch</td></tr>
                <tr><td><code>SH</code></td><td>Sacrifice hits</td></tr>
                <tr><td><code>SF</code></td><td>Sacrifice flies</td></tr>
                <tr><td><code>GDP</code></td><td>Grounded into double plays</td></tr>
                <tr><td><code>SB</code></td><td>Stolen bases</td></tr>
                <tr><td><code>CS</code></td><td>Caught stealing</td></tr>
                <tr><td><code>BA</code></td><td>Batting average</td></tr>
                <tr><td><code>OBP</code></td><td>On-base percentage</td></tr>
                <tr><td><code>SLG</code></td><td>Slugging percentage</td></tr>
                <tr><td><code>OPS</code></td><td>On-base plus slugging</td></tr>
                <tr><td><code>mlbID</code></td><td>Unique MLB player identifier</td></tr>
            </tbody>
        </table>
    </div>
</details>

In [None]:
# Label the preview, then show the first batter row as the cell output.
print("Batter stats:")
batter_stats.iloc[:1]

In [None]:
# Register the categorical and numerical column names of both player tables,
# batters first and pitchers second.
for _name, _frame in (("batter_stats", batter_stats), ("pitcher_stats", pitcher_stats)):
    categorical_cols[_name] = _frame.select_dtypes(
        include=["object", "category", "bool"]
    ).columns.tolist()
    numerical_cols[_name] = _frame.select_dtypes(
        include=["int64", "float64"]
    ).columns.tolist()

# Print both column lists per table; the pitcher header carries the blank separator line.
for _header, _name in (("BATTER_STATS", "batter_stats"), ("\nPITCHER_STATS", "pitcher_stats")):
    print(_header)
    print("Categoriche:", categorical_cols[_name])
    print("Numeriche:", numerical_cols[_name])

#### 4.1.3 Teams

##### 4.1.3.1 Team Batting

<style>
    .team-batting-container {
        max-height: 350px;
        overflow-y: auto;
        overflow-x: auto;
        border: 1px solid #dee2e6;
        border-radius: 5px;
        margin-top: 10px;
    }
    .team-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        font-size: 0.9em;
    }
    .team-table th {
        position: sticky;
        top: 0;
        background-color: #f8f9fa;
        z-index: 10;
        border-bottom: 2px solid #dee2e6;
        padding: 10px;
        text-align: left;
    }
    .team-table td {
        padding: 8px;
        border-bottom: 1px solid #eee;
    }
    .team-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px;">
        Team Batting Statistics
    </summary>
    <div class="team-batting-container">
        <table class="team-table">
            <thead>
                <tr>
                    <th>Variable</th>
                    <th>Description</th>
                </tr>
            </thead>
            <tbody>
                <tr><td><code>#Bat</code></td><td>Number of batters used</td></tr>
                <tr><td><code>BatAge</code></td><td>Average batter age</td></tr>
                <tr><td><code>R/G</code></td><td>Runs scored per game</td></tr>
                <tr><td><code>G</code></td><td>Games played</td></tr>
                <tr><td><code>PA</code></td><td>Plate appearances</td></tr>
                <tr><td><code>AB</code></td><td>At-bats</td></tr>
                <tr><td><code>R</code></td><td>Runs scored</td></tr>
                <tr><td><code>H</code></td><td>Hits</td></tr>
                <tr><td><code>2B</code></td><td>Doubles</td></tr>
                <tr><td><code>3B</code></td><td>Triples</td></tr>
                <tr><td><code>HR</code></td><td>Home runs</td></tr>
                <tr><td><code>RBI</code></td><td>Runs batted in</td></tr>
                <tr><td><code>SB</code></td><td>Stolen bases</td></tr>
                <tr><td><code>CS</code></td><td>Caught stealing</td></tr>
                <tr><td><code>BB</code></td><td>Walks</td></tr>
                <tr><td><code>SO</code></td><td>Strikeouts</td></tr>
                <tr><td><code>BA</code></td><td>Batting average</td></tr>
                <tr><td><code>OBP</code></td><td>On-base percentage</td></tr>
                <tr><td><code>SLG</code></td><td>Slugging percentage</td></tr>
                <tr><td><code>OPS</code></td><td>On-base plus slugging</td></tr>
                <tr><td><code>OPS+</code></td><td>Park-adjusted OPS (league average = 100)</td></tr>
                <tr><td><code>TB</code></td><td>Total bases</td></tr>
                <tr><td><code>GDP</code></td><td>Grounded into double plays</td></tr>
                <tr><td><code>HBP</code></td><td>Hit by pitch</td></tr>
                <tr><td><code>SH</code></td><td>Sacrifice hits</td></tr>
                <tr><td><code>SF</code></td><td>Sacrifice flies</td></tr>
                <tr><td><code>IBB</code></td><td>Intentional walks</td></tr>
                <tr><td><code>LOB</code></td><td>Left on base</td></tr>
            </tbody>
        </table>
    </div>
</details>

In [None]:
# Load every per-season team batting CSV (data/team_batter_stats_<year>.csv)
# and stack them into one frame tagged with the season.
data_folder = 'data/'
season_frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# stable from one machine/run to the next.
for file in sorted(os.listdir(data_folder)):
    if file.startswith('team_batter_stats_') and file.endswith('.csv'):
        # The season is the last "_"-separated token of the name, without the extension.
        year = file.split('_')[-1].split('.')[0]
        temp_df = pd.read_csv(os.path.join(data_folder, file))
        temp_df['year'] = int(year)
        season_frames.append(temp_df)
# Concatenate once instead of once per file (no repeated copying of the
# accumulated frame). With no matching file the result is an empty frame,
# exactly as the incremental version produced.
team_batter_stats = (
    pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
)
team_batter_stats.head(2)

In [None]:
# Drop the "League Average" summary row and rows without a team name.
# .copy() turns the filtered result into an independent frame: columns are
# assigned to team_batter_stats later in the notebook (z_OBP, z_BA), which on a
# plain slice triggers pandas' SettingWithCopyWarning.
team_batter_stats = team_batter_stats[
    (~team_batter_stats["Tm"].isin(["League Average"]))
    & (team_batter_stats["Tm"].notna())
].copy()

##### 4.1.3.2 Team Pitching

<style>
    .team-pitching-container {
        max-height: 350px;
        overflow-y: auto;
        overflow-x: auto;
        border: 1px solid #dee2e6;
        border-radius: 5px;
        margin-top: 10px;
    }
    .team-p-table {
        width: 100%;
        border-collapse: collapse;
        font-family: sans-serif;
        font-size: 0.9em;
    }
    .team-p-table th {
        position: sticky;
        top: 0;
        background-color: #f8f9fa;
        z-index: 10;
        border-bottom: 2px solid #dee2e6;
        padding: 10px;
        text-align: left;
    }
    .team-p-table td {
        padding: 8px;
        border-bottom: 1px solid #eee;
    }
    .team-p-table tr:nth-child(even) {
        background-color: #fcfcfc;
    }
</style>

<details>
    <summary style="cursor: pointer; font-weight: bold; padding: 10px; background: #24caa636; border-radius: 5px;">
        Team Pitching Statistics
    </summary>
    <div class="team-pitching-container">
        <table class="team-p-table">
            <thead>
                <tr>
                    <th>Variable</th>
                    <th>Description</th>
                </tr>
            </thead>
            <tbody>
                <tr><td><code>#P</code></td><td>Number of pitchers used</td></tr>
                <tr><td><code>PAge</code></td><td>Average pitcher age</td></tr>
                <tr><td><code>RA/G</code></td><td>Runs allowed per game</td></tr>
                <tr><td><code>W</code></td><td>Wins</td></tr>
                <tr><td><code>L</code></td><td>Losses</td></tr>
                <tr><td><code>W-L%</code></td><td>Win–loss percentage</td></tr>
                <tr><td><code>ERA</code></td><td>Earned run average</td></tr>
                <tr><td><code>G</code></td><td>Games pitched</td></tr>
                <tr><td><code>GS</code></td><td>Games started</td></tr>
                <tr><td><code>GF</code></td><td>Games finished</td></tr>
                <tr><td><code>CG</code></td><td>Complete games</td></tr>
                <tr><td><code>tSho</code></td><td>Team shutouts</td></tr>
                <tr><td><code>cSho</code></td><td>Combined shutouts</td></tr>
                <tr><td><code>SV</code></td><td>Saves</td></tr>
                <tr><td><code>IP</code></td><td>Innings pitched</td></tr>
                <tr><td><code>H</code></td><td>Hits allowed</td></tr>
                <tr><td><code>R</code></td><td>Runs allowed</td></tr>
                <tr><td><code>ER</code></td><td>Earned runs</td></tr>
                <tr><td><code>HR</code></td><td>Home runs allowed</td></tr>
                <tr><td><code>BB</code></td><td>Walks allowed</td></tr>
                <tr><td><code>IBB</code></td><td>Intentional walks</td></tr>
                <tr><td><code>SO</code></td><td>Strikeouts</td></tr>
                <tr><td><code>HBP</code></td><td>Hit batters</td></tr>
                <tr><td><code>BK</code></td><td>Balks</td></tr>
                <tr><td><code>WP</code></td><td>Wild pitches</td></tr>
                <tr><td><code>BF</code></td><td>Batters faced</td></tr>
                <tr><td><code>ERA+</code></td><td>Park-adjusted ERA (league average = 100)</td></tr>
                <tr><td><code>FIP</code></td><td>Fielding independent pitching</td></tr>
                <tr><td><code>WHIP</code></td><td>Walks plus hits per inning pitched</td></tr>
                <tr><td><code>H9</code></td><td>Hits allowed per 9 innings</td></tr>
                <tr><td><code>HR9</code></td><td>Home runs allowed per 9 innings</td></tr>
                <tr><td><code>BB9</code></td><td>Walks allowed per 9 innings</td></tr>
                <tr><td><code>SO9</code></td><td>Strikeouts per 9 innings</td></tr>
                <tr><td><code>SO/W</code></td><td>Strikeout-to-walk ratio</td></tr>
                <tr><td><code>LOB</code></td><td>Left on base</td></tr>
            </tbody>
        </table>
    </div>
</details>

In [None]:
# Load every per-season team pitching CSV (data/team_pitcher_stats_<year>.csv)
# and stack them into one frame tagged with the season.
data_folder = 'data/'
season_frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# stable from one machine/run to the next.
for file in sorted(os.listdir(data_folder)):
    if file.startswith('team_pitcher_stats_') and file.endswith('.csv'):
        # The season is the last "_"-separated token of the name, without the extension.
        year = file.split('_')[-1].split('.')[0]
        temp_df = pd.read_csv(os.path.join(data_folder, file))
        temp_df['year'] = int(year)
        season_frames.append(temp_df)
# Concatenate once instead of once per file (no repeated copying of the
# accumulated frame). With no matching file the result is an empty frame,
# exactly as the incremental version produced.
team_pitcher_stats = (
    pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
)
team_pitcher_stats.head(2)

In [None]:
# Keep real teams only: discard the "League Average" summary row and any row
# whose team name is missing.
team_pitcher_stats = team_pitcher_stats.loc[
    lambda frame: frame["Tm"].notna() & frame["Tm"].ne("League Average")
]

### 4.2 Missing Values

In [None]:
# Per-column count of missing values, one output column per dataset.
frames_to_check = {
    "DF": df,
    "BATTER_STATS": batter_stats,
    "PITCHER_STATS": pitcher_stats,
    "TEAM_BATTER_STATS": team_batter_stats,
    "TEAM_PITCHER_STATS": team_pitcher_stats,
}
missing_all = pd.concat(
    {label: frame.isna().sum() for label, frame in frames_to_check.items()},
    axis=1,
)

# Keep only the variables with at least one missing value somewhere, ordered by
# how many values are missing in the pitch-level frame.
has_missing = missing_all.sum(axis=1) > 0
missing_all = missing_all[has_missing].sort_values(by="DF", ascending=False)


In [None]:
# Render the missing-value table with the itables `show` helper.
show(missing_all)

### 4.3 Data Visualization

Come abbiamo visto, il dataset contiene molte feature, alcune continue e altre categoriche; siccome visualizzarle tutte non è possibile, ho deciso di visualizzare 4 categoriche e 4 continue.

In [None]:
# Bar charts of the 15 most frequent values (missing values included) for eight
# categorical columns: four from the pitch-level frame, two per player table.
# Each entry is (frame, column, panel title).
plots = [
    (df, "pitch_type", "df: pitch_type"),
    (df, "events",     "df: events"),
    (df, "stand",      "df: stand"),
    (df, "home_team",  "df: home_team"),
    (batter_stats, "Lev", "batter: Lev"),
    (batter_stats, "Tm",  "batter: Tm"),
    (pitcher_stats, "Lev", "pitcher: Lev"),
    (pitcher_stats, "Tm",  "pitcher: Tm"),
]

sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(2, 4, figsize=(22, 10))
axes = axes.flatten()

# One color per source frame: 4 x df, 2 x batter_stats, 2 x pitcher_stats.
colors = ["#5D9BCE"]*4 + ["#79BC8F"]*2 + ["#F4A470"]*2

for ax, color, (data, col, title) in zip(axes, colors, plots):
    # Name both output columns explicitly. A bare reset_index() only yields
    # [col, "count"] on pandas >= 2.0; older versions produce ["index", col]
    # and the barplot below would fail to find its columns.
    vc = (
        data[col]
        .value_counts(dropna=False)
        .head(15)
        .rename_axis(col)
        .reset_index(name="count")
    )

    sns.barplot(data=vc, x=col, y="count", ax=ax, color=color, edgecolor="w")

    ax.set_title(title.upper(), fontweight="bold")
    ax.set_xlabel("")
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# 3 x 4 grid of histograms with a KDE overlay: one row per dataset,
# four numeric columns per row.
fig, axes = plt.subplots(3, 4, figsize=(22, 14))
fig.suptitle('Continuous Data', fontsize=24, fontweight='bold', y=1.02)

# Each entry is (frame, columns to plot, row label, seaborn palette name).
groups = [
    (df, ["release_speed", "launch_angle", "home_score", "age_bat"], "df", "Blues_d"),
    (batter_stats, ["Age", "PA", "OPS", "HR"], "batter", "Greens_d"),
    (pitcher_stats, ["Age", "IP", "ERA", "SO"], "pitcher", "Oranges_d")
]

for axes_row, (data, cols, label, palette) in zip(axes, groups):
    # Every panel of a row uses the third shade of that row's palette.
    bar_color = sns.color_palette(palette)[2]
    for ax, col in zip(axes_row, cols):
        if col not in data.columns:
            # Column absent from this frame: the panel is left empty.
            continue

        # Non-numeric entries are coerced to NaN and dropped before plotting.
        values = pd.to_numeric(data[col], errors="coerce").dropna()
        sns.histplot(values, bins=30, kde=True, ax=ax, color=bar_color, edgecolor='w')

        ax.set_title(f"{label.upper()}: {col}", fontsize=14, fontweight='semibold')
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel("Frequency", fontsize=12)

plt.tight_layout()
plt.show()

spiegare che questo dataset permette di fare anche plot e analisi più avanzate, per esempio heatmap, spider plot, ecc., sia sui battitori che sui lanciatori

#### 4.4.1 Batter Analysis

Come prima cosa possiamo analizzare i battitori: all'interno del dataset ogni battuta viene categorizzata come single, double, triple o home_run. Analizziamo meglio questi eventi considerando, per esempio, la velocità e l'angolo di battuta.

In [None]:
# Drawing/legend order of the event categories.
event_order = ["out", "single", "double", "triple", "home_run"]

# Work on a reproducible 30% subsample of the pitch-level frame.
df_plot = df.sample(frac=0.3, random_state=42).copy()

# The four hit types keep their own label; every other row, including rows
# with a missing event, falls into the catch-all "out" category.
is_hit = df_plot["events"].isin(event_order[1:])
df_plot["event_cat"] = np.where(is_hit, df_plot["events"], "out")

In [None]:
# Marker color for each event category.
color_map = dict(
    out="#9aa0a6",
    single="#1f77b4",
    double="#ff7f0e",
    triple="#2ca02c",
    home_run="#d62728",
)

# Marker opacity for each event category: the values grow from "out" to
# "home_run", so the most common category is also the faintest.
opacity = dict(
    out=0.20,
    single=0.50,
    double=0.65,
    triple=0.80,
    home_run=1.00,
)

In [None]:
# Figure skeleton on a 2x2 grid: one panel spanning the whole top row and two
# panels side by side underneath.
panel_specs = [
    [{"colspan": 2}, None],
    [{}, {}],
]
panel_titles = [
    "Launch Angle vs Launch Speed",
    "Batted Ball Spray Chart",
    "Pitch Location",
]

fig = make_subplots(
    rows=2,
    cols=2,
    specs=panel_specs,
    subplot_titles=panel_titles,
    vertical_spacing=0.08,
    horizontal_spacing=0.08,
)

In [None]:
# The three panels differ only in the plotted columns, the grid cell, the
# marker size and whether they own the legend entries / custom hover text.
# Each entry: (x column, y column, grid row, grid col, marker size, is_main_panel)
panels = [
    ("launch_speed", "launch_angle", 1, 1, 6, True),   # Launch Angle vs Launch Speed
    ("hc_x", "hc_y", 2, 1, 5, False),                  # Spray Chart
    ("plate_x", "plate_z", 2, 2, 5, False),            # Pitch Location
]

for x_col, y_col, grid_row, grid_col, marker_size, is_main_panel in panels:
    for ev in event_order:
        sub = df_plot[df_plot["event_cat"] == ev]

        # Only the main (top) panel carries a custom hover label.
        hover_kwargs = {}
        if is_main_panel:
            hover_kwargs["hovertemplate"] = (
                f"<b>{ev}</b><br>"
                "Launch Speed: %{x}<br>"
                "Launch Angle: %{y}<extra></extra>"
            )

        fig.add_trace(
            go.Scattergl(
                x=sub[x_col],
                y=sub[y_col],
                mode="markers",
                name=ev,
                # A shared legendgroup lets one legend entry toggle the event in all panels.
                legendgroup=ev,
                showlegend=is_main_panel,
                marker=dict(
                    size=marker_size,
                    color=color_map[ev],
                    opacity=opacity[ev],
                ),
                **hover_kwargs,
            ),
            row=grid_row, col=grid_col,
        )

fig.update_layout(
    height=850,
    width=1450,
    title="Batted Ball Profile: Contact Quality, Spray & Pitch Location",
    legend_title_text="Event",
    margin=dict(l=40, r=20, t=70, b=40)
)

# Fixed viewing window for the pitch-location panel.
fig.update_xaxes(range=[-2.5, 2.5], row=2, col=2)
fig.update_yaxes(range=[-0.5, 5.5], row=2, col=2)

# Axis titles, panel by panel: (grid row, grid col, x title, y title).
axis_titles = [
    (1, 1, "Launch Speed", "Launch Angle"),
    (2, 1, "Hit Location X", "Hit Location Y"),
    (2, 2, "Plate X", "Plate Z"),
]
for grid_row, grid_col, x_title, y_title in axis_titles:
    fig.update_xaxes(title_text=x_title, row=grid_row, col=grid_col)
    fig.update_yaxes(title_text=y_title, row=grid_row, col=grid_col)

fig.show()


analizzare il grafico. 

All'interno del dataset abbiamo un indice che ci permette di analizzare quanto una battuta è buona senza basarci solo su uno scatterplot: questa variabile è woba_value, che rappresenta il valore di una battuta in base a diversi fattori come difesa, posizionamento, park e rumore.

In [None]:
# Read the GIF from disk and keep it as base64-encoded ASCII text.
gif_path = "images/batter.gif"

with open(gif_path, "rb") as gif_file:
    gif_bytes = gif_file.read()

gif_base64 = base64.b64encode(gif_bytes).decode("ascii")


In [None]:
# Rows where exit velocity, launch angle and wOBA value are all present.
df_plot = df.dropna(subset=['launch_speed', 'launch_angle', 'woba_value'])

# Two panels: the left one hosts the embedded GIF, the right one the scatter plot.
fig = make_subplots(
    rows=1, cols=2,
    column_widths=[0.45, 0.55],
    specs=[[{"type": "xy"}, {"type": "xy"}]]
)

scatter = px.scatter(
    df_plot, x='launch_speed', y='launch_angle',
    color='woba_value',
    color_continuous_scale='RdYlGn_r',
    range_color=[0, 1.5],
    labels={'woba_value': 'WOBA Value'}
)

# Move the Plotly Express traces into the right panel, all bound to the
# figure-level color axis configured in update_layout below.
for trace in scatter.data:
    trace.marker.size = 4
    trace.marker.opacity = 0.6
    trace.marker.coloraxis = "coloraxis"
    fig.add_trace(trace, row=1, col=2)

# Embed the GIF as a data URI, stretched over the left panel's axis domain.
fig.add_layout_image(
    dict(
        source=f"data:image/gif;base64,{gif_base64}",
        xref="x domain",
        yref="y domain",
        x=0.0,
        y=0.5,
        sizex=1,
        sizey=1,
        xanchor="left",
        yanchor="middle",
        sizing="contain",
        opacity=1.0,
        layer="below"
    ),
    row=1, col=1
)

# The left panel is only an image holder: hide its axes.
fig.update_xaxes(visible=False, range=[0, 1], row=1, col=1)
fig.update_yaxes(visible=False, range=[0, 1], row=1, col=1)

fig.update_xaxes(title_text="Exit Velocity (mph)", range=[40, 120], row=1, col=2)
fig.update_yaxes(title_text="Launch Angle (degrees)", range=[-45, 80], row=1, col=2)

# Launch-angle bands. Each dotted line is the UPPER bound of the band it names
# (ground balls < 10 deg, line drives 10-25 deg, fly balls 25-50 deg), so the
# label goes below the line; placed above it, the text would sit inside the
# next band up and mislabel it.
for upper_bound, band in ((50, "fly balls"), (25, "line drives"), (10, "ground balls")):
    fig.add_hline(y=upper_bound, line_dash="dot", line_color="#444", row=1, col=2,
                  annotation_text=band, annotation_position="bottom right")

# Hard-hit threshold on exit velocity.
fig.add_vline(x=95, line_dash="dash", line_color="#444", row=1, col=2,
              annotation_text="Hard Hit (95+ mph)", annotation_position="top left")

fig.update_layout(
    height=750,
    width=1350,
    template="plotly_white",
    coloraxis=dict(
        colorscale='RdYlGn_r',
        cmin=0,
        cmax=1.5,
        colorbar=dict(title="WOBA", x=1.02, thickness=15)
    ),
    margin=dict(l=10, r=10, t=80, b=40)
)

fig.show()

analizzare il grafico, 

source: https://fivethirtyeight.com/features/the-new-science-of-hitting/

#### 4.4.2 Pitcher Analysis

Ora analizziamo i lanciatori: nel baseball esistono diversi tipi di lanci, ciascuno con caratteristiche diverse. In Moneyball è importante tenerne conto, poiché è un fattore rilevante al momento di decidere quale battitore o lanciatore ingaggiare.

spiegare che esistono diversi tipi di lanci e nel database li abbiamo tutti e li analizziamo raggruppandoli in 4 macrocategorie

<img src="images/types_pitch.png">

Per l'analisi dei lanciatori abbiamo metriche come il tipo di lancio, la velocità e molte altre; questo permette di fare plot come i seguenti, in cui analizziamo i lanci di un determinato giocatore e come variano in base al tipo di lancio.

In [None]:
# Pitcher under study, as spelled in the pitcher_name column.
PITCHER = "Skenes, Paul"

# Short aliases for the three columns used by the pitcher plots.
p = df["pitch_type"]
s = df["release_speed"]
pl = df["pitcher_name"]

# Rows thrown by PITCHER with both pitch type and release speed present.
m = (pl == PITCHER) & p.notna() & s.notna()

# Display category -> pitch-type codes it groups together.
cats = {
    "Fastball": ["FF", "SI"],
    "Changeup": ["CH"],
    "Slider": ["SL"],
    "Curve": ["CU"],
}

# Flat list of every code that belongs to some category, and the matching row mask.
used = sum(cats.values(), [])
m_used = m & p.isin(used)

In [None]:
# Evaluation grid for the pitch-speed densities and its step size.
x = np.linspace(65, 102, 600)
dx = x[1] - x[0]


def freq(a, grid=x, step=dx):
    """Return the Gaussian KDE of ``a`` on ``grid``, scaled to expected counts.

    The density is multiplied by ``len(a) * step``, so each value is the
    expected number of observations per grid step and the curve sums to
    roughly ``len(a)`` when the grid covers the data.

    ``grid`` and ``step`` are bound as defaults when the function is defined,
    so rebinding the module-level ``x`` elsewhere in the notebook cannot
    change what ``freq`` evaluates on.
    """
    return gaussian_kde(a)(grid) * len(a) * step

In [None]:
# Release-speed distribution of the selected pitcher: one filled KDE curve per
# pitch category plus a dashed curve for all categorized pitches together.
fig = go.Figure()

# A KDE needs at least two points that are not all identical (a constant
# sample has zero variance and gaussian_kde cannot be fitted to it), so such
# groups are skipped instead of aborting the whole figure.
all_arr = s[m_used].to_numpy()
if len(all_arr) >= 2 and all_arr.min() < all_arr.max():
    fig.add_scatter(x=x, y=freq(all_arr), name="All pitches",
                    line=dict(dash="dash", width=3, color="black"))

for name, codes in cats.items():
    arr = s[m & p.isin(codes)].to_numpy()
    if len(arr) < 2 or arr.min() == arr.max():
        continue
    fig.add_scatter(x=x, y=freq(arr), name=name, fill="tozeroy")

fig.update_layout(
    # The dash in the title was mis-encoded ("‚Äî"); restored to an em dash.
    title=f"Pitch speed distributions — {PITCHER}",
    xaxis_title="Pitch Speed (mph)",
    yaxis_title="Frequency of Speed"
)
fig.show()


Come primo punto possiamo già vedere diversi aspetti interessanti: il fastball, come indica il nome, è il lancio più veloce e quello con la frequenza maggiore.

come prossimo punto possiamo andare ad analizzare come varia la traiettoria in base al tipo di lancio

In [None]:
# Pitch movement scaled by 12 — presumably feet -> inches, matching the inch
# marks on the ring labels below (TODO confirm the units of pfx_x / pfx_z).
x_in = df["pfx_x"] * 12
z_in = df["pfx_z"] * 12

fig = go.Figure()

# pitch points: one marker series per category; horizontal movement is
# plotted with its sign flipped.
for name, codes in cats.items():
    mm = m & p.isin(codes)
    fig.add_scatter(
        x=-x_in[mm],
        y=z_in[mm],
        mode="markers",
        name=name,
        marker=dict(size=9, opacity=0.75)
    )

# circles: dashed reference rings at radius 12 and 24
t = np.linspace(0, 2*np.pi, 361)
for r in (12, 24):
    fig.add_scatter(
        x=r*np.cos(t), y=r*np.sin(t),
        mode="lines",
        line=dict(dash="dash", width=1, color="gray"),
        showlegend=False
    )

# axes: cross-hair through the origin; R is also the half-width of the plot window
R = 26
fig.add_shape(type="line", x0=-R, x1=R, y0=0, y1=0, line=dict(width=1, color="gray"))
fig.add_shape(type="line", x0=0, x1=0, y0=-R, y1=R, line=dict(width=1, color="gray"))

# labels: radius of each ring, written on the positive x and y axes
for v in (12, 24):
    fig.add_annotation(x=v, y=0, text=f'{v}"', showarrow=False, font=dict(color="gray"))
    fig.add_annotation(x=0, y=v, text=f'{v}"', showarrow=False, font=dict(color="gray"))

fig.update_layout(
    # The dash in the title was mis-encoded ("‚Äî"); restored to an em dash.
    title=f"{PITCHER} — Movement Profile",
    xaxis=dict(title="Moves toward 3B ", range=[-R, R], zeroline=False),
    yaxis=dict(title="More Rise  / More Drop ", range=[-R, R], zeroline=False),
    width=700,
    height=700,
    # Lock the aspect ratio so the reference rings are drawn as circles.
    yaxis_scaleanchor="x",
    yaxis_scaleratio=1
)

fig.show()


analizzare

In [None]:
# Bandwidth passed to the 2-D location KDE.
BW = 0.5

# Pitch-type codes to draw and the matching panel titles (same order).
pitches = ["FF", "SI", "SL", "CH", "CU"]
names = ["Four-Seam", "Sinker", "Slider", "Change", "Curve"]

# Regular evaluation grid: 140 horizontal x 160 vertical points.
xx = np.linspace(-2.0, 2.0, 140)
zz = np.linspace(0.0, 5.0, 160)
X, Z = np.meshgrid(xx, zz)

# Grid points as a (2, n_points) array: row 0 horizontal, row 1 vertical.
grid = np.stack((X.reshape(-1), Z.reshape(-1)))

In [None]:
# One KDE heatmap of pitch location per pitch type, side by side.
fig = make_subplots(rows=1, cols=len(pitches), subplot_titles=names, horizontal_spacing=0.04)

for i, pitch in enumerate(pitches, start=1):
    mm = m_used & (df["pitch_type"] == pitch)
    # Named loc_x / loc_z rather than x / z so this cell does not overwrite the
    # module-level `x` speed grid used by the pitch-speed distribution cell.
    loc_x = df.loc[mm, "plate_x"].to_numpy()
    loc_z = df.loc[mm, "plate_z"].to_numpy()
    ok = np.isfinite(loc_x) & np.isfinite(loc_z)
    loc_x, loc_z = loc_x[ok], loc_z[ok]
    # Too few pitches of this type for a density estimate: leave the panel empty.
    if loc_x.size < 5:
        continue

    # Density evaluated on the precomputed grid, reshaped back to the mesh layout.
    D = gaussian_kde(np.vstack([loc_x, loc_z]), bw_method=BW)(grid).reshape(Z.shape)

    fig.add_trace(
        go.Contour(
            x=xx, y=zz, z=D,
            ncontours=10,
            contours=dict(coloring="heatmap", showlines=True),
            showscale=False, hoverinfo="skip"
        ),
        row=1, col=i
    )

    # Black rectangle (presumably the strike zone) and a gray five-sided
    # outline at the bottom (presumably home plate), drawn on this panel's axes.
    fig.add_shape(type="rect", x0=-0.83, x1=0.83, y0=1.5, y1=3.5,
                  xref=f"x{i}", yref=f"y{i}", line=dict(color="black", width=2))
    fig.add_shape(type="path",
                  path="M -0.71 0.0 L 0.71 0.0 L 0.35 0.35 L 0.0 0.6 L -0.35 0.35 Z",
                  xref=f"x{i}", yref=f"y{i}", line=dict(color="gray", width=1))

    fig.update_xaxes(range=[-2, 2], showgrid=False, zeroline=False, row=1, col=i)
    fig.update_yaxes(range=[0, 5], showgrid=False, zeroline=False, row=1, col=i)

# The dash in the title was mis-encoded ("‚Äî"); restored to an em dash.
fig.update_layout(title=f"{PITCHER} — Location KDE ", width=1500, height=360, margin=dict(t=70))
fig.show()


#### 4.5 Metric Analysis 

spiegare che abbiamo molte tipologie di metriche: come sono composte, alcune sono semplici altre sono derivate da altre metriche

In [None]:
def _repair_name(raw):
    """Undo the escaped/mis-decoded encoding of one player name.

    The value is turned into text, its backslash escapes are expanded, and the
    result is re-read as UTF-8 by way of its Latin-1 bytes.
    """
    expanded = codecs.decode(str(raw), "unicode_escape")
    return expanded.encode("latin1").decode("utf-8")


batter_stats["Name"] = batter_stats["Name"].map(_repair_name)
batter_stats.head()

In [None]:
# Keep batters with at least 20 at-bats, as an independent copy of the rows.
batter_stats_filtered = batter_stats.loc[batter_stats['AB'].ge(20)].copy()

In [None]:
# Derived batting metrics.
# Isolated Power: slugging percentage minus batting average.
batter_stats_filtered['ISO'] = batter_stats_filtered['SLG'] - batter_stats_filtered['BA']

# Walk, strikeout and home-run rates, each as a percentage of plate appearances.
for rate_col, count_col in (('BB_Rate', 'BB'), ('K_Rate', 'SO'), ('HR_Rate', 'HR')):
    batter_stats_filtered[rate_col] = (
        batter_stats_filtered[count_col] / batter_stats_filtered['PA']
    ) * 100

# Top-10 leaderboards by isolated power and by walk rate.
top_iso = batter_stats_filtered[['Name', 'Tm', 'BA', 'SLG', 'ISO', 'HR']].nlargest(10, 'ISO')

top_bb = batter_stats_filtered[['Name', 'Tm', 'BB_Rate', 'BB', 'OBP']].nlargest(10, 'BB_Rate')

In [None]:
# One violin per rate statistic, each with an inner box plot and a mean line.
metrics_to_plot = ['BA', 'OBP', 'SLG', 'OPS']
violin_colors = ['#3498db', '#2ecc71', '#e74c3c', '#f39c12']

fig = go.Figure()

for metric, fill in zip(metrics_to_plot, violin_colors):
    violin = go.Violin(
        y=batter_stats_filtered[metric],
        name=metric,
        x0=metric,
        box_visible=True,
        meanline_visible=True,
        fillcolor=fill,
        opacity=0.6,
    )
    fig.add_trace(violin)

fig.update_layout(
    title='Violin Plots of Key Batting Metrics (2024 Season)',
    xaxis_title='Metric',
    yaxis=dict(title='Value', gridcolor='#e1e1e1'),
    height=600,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    showlegend=False,
)

fig.show()


analizzare, 

In [None]:
# Medians of the two OPS components; they split the OBP/SLG plane into quadrants.
median_obp = batter_stats_filtered.loc[:, 'OBP'].median()
median_slg = batter_stats_filtered.loc[:, 'SLG'].median()

# Quadrant label for one player, relative to the OBP and SLG medians
def categorize_player(row):
    """Return the OBP/SLG quadrant label of ``row``.

    ``row`` must expose ``'OBP'`` and ``'SLG'``. A value equal to the median
    counts as high. A row whose OBP or SLG cannot be compared (NaN) is
    labelled 'Below Average'.
    """
    obp, slg = row['OBP'], row['SLG']

    if obp >= median_obp:
        if slg >= median_slg:
            return 'Elite (High OBP & SLG)'
        if slg < median_slg:
            return 'Contact (High OBP)'
    elif obp < median_obp and slg >= median_slg:
        return 'Power (High SLG)'

    return 'Below Average'

# Label every player with its quadrant, then plot SLG against OBP colored by
# that label and sized by at-bats.
batter_stats_filtered['Category'] = batter_stats_filtered.apply(categorize_player, axis=1)

fig = px.scatter(
    batter_stats_filtered,
    x='SLG',
    y='OBP',
    color='Category',
    size='AB',
    hover_data=['Name', 'Tm', 'HR', 'BA', 'OPS'],
    title='Player Categories: OBP vs SLG Quadrant Analysis',
)

# Dashed reference lines at the two medians (the quadrant boundaries).
median_line_style = dict(line_dash="dash", line_color="gray")
fig.add_hline(
    y=median_obp,
    annotation_text=f"Median OBP: {median_obp:.3f}",
    annotation_position="right",
    **median_line_style,
)
fig.add_vline(
    x=median_slg,
    annotation_text=f"Median SLG: {median_slg:.3f}",
    annotation_position="top",
    **median_line_style,
)

grid_color = '#e1e1e1'
fig.update_layout(
    title_font=dict(size=20, color='#2c3e50', family='Arial Black'),
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor=grid_color, title='Slugging Percentage (SLG)'),
    yaxis=dict(gridcolor=grid_color, title='On-Base Percentage (OBP)'),
    legend=dict(title='Player Category', font=dict(size=11)),
)

fig.show()


In [None]:
# AB vs BA - variance vs sample size.
# This motivates the Bayesian approach: batting average is highly variable
# when a player has few at-bats.
from scipy.ndimage import gaussian_filter1d

fig = px.scatter(
    batter_stats_filtered,
    x='AB',
    y='BA',
    size='H',
    color='PA',
    hover_data=['Name', 'Tm', 'H', 'PA'],
    color_continuous_scale='Viridis',
    title='Batting Average vs At-Bats: The Shrinkage Problem',
    labels={'AB': 'At-Bats (Sample Size)', 'BA': 'Batting Average'}
)

# Smoothed trend line: BA ordered by AB, passed through a 1-D Gaussian filter.
ab_sorted = batter_stats_filtered.sort_values('AB')
ab_smooth = gaussian_filter1d(ab_sorted['BA'], sigma=10)

trend_line = go.Scatter(
    x=ab_sorted['AB'],
    y=ab_smooth,
    mode='lines',
    name='Smoothed Trend',
    showlegend=True,
    line=dict(color='red', width=3, dash='dash'),
)
fig.add_trace(trend_line)

grid_color = '#e1e1e1'
fig.update_layout(
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor=grid_color),
    yaxis=dict(gridcolor=grid_color),
)

fig.show()

# Study I ‚Äì Moneyball: Team-Level Analysis

## Research Question

Moneyball emphasizes the importance of on-base percentage (OBP) as a measure of a batter's ability to reach base, whether through hits or walks. Historically, batting average (BA) had been the dominant offensive statistic in baseball, rewarding players solely for their ability to hit safely, while ignoring walks. As a result, players with strong plate discipline were systematically undervalued.

The Moneyball philosophy highlighted that reaching base, regardless of how it is achieved, is a fundamental mechanism for generating runs and, ultimately, winning games. In this sense, OBP represents a shift from traditional descriptive statistics to metrics that better capture the underlying processes that determine team success. This insight motivates a formal comparison between OBP and BA in terms of their association with team-level performance outcomes.
For this reason, our research question is:
**"Is on-base percentage (OBP) a better predictor of team success (measured by runs per game) than batting average (BA) in Major League Baseball?"**

## 1. Bayesian hypothesis testing

### 1.1. H: On-Base Percentage has a larger positive effect on runs per game than Batting Average

The main hypothesis tested in this work is that On-Base Percentage has a stronger association than Batting Average with team offensive performance, measured by runs per game, which is the primary driver of team wins. This hypothesis is formalized by comparing the posterior distributions of the corresponding regression coefficients.

$$ H: \beta_{OBP} > \beta_{BA} $$

where $\beta_{OBP}$ and $\beta_{BA}$ are the regression coefficients for OBP and BA, respectively. We want to assess the probability of this hypothesis being true given the observed data.

$$ P(H | Data) = P(\beta_{OBP} > \beta_{BA} | Data) $$

##### 1.1.1 Priors and Likelihood

We focus on the posterior distribution of the difference between the two coefficients:
$$ \delta = \beta_{OBP} - \beta_{BA} $$
and evaluate the posterior probability $P(\delta > 0 | Data)$.

The **likelihood function** is modeled as: $$ y_i \sim Normal(\mu_i, \sigma) $$ $$ \mu_i = \beta_0 + \beta_{OBP} \cdot OBP_i + \beta_{BA} \cdot BA_i $$ where $y_i$ is the runs per game for team $i$ and $\sigma$ is the residual standard deviation.

Historically, looking at MLB data, we can see that on average a team scores 4 to 5 runs per game, so we consider an average of 4.5 with $\sigma = 0.5$ to include data between 3.5 and 5.5 with approximately 95% probability. As for the priors for $\beta_{OBP}$ and $\beta_{BA}$, we assume that they are distributions centered at 0 with $\sigma_{\beta} = 1$, so that positive and negative effects are equally possible a priori.

$$ \beta_0 \sim Normal(4.5, 0.5) $$
$$ \beta_{OBP} \sim Normal(0, 1) $$
$$ \beta_{BA} \sim Normal(0, 1) $$
$$ \sigma \sim HalfNormal(1) $$

##### 1.1.2 Probabilistic Model

OBP and BA were standardized prior to model fitting to place regression coefficients on a common scale.

In [None]:
# Response: runs scored per game for every team-season row.
y = team_batter_stats["R/G"].values

# z-score OBP and BA (subtract the mean, divide by the sample standard
# deviation) and store them as z_OBP / z_BA.
for stat in ("OBP", "BA"):
    raw = team_batter_stats[stat]
    team_batter_stats[f"z_{stat}"] = (raw - raw.mean()) / raw.std()

# Standardized predictors as plain arrays for the model.
x_obp = team_batter_stats["z_OBP"].values
x_ba = team_batter_stats["z_BA"].values


In [None]:
# Bayesian linear regression of runs per game (y) on the standardized
# predictors x_obp and x_ba.
with pm.Model() as moneyball_model:

    # Priors
    # NOTE(review): the write-up above this cell states beta_0 ~ Normal(4.5, 0.5)
    # and a HalfNormal(1) noise scale, while the code uses mu=4.4 and
    # HalfNormal(0.5) — confirm which values are intended.
    beta_0    = pm.Normal('beta_0', mu=4.4, sigma=0.5)
    beta_obp  = pm.Normal('beta_OBP', mu=0, sigma=1)
    beta_ba   = pm.Normal('beta_BA', mu=0, sigma=1)
    sigma     = pm.HalfNormal('sigma', sigma=0.5)

    # Linear predictor: one mean per observation
    mu = beta_0 + beta_obp * x_obp + beta_ba * x_ba

    # Likelihood: Normal around the linear predictor with a shared sigma
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)

    # Deterministic: difference between the two slopes, tracked in the trace
    # so that P(delta > 0) can be read directly from the posterior
    delta = pm.Deterministic('delta', beta_obp - beta_ba)


In [None]:
# Draw posterior samples for the baseline model. The seed is fixed so the
# reported summaries and plots can be reproduced, matching the seeded
# subsampling (random_state=42) used earlier in the notebook.
with moneyball_model:
    normal_trace   = pm.sample(chains=4, cores=10, random_seed=42)

##### 1.1.3 Inference

In [None]:
# Trace plot of the baseline model's posterior samples.
az.plot_trace(normal_trace);

In [None]:
# Posterior summary table of the baseline model with 95% HDIs.
az.summary(normal_trace, hdi_prob=0.95)

In [None]:
# Posterior of delta (beta_OBP - beta_BA) with a reference value at 0 and the 95% HDI.
az.plot_posterior(normal_trace,  var_names="delta", ref_val=0, hdi_prob=0.95);

In [None]:
# Posterior of delta against a ROPE of [-0.05, 0.05]. hdi_prob is set to 0.95
# explicitly so this plot uses the same interval as the other summaries and
# plots of this model instead of the library default.
az.plot_posterior(normal_trace, var_names="delta", rope=[-0.05, 0.05], hdi_prob=0.95);

##### 1.1.4 Prior Sensitivity Analysis

Now let's assume that before seeing the data, even large differences between teams in OBP are unable to explain more or fewer runs per game.

In [None]:
# Prior-sensitivity variant of the regression: same structure as
# moneyball_model, but with Normal(0, 0.5) instead of Normal(0, 1) priors on
# the two slopes.
with pm.Model() as model_prior_tight:

    # Priors (only the slope scales differ from the baseline model)
    beta_0   = pm.Normal('beta_0', mu=4.4, sigma=0.5)
    beta_obp = pm.Normal('beta_OBP', mu=0, sigma=0.5)
    beta_ba  = pm.Normal('beta_BA', mu=0, sigma=0.5)
    sigma    = pm.HalfNormal('sigma', sigma=0.5)
    

    # Linear predictor: one mean per observation
    mu = beta_0 + beta_obp * x_obp + beta_ba * x_ba

    # Likelihood
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)

    # Difference between the two slopes, tracked in the trace
    delta = pm.Deterministic('delta', beta_obp - beta_ba)


In [None]:
# Draw posterior samples for the tight-prior model. The seed is fixed so the
# sensitivity comparison can be reproduced, matching the seeded subsampling
# (random_state=42) used earlier in the notebook.
with model_prior_tight:
    trace_tight = pm.sample(chains=4, cores=10, random_seed=42)

In [None]:
# Trace plot of the tight-prior model's posterior samples.
az.plot_trace(trace_tight);

In [None]:
# Posterior summary table of the tight-prior model with 95% HDIs.
az.summary(trace_tight, hdi_prob=0.95)


In [None]:
# Posterior of delta under the tight priors, with a reference value at 0 and
# the 95% HDI. The trailing semicolon suppresses the textual echo of the
# returned Axes object, as in the matching baseline-model cell.
az.plot_posterior(trace_tight, var_names="delta", ref_val=0, hdi_prob=0.95);

##### 1.1.5 Frequentist Comparison

In [None]:

# Design matrix: standardized OBP and BA side by side, plus an intercept
# column added by statsmodels.
X = sm.add_constant(np.column_stack([x_obp, x_ba]))

# Ordinary least squares fit of runs per game on the same two predictors.
ols_model = sm.OLS(y, X).fit()
print(ols_model.summary())

In [None]:
# Contrast vector over the fitted coefficients [β0, β_OBP, β_BA]: it selects
# β_OBP - β_BA, the frequentist counterpart of delta.
hypothesis = np.array([0, 1, -1])  # [β0, β_OBP, β_BA]
t_test = ols_model.t_test(hypothesis)

print(t_test)


##### 1.1.6 Posterior Predictive Checks

In [None]:
# Posterior draws of each parameter, flattened into one 1-D array per
# parameter (chains and draws merged).
beta0_samples, beta_obp_samples, beta_ba_samples, sigma_samples = (
    normal_trace.posterior[var_name].values.flatten()
    for var_name in ("beta_0", "beta_OBP", "beta_BA", "sigma")
)


In [None]:
# Posterior predictive simulation: for n_draws randomly chosen posterior
# draws, simulate one replicated dataset from the fitted model.
n_draws = 500

# Seeded generator so the replicated datasets (and the check built on them)
# are reproducible from run to run.
rng = np.random.default_rng(42)
idx = rng.choice(len(beta0_samples), n_draws, replace=False)

# Broadcasting replaces the per-draw Python loop: the [:, None] column vectors
# of parameters combine with the 1-D predictors into an
# (n_draws, n_observations) matrix of means.
mu = (
    beta0_samples[idx, None]
    + beta_obp_samples[idx, None] * x_obp
    + beta_ba_samples[idx, None] * x_ba
)

# One simulated response per draw and observation; same shape as the
# loop-built array: (n_draws, n_observations).
y_rep = rng.normal(mu, sigma_samples[idx, None])


In [None]:
# Posterior predictive check: observed runs per game against simulated values.
n_ppc_plot = 50
# Seeded choice of the replicated datasets to draw, so the figure is the same
# on every run.
idx_plot = np.random.default_rng(42).choice(y_rep.shape[0], n_ppc_plot, replace=False)

fig = go.Figure()

# Light-blue cloud: the selected individual replicated datasets.
for i in idx_plot:
    fig.add_trace(go.Scatter(
        x=y,
        y=y_rep[i],
        mode="markers",
        marker=dict(size=5, opacity=0.7, color="lightblue"),
        showlegend=False
    ))

# Mean over ALL replicated datasets (not only the plotted subset).
fig.add_trace(go.Scatter(
    x=y,
    y=y_rep.mean(axis=0),
    mode="markers",
    marker=dict(size=10, opacity=0.7),
    name="Posterior Predictive Mean"
))

# Identity line: where predictions would fall if they matched the observations.
fig.add_trace(go.Scatter(
    x=[y.min(), y.max()],
    y=[y.min(), y.max()],
    mode="lines",
    line=dict(dash="dash", color="black"),
    name="Perfect fit"
))

fig.update_layout(
    title="Posterior Predictive Check: Observed vs Predicted",
    xaxis_title="Observed Runs per Game",
    yaxis_title="Posterior Predictive Mean Runs per Game"
)

fig.show()


## 2. Regression Models

## 3. Hierarchical vs unpooled models

## 4. Model Comparison (WAIC and LOO)

# Study II - Are modern Hitter Categories Meaningful?

## Research Question

Baseball hitters have historically been evaluated using simple statistics, such as batting average, that capture hit frequency but fail to fully describe offensive value. With the rise of sabermetrics, greater emphasis has been placed on On-Base Percentage (OBP) and Slugging Percentage (SLG), which represent two fundamental offensive skills: the ability to reach base and the ability to generate power.

In practice, hitters are often informally classified using combinations of OBP and SLG. As illustrated in the OBP–SLG quadrant analysis, this results in four common archetypes: Elite, Contact, Power, and Below-average hitters. Despite their widespread use in baseball discourse, it remains an open empirical question whether these categories reflect meaningful differences in offensive production.

To assess offensive value across these groups, this study employs weighted On-Base Average (wOBA), a run-based metric that assigns empirical weights to offensive events and captures overall run contribution.

The research question guiding this analysis is:
**Can we classify hitters based on OBP and SLG into distinct categories that meaningfully differ in offensive value as measured by wOBA?**

## 1. Bayesian hypothesis testing

### 1.1. H: OBP–SLG-based hitter categories differ in mean wOBA

The main hypothesis tested in this study is that hitters classified into OBP–SLG-based categories differ in their average offensive production, as measured by weighted On-Base Average (wOBA).

Let $\mu_g$ denote the mean wOBA of hitters belonging to group $g \in \{\text{Elite}, \text{Contact}, \text{Power}, \text{Below}\}$.

The hypothesis of interest is:

$$
H:\ \mu_{\text{Elite}},\ \mu_{\text{Contact}},\ \mu_{\text{Power}},\ \mu_{\text{Below}} \ \text{are not all equal}
$$

Rather than testing a single point null hypothesis, the Bayesian framework allows us to directly assess the probability of differences between group means. In particular, we focus on pairwise contrasts of the form:

$$
\delta_{g,g'} = \mu_g - \mu_{g'}
$$

and evaluate posterior probabilities such as:

$$
P(\delta_{g,g'} > 0 \mid \text{Data})
$$

with particular attention to comparisons involving Elite hitters versus the other groups.

##### 1.1.1 Priors and Likelihood

We model offensive production using group-level mean wOBA values for hitters classified into OBP–SLG-based categories. For each group
$g \in \{\text{Elite}, \text{Contact}, \text{Power}, \text{Below}\}$, let $\mu_g$ denote the mean wOBA.

To assess whether these categories correspond to meaningful differences in offensive value, we examine posterior contrasts between group means:

$$
\delta_{g,g'} = \mu_g - \mu_{g'}
$$

and evaluate probabilities of the form:

$$
P(\delta_{g,g'} > 0 \mid \text{Data})
$$

The likelihood is specified as:

$$
y_i \sim \text{Normal}(\mu_{g(i)}, \sigma)
$$

where $y_i$ is the average wOBA of hitter $i$.

Given that league-average wOBA is approximately 0.32, we assign exchangeable, weakly informative priors to the group means. **TODO: check the yearly league mean, then choose the prior so that a 90% HDI lies around that mean, leaving 10% for extreme values.**

$$
\mu_g \sim \text{Normal}(0.32,\ 0.050)
$$

and model residual variability as:

$$
\sigma \sim \text{HalfNormal}(0.050).
$$

##### 1.1.2 Data preparation

In [None]:
# Attach each batter's OBP and SLG to the row-level data. Left join: every row
# of df is kept, whether or not its batter id has a match in batter_stats.
obp_slg = batter_stats[["mlbID", "OBP", "SLG"]]
df_model = pd.merge(df, obp_slg, how="left", left_on="batter", right_on="mlbID")

In [None]:
# Classify every row into an OBP/SLG quadrant. Rows that satisfy none of the
# four conditions receive the default label "Unknown".
obp_cut, slg_cut = 0.302, 0.373

obp_high = df_model["OBP"] >= obp_cut
obp_low = df_model["OBP"] < obp_cut
slg_high = df_model["SLG"] >= slg_cut
slg_low = df_model["SLG"] < slg_cut

quadrant_masks = [
    obp_high & slg_high,
    obp_high & slg_low,
    obp_low & slg_high,
    obp_low & slg_low,
]
quadrant_labels = ["Elite", "Contact", "Power", "Below"]

df_model["hitter_group"] = np.select(quadrant_masks, quadrant_labels, default="Unknown")
df_model

In [None]:
# Keep only rows that have a wOBA value, a batter id and a known hitter group.
d = df_model.dropna(subset=["woba_value", "batter"])
d = d[d["hitter_group"] != "Unknown"]

# One row per hitter: mean wOBA and number of rows behind it.
# The numeric batter id is part of the grouping key so that two different
# players who share the same display name are not pooled into a single row;
# batter_name is kept in the key only so it remains available as a column.
batter_df = (
    d.groupby(["batter", "batter_name", "hitter_group"], as_index=False)
     .agg(
         woba_mean=("woba_value", "mean"),
         n=("woba_value", "size")
     )
)
batter_df.head(2)

In [None]:
# Encode hitter groups as integer codes following the fixed order below;
# the code of a group is its position in group_order.
group_order = ["Elite", "Contact", "Power", "Below"]
group_to_code = {label: code for code, label in enumerate(group_order)}
batter_df["group_idx"] = batter_df["hitter_group"].map(group_to_code)

# Arrays consumed by the model: number of groups, response, group code per hitter.
G = len(group_order)
y = batter_df["woba_mean"].to_numpy()
group_idx = batter_df["group_idx"].astype("int64").to_numpy()

##### 1.1.2 Probabilistic Model

In [None]:
with pm.Model() as model:
    # Priors: one mean per hitter group and a single shared residual scale.
    mu_g = pm.Normal("mu_g", mu=0.320, sigma=0.050, shape=G)
    sigma = pm.HalfNormal("sigma", sigma=0.050)

    # Likelihood: every hitter's mean wOBA is centred on the mean of its group.
    y_obs = pm.Normal("y_obs", mu=mu_g[group_idx], sigma=sigma, observed=y)

    # Contrasts (optional): Elite mean minus the mean of each other group,
    # registered in the same order as before (Below, Contact, Power).
    contrast_targets = {
        "d_elite_below": 3,
        "d_elite_contact": 1,
        "d_elite_power": 2,
    }
    for contrast_name, other in contrast_targets.items():
        pm.Deterministic(contrast_name, mu_g[0] - mu_g[other])

In [None]:
# Sample the posterior of the main model (4 chains), passing the model
# explicitly instead of entering its context.
trace = pm.sample(chains=4, cores=10, model=model)

##### 1.1.3 Inference

In [None]:
# Trace plots for the sampled variables; the trailing semicolon suppresses the
# textual repr of the returned axes in the notebook output.
az.plot_trace(trace);

In [None]:
# Posterior summary table (95% HDI) for the group means, the residual scale
# and the three Elite-vs-other contrasts.
summary_vars = ["mu_g", "sigma", "d_elite_below", "d_elite_contact", "d_elite_power"]
az.summary(trace, var_names=summary_vars, hdi_prob=0.95)


In [None]:
# Posterior of the Elite-minus-Below contrast with a reference value at 0 and
# a 95% HDI.
az.plot_posterior(trace,var_names=["d_elite_below"],ref_val=0,hdi_prob=0.95);

In [None]:
# One posterior plot per Elite-vs-other contrast, each with a ROPE of +/-0.010.
for contrast_name in ("d_elite_below", "d_elite_contact", "d_elite_power"):
    az.plot_posterior(trace, var_names=[contrast_name], rope=[-0.010, 0.010])

##### 1.1.4 Prior Sensitivity Analysis

In [None]:
with pm.Model() as model_prior_tight:
    # Same structure as the main model, but with a narrower prior on the group
    # means (sigma=0.020) to probe sensitivity to the prior.
    mu_g = pm.Normal("mu_g", mu=0.320, sigma=0.020, shape=G)
    sigma = pm.HalfNormal("sigma", sigma=0.050)

    # Likelihood: every hitter's mean wOBA is centred on the mean of its group.
    y_obs = pm.Normal("y_obs", mu=mu_g[group_idx], sigma=sigma, observed=y)

    # Contrasts: Elite mean minus the mean of each other group, registered in
    # the same order as before (Below, Contact, Power).
    contrast_targets = {
        "d_elite_below": 3,
        "d_elite_contact": 1,
        "d_elite_power": 2,
    }
    for contrast_name, other in contrast_targets.items():
        pm.Deterministic(contrast_name, mu_g[0] - mu_g[other])

In [None]:
with model_prior_tight:
    trace = pm.sample(chains=4, cores=10)

In [None]:
az.summary(trace,var_names=["mu_g", "sigma", "d_elite_below", "d_elite_contact", "d_elite_power"],hdi_prob=0.95)

In [None]:
az.plot_posterior(trace,var_names=["d_elite_below"],rope=[-0.010, 0.010]);
az.plot_posterior(trace,var_names=["d_elite_contact"],rope=[-0.010, 0.010]);
az.plot_posterior(trace,var_names=["d_elite_power"],rope=[-0.010, 0.010]);

##### 1.1.5 Frequentist Comparison

In [None]:
# Frequentist counterpart: OLS of per-hitter mean wOBA on the hitter group,
# treated as a categorical factor.
ols_spec = smf.ols(formula="woba_mean ~ C(hitter_group)", data=batter_df)
anova_model = ols_spec.fit()
print(anova_model.summary())

In [None]:
# ANOVA table (typ=2) computed from the fitted OLS model.
anova_table = sm.stats.anova_lm(anova_model, typ=2)
print(anova_table)

##### 1.1.6 Posterior Predictive Checks

In [None]:
# Posterior predictive check on group means: for each hitter group, compare
# the observed mean wOBA with the distribution of simulated group means.
mu_g_samples = trace.posterior["mu_g"].values.reshape(-1, G)   # one row per draw, one column per group
sigma_samples = trace.posterior["sigma"].values.flatten()

# Random subset of posterior draws (without replacement) used for simulation.
n_draws = 300
idx = np.random.choice(mu_g_samples.shape[0], n_draws, replace=False)

groups = ["Elite", "Contact", "Power", "Below"]

fig = go.Figure()

for g, name in enumerate(groups):
    y_obs_g = y[group_idx == g]
    n_hitters_g = len(y_obs_g)

    # For every selected draw, simulate one value per hitter in the group and
    # keep only the mean of that simulated group.
    y_sim_g = [
        np.random.normal(mu_g_samples[k, g], sigma_samples[k], size=n_hitters_g).mean()
        for k in idx
    ]

    # Observed group mean as a single marker.
    fig.add_trace(
        go.Scatter(
            x=[name],
            y=[y_obs_g.mean()],
            mode="markers",
            marker=dict(size=12),
            name=f"Observed mean ({name})",
        )
    )

    # Distribution of the simulated group means as a box, without raw points.
    fig.add_trace(
        go.Box(
            x=[name] * len(y_sim_g),
            y=y_sim_g,
            name=f"Posterior predictive means ({name})",
            boxpoints=False,
            showlegend=False,
        )
    )

fig.update_layout(
    title="Posterior Predictive Check by Group: Observed vs Simulated Mean wOBA",
    xaxis_title="Hitter Group",
    yaxis_title="Mean wOBA",
)

fig.show()


In [None]:
import numpy as np
import plotly.graph_objects as go

# Posterior samples reshaped for indexing by draw.
mu_g_samples = trace.posterior["mu_g"].values.reshape(-1, G)     # (draws, G)
sigma_samples = trace.posterior["sigma"].values.flatten()        # (draws,)

# Random subset of posterior draws (without replacement).
n_draws = 500
idx = np.random.choice(mu_g_samples.shape[0], n_draws, replace=False)

# Mean of every observation under each selected draw, looked up through the
# observation's group code: one row per draw, one column per observation.
y_hat = mu_g_samples[idx][:, group_idx]

# One replicated data set per selected draw, simulated in the same draw order.
y_rep = np.array([
    np.random.normal(mu_row, sd)
    for mu_row, sd in zip(y_hat, sigma_samples[idx])
])

# Plot: observed values against a random subset of replicated data sets, the
# per-observation average of all replications, and the identity line.
n_ppc_plot = 50
idx_plot = np.random.choice(y_rep.shape[0], n_ppc_plot, replace=False)

fig = go.Figure()

# Individual replicated data sets, hidden from the legend.
replicate_marker = dict(size=5, opacity=0.35)
for draw in idx_plot:
    fig.add_trace(
        go.Scatter(
            x=y,
            y=y_rep[draw],
            mode="markers",
            marker=replicate_marker,
            showlegend=False,
        )
    )

# Average over every replicated data set, one point per hitter.
ppc_mean = y_rep.mean(axis=0)
fig.add_trace(
    go.Scatter(
        x=y,
        y=ppc_mean,
        mode="markers",
        marker=dict(size=9, opacity=0.8),
        name="Posterior Predictive Mean",
    )
)

# Identity line spanning the observed range.
y_lo, y_hi = y.min(), y.max()
fig.add_trace(
    go.Scatter(
        x=[y_lo, y_hi],
        y=[y_lo, y_hi],
        mode="lines",
        line=dict(dash="dash"),
        name="Perfect fit",
    )
)

fig.update_layout(
    title="Posterior Predictive Check: Observed vs Predicted (wOBA)",
    xaxis_title="Observed Mean wOBA (per hitter)",
    yaxis_title="Simulated Mean wOBA from Posterior",
)

fig.show()


## 2. Regression Models

## 3. Hierarchical vs unpooled models

## 4. Model Comparison (WAIC and LOO)