# A Bayesian Analysis of Team Performance Metrics in Major League Baseball

by Mecchia Alessandro and Sergio Fernandez Diz

## Introduction

This work uses Bayesian methods to analyze the relationship between batting metrics and team success in Major League Baseball, with particular attention to metrics emphasized by the Moneyball philosophy

In [1]:
import pybaseball as pb
from pybaseball import statcast
import pandas as pd
from pybaseball import cache
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [2]:
cache.enable()

In [3]:
df = statcast(start_dt='2024-10-01', end_dt='2025-04-30')

This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 93/93 [00:03<00:00, 24.54it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


In [4]:
# priors: usiamo le funzioni pitcher_stats e batting_stats per ottenere i dati delle prestazioni dei lanciatori e dei battitori
# le usiamo pure per EDA. poi utilizzeremo statcast per ottenere dati più dettagliati sulle partite specifiche
batter_stats = pb.batting_stats_bref(2024)
pitcher_stats = pb.pitching_stats_bref(2024)

In [37]:
# Info complete su tutte le colonne
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191443 entries, 2729 to 1040
Columns: 118 entries, pitch_type to intercept_ball_minus_batter_pos_y_inches
dtypes: Float64(42), Int64(59), datetime64[ns](1), object(16)
memory usage: 192.3+ MB


In [35]:
#filtra tutte le partite del player Matt Svanson
svanson_games = df[(df['player_name'] == 'Svanson, Matt')]
svanson_games.head(3)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
2729,SI,2025-04-30,96.3,-1.46,5.63,"Svanson, Matt",668715,694335,field_out,hit_into_play,...,0,1.95,1.33,1.33,31.2,13.333303,-1.096977,23.937509,36.079394,28.806127
2786,ST,2025-04-30,86.8,-1.93,5.69,"Svanson, Matt",668715,694335,,ball,...,0,2.82,-0.73,-0.73,30.1,,,,,
2915,SI,2025-04-30,96.2,-1.64,5.7,"Svanson, Matt",668715,694335,,called_strike,...,0,1.88,1.33,1.33,29.3,,,,,


## Data and EDA

### batting metrics

In [5]:
import codecs
batter_stats["Name"] = (
    batter_stats["Name"]
    .astype(str)
    .apply(lambda x: codecs.decode(x, "unicode_escape"))
    .str.encode("latin1")
    .str.decode("utf-8")
)
batter_stats.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,mlbID
1,CJ Abrams,23,474,Maj-NL,Washington,138,602,541,79,133,...,1,4,4,31,12,0.246,0.314,0.433,0.747,682928
2,José Abreu,37,574,Maj-AL,Houston,35,120,113,10,14,...,0,1,4,0,0,0.124,0.167,0.195,0.361,547989
3,Wilyer Abreu,25,465,Maj-AL,Boston,132,446,398,59,101,...,0,5,7,8,3,0.254,0.323,0.46,0.783,677800
4,Luisangel Acuña,22,451,Maj-NL,New York,17,43,42,6,12,...,0,0,1,0,1,0.286,0.302,0.595,0.898,682668
5,Ronald Acuña Jr.,26,591,Maj-NL,Atlanta,49,222,192,38,48,...,0,0,4,16,3,0.25,0.351,0.365,0.716,660670


In [6]:
batter_stats.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'PA', 'AB', 'R', 'H', '2B',
       '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB',
       'CS', 'BA', 'OBP', 'SLG', 'OPS', 'mlbID'],
      dtype='object')

In [10]:
batter_stats_filtered = batter_stats[batter_stats['AB'] >= 20].copy()

In [12]:
# Top 10 OPS players
top_ops = batter_stats_filtered.nlargest(10, 'OPS')[['Name', 'Tm', 'AB', 'BA', 'OBP', 'SLG', 'OPS', 'HR', 'RBI']]

# Create grouped bar chart
fig = go.Figure()

fig.add_trace(go.Bar(
    name='OBP',
    x=top_ops['Name'],
    y=top_ops['OBP'],
    marker_color='#3498db',
    text=top_ops['OBP'].round(3),
    textposition='inside',
    hovertemplate='<b>%{x}</b><br>OBP: %{y:.3f}<extra></extra>'
))

fig.add_trace(go.Bar(
    name='SLG',
    x=top_ops['Name'],
    y=top_ops['SLG'],
    marker_color='#e74c3c',
    text=top_ops['SLG'].round(3),
    textposition='inside',
    hovertemplate='<b>%{x}</b><br>SLG: %{y:.3f}<extra></extra>'
))

fig.update_layout(
    title='Top 10 Players by OPS (2024 Season)',
    xaxis_title='Player',
    yaxis_title='Value',
    barmode='group',
    height=600,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(tickangle=-45),
    yaxis=dict(gridcolor='#e1e1e1'),
    legend=dict(title='Metric', font=dict(size=12))
)

fig.show()


In [13]:
# Violin plots for key metrics
metrics_to_plot = ['BA', 'OBP', 'SLG', 'OPS']

fig = go.Figure()

for i, metric in enumerate(metrics_to_plot):
    fig.add_trace(go.Violin(
        y=batter_stats_filtered[metric],
        name=metric,
        box_visible=True,
        meanline_visible=True,
        fillcolor=['#3498db', '#2ecc71', '#e74c3c', '#f39c12'][i],
        opacity=0.6,
        x0=metric
    ))

fig.update_layout(
    title='Violin Plots of Key Batting Metrics (2024 Season)',
    yaxis_title='Value',
    xaxis_title='Metric',
    height=600,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    showlegend=False,
    yaxis=dict(gridcolor='#e1e1e1')
)

fig.show()


In [14]:
# OBP vs SLG scatter (components of OPS)
median_obp = batter_stats_filtered['OBP'].median()
median_slg = batter_stats_filtered['SLG'].median()

# Create categories based on quadrants
def categorize_player(row):
    if row['OBP'] >= median_obp and row['SLG'] >= median_slg:
        return 'Elite (High OBP & SLG)'
    elif row['OBP'] >= median_obp and row['SLG'] < median_slg:
        return 'Contact (High OBP)'
    elif row['OBP'] < median_obp and row['SLG'] >= median_slg:
        return 'Power (High SLG)'
    else:
        return 'Below Average'

batter_stats_filtered['Category'] = batter_stats_filtered.apply(categorize_player, axis=1)

fig = px.scatter(
    batter_stats_filtered,
    x='SLG',
    y='OBP',
    color='Category',
    size='AB',
    hover_data=['Name', 'Tm', 'HR', 'BA', 'OPS'],
    title='Player Categories: OBP vs SLG Quadrant Analysis',
)

# Add median lines
fig.add_hline(y=median_obp, line_dash="dash", line_color="gray", 
              annotation_text=f"Median OBP: {median_obp:.3f}",
              annotation_position="right")
fig.add_vline(x=median_slg, line_dash="dash", line_color="gray",
              annotation_text=f"Median SLG: {median_slg:.3f}",
              annotation_position="top")

fig.update_layout(
    title_font=dict(size=20, color='#2c3e50', family='Arial Black'),
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1', title='Slugging Percentage (SLG)'),
    yaxis=dict(gridcolor='#e1e1e1', title='On-Base Percentage (OBP)'),
    legend=dict(title='Player Category', font=dict(size=11))
)

fig.show()


In [15]:
from scipy.stats import linregress
# HR vs SO scatter
fig = px.scatter(
    batter_stats_filtered,
    x='SO',
    y='HR',
    size='AB',
    color='SLG',
    hover_data=['Name', 'Tm', 'BA', 'OPS'],
    color_continuous_scale='Plasma',
    title='Home Runs vs Strikeouts - The Power Hitter Trade-off'
)

# Add trend line
mask = batter_stats_filtered['SO'].notna() & batter_stats_filtered['HR'].notna()
slope_hr, intercept_hr, r_hr, _, _ = linregress(
    batter_stats_filtered[mask]['SO'], 
    batter_stats_filtered[mask]['HR']
)

x_hr_reg = np.array([batter_stats_filtered['SO'].min(), batter_stats_filtered['SO'].max()])
y_hr_reg = slope_hr * x_hr_reg + intercept_hr

fig.add_trace(go.Scatter(
    x=x_hr_reg,
    y=y_hr_reg,
    mode='lines',
    line=dict(color='red', width=3, dash='dash'),
    hoverinfo='skip',
    showlegend=False
))

fig.update_layout(
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1', title='Strikeouts (SO)'),
    yaxis=dict(gridcolor='#e1e1e1', title='Home Runs (HR)'),
    coloraxis_colorbar=dict(title="SLG")
)

fig.show()


In [16]:
# OBP vs BA scatter plot
fig = px.scatter(
    batter_stats_filtered,
    x='BA',
    y='OBP',
    size='AB',
    color='OPS',
    hover_data=['Name', 'Tm'],
    color_continuous_scale='Viridis',
    title='On-Base Percentage vs Batting Average (sized by At-Bats, colored by OPS)'
)

# Add diagonal reference line (OBP = BA)
ba_range = [batter_stats_filtered['BA'].min(), batter_stats_filtered['BA'].max()]
fig.add_trace(go.Scatter(
    x=ba_range,
    y=ba_range,
    mode='lines',
    line=dict(color='red', dash='dash', width=2),
    name='OBP = BA',
    showlegend=False,
    hoverinfo='skip'
))

mask = batter_stats_filtered['BA'].notna() & batter_stats_filtered['OBP'].notna()
slope, intercept, r_value, p_value, std_err = linregress(
    batter_stats_filtered[mask]['BA'], 
    batter_stats_filtered[mask]['OBP']
)

x_reg = np.array([batter_stats_filtered['BA'].min(), batter_stats_filtered['BA'].max()])
y_reg = slope * x_reg + intercept

fig.add_trace(go.Scatter(
    x=x_reg,
    y=y_reg,
    mode='lines',
    line=dict(color='blue', width=3),
    showlegend=False,
    hoverinfo='skip'
))

fig.update_layout(
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1', title='Batting Average (BA)'),
    yaxis=dict(gridcolor='#e1e1e1', title='On-Base Percentage (OBP)'),
    coloraxis_colorbar=dict(title="OPS")
)

fig.show()

In [17]:
# Select key metrics for correlation analysis
corr_metrics = ['AB', 'H', 'HR', 'RBI', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS']
corr_data = batter_stats_filtered[corr_metrics].corr()

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=corr_data.values,
    x=corr_data.columns,
    y=corr_data.columns,
    colorscale='RdBu_r',
    zmid=0,
    text=corr_data.values.round(2),
    texttemplate='%{text}',
    textfont={"size": 10},
    colorbar=dict(title="Correlation"),
    hovertemplate='%{x} vs %{y}<br>Correlation: %{z:.3f}<extra></extra>'
))

fig.update_layout(
    title='Correlation Heatmap of Key Batting Metrics (2024 Season)',
    width=900,
    height=800,
    xaxis=dict(side='bottom'),
    yaxis=dict(autorange='reversed'),
    plot_bgcolor='white',
    paper_bgcolor='white'
)

fig.show()


In [18]:
# Calculate advanced metrics
batter_stats_filtered['ISO'] = batter_stats_filtered['SLG'] - batter_stats_filtered['BA']  # Isolated Power
batter_stats_filtered['BB_Rate'] = (batter_stats_filtered['BB'] / batter_stats_filtered['PA']) * 100  # Walk Rate %
batter_stats_filtered['K_Rate'] = (batter_stats_filtered['SO'] / batter_stats_filtered['PA']) * 100  # Strikeout Rate %
batter_stats_filtered['HR_Rate'] = (batter_stats_filtered['HR'] / batter_stats_filtered['PA']) * 100  # Home Run Rate %

top_iso = batter_stats_filtered.nlargest(10, 'ISO')[['Name', 'Tm', 'BA', 'SLG', 'ISO', 'HR']]

top_bb = batter_stats_filtered.nlargest(10, 'BB_Rate')[['Name', 'Tm', 'BB_Rate', 'BB', 'OBP']]

In [19]:
# Aggregate statistics by team
team_stats = batter_stats_filtered.groupby('Tm').agg({
    'BA': 'mean',
    'OBP': 'mean',
    'SLG': 'mean',
    'OPS': 'mean',
    'HR': 'sum',
    'RBI': 'sum',
    'BB': 'sum',
    'SO': 'sum',
    'AB': 'sum'
}).reset_index()

# Rename columns for clarity
team_stats.columns = ['Tm', 'Avg_BA', 'Avg_OBP', 'Avg_SLG', 'Avg_OPS', 
                      'Total_HR', 'Total_RBI', 'Total_BB', 'Total_SO', 'Total_AB']

# Sort by average OPS
team_stats = team_stats.sort_values('Avg_OPS', ascending=False)

team_stats.head(10)

Unnamed: 0,Tm,Avg_BA,Avg_OBP,Avg_SLG,Avg_OPS,Total_HR,Total_RBI,Total_BB,Total_SO,Total_AB
75,"Seattle,Washington",0.307,0.381,0.433,0.814,4,28,21,53,254
65,"New York,Washington",0.257,0.37,0.416,0.786,15,62,70,110,452
10,"Atlanta,San Francisco",0.242,0.339,0.446,0.785,22,65,69,143,500
56,"Miami,New York",0.25,0.317,0.425,0.742,26,75,57,166,617
67,Philadelphia,0.25375,0.317937,0.423687,0.741437,196,733,501,1318,5303
5,"Atlanta,Cleveland",0.252,0.303,0.425,0.728,11,33,15,100,294
59,"Miami,San Diego",0.31,0.341,0.385,0.726,4,46,24,30,668
0,Arizona,0.246722,0.321167,0.404444,0.725556,206,818,541,1222,5327
2,"Arizona,Miami",0.249,0.319,0.405,0.725,19,71,51,120,538
73,"Seattle,Tampa Bay",0.219,0.332,0.388,0.72,20,60,73,169,549


In [20]:
# 4D Bubble chart: OBP vs SLG, size=HR, color=BB_Rate
fig = px.scatter(
    batter_stats_filtered,
    x='OBP',
    y='SLG',
    size='HR',
    color='BB_Rate',
    hover_data=['Name', 'Tm', 'BA', 'OPS'],
    color_continuous_scale='Viridis',
    title=' OBP vs SLG',
    size_max=60
)

fig.update_layout(
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1', title='On-Base Percentage (OBP)'),
    yaxis=dict(gridcolor='#e1e1e1', title='Slugging Percentage (SLG)'),
    coloraxis_colorbar=dict(title="BB Rate %")
)

fig.show()

In [21]:
# Create radar chart for top 5 OPS players
top5_players = batter_stats_filtered.nlargest(5, 'OPS')

scaler_radar = MinMaxScaler()

radar_metrics = ['BA', 'OBP', 'SLG', 'ISO', 'BB_Rate', 'HR_Rate']
top5_scaled = top5_players.copy()
top5_scaled[radar_metrics] = scaler_radar.fit_transform(top5_players[radar_metrics])

fig = go.Figure()

colors_radar = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6']

for idx, (_, player) in enumerate(top5_scaled.iterrows()):
    fig.add_trace(go.Scatterpolar(
        r=player[radar_metrics].values,
        theta=radar_metrics,
        fill='toself',
        name=player['Name'],
        line_color=colors_radar[idx],
        opacity=0.6
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1]
        ),
        bgcolor='#f8f9fa'
    ),
    title='Top 5 Players Radar Chart',
    height=700,
    showlegend=True,
    paper_bgcolor='white',
    legend=dict(font=dict(size=12))
)

fig.show()

In [22]:
# ISO vs BA scatter
fig = px.scatter(
    batter_stats_filtered,
    x='BA',
    y='ISO',
    size='HR',
    color='SLG',
    hover_data=['Name', 'Tm', 'OPS'],
    color_continuous_scale='Hot',
    title='Isolated Power (ISO) vs Batting Average - Power Analysis'
)

fig.update_layout(
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1', title='Batting Average (BA)'),
    yaxis=dict(gridcolor='#e1e1e1', title='Isolated Power (ISO = SLG - BA)'),
    coloraxis_colorbar=dict(title="SLG")
)

fig.show()


In [23]:
# BB% vs K% scatter plot
fig = px.scatter(
    batter_stats_filtered,
    x='K_Rate',
    y='BB_Rate',
    size='AB',
    color='OPS',
    hover_data=['Name', 'Tm'],
    color_continuous_scale='Spectral_r',
    title='Plate Discipline: Walk Rate vs Strikeout Rate'
)

# Add quadrant lines
median_k = batter_stats_filtered['K_Rate'].median()
median_bb = batter_stats_filtered['BB_Rate'].median()

fig.add_hline(y=median_bb, line_dash="dash", line_color="gray", 
              annotation_text=f"Median BB%: {median_bb:.1f}%")
fig.add_vline(x=median_k, line_dash="dash", line_color="gray",
              annotation_text=f"Median K%: {median_k:.1f}%")

fig.update_layout(
    title_font=dict(size=20, color='#2c3e50', family='Arial Black'),
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1', title='Strikeout Rate (%)'),
    yaxis=dict(gridcolor='#e1e1e1', title='Walk Rate (%)'),
    coloraxis_colorbar=dict(title="OPS")
)

fig.show()


In [39]:

# Distribution of At-Bats (AB) and Plate Appearances (PA)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Distribution of At-Bats (AB)', 'Distribution of Plate Appearances (PA)')
)

fig.add_trace(
    go.Histogram(
        x=batter_stats_filtered['AB'],
        marker_color='#3498db',
        nbinsx=50,
        name='AB',
        showlegend=False
    ),
    row=1, col=1
)

fig.add_trace(
    go.Histogram(
        x=batter_stats_filtered['PA'],
        marker_color='#e74c3c',
        nbinsx=50,
        name='PA',
        showlegend=False
    ),
    row=1, col=2
)

fig.update_layout(
    title='Exposure Distribution: Sample Size Variability',
    height=500,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    showlegend=False
)

fig.update_xaxes(title_text='At-Bats (AB)', row=1, col=1, gridcolor='#e1e1e1')
fig.update_xaxes(title_text='Plate Appearances (PA)', row=1, col=2, gridcolor='#e1e1e1')
fig.update_yaxes(title_text='Count', gridcolor='#e1e1e1')

fig.show()

In [42]:
# 📊 AB vs BA - VARIANCE vs SAMPLE SIZE
# Questo mostra perché serve il Bayesian: alta varianza con pochi AB
fig = px.scatter(
    batter_stats_filtered,
    x='AB',
    y='BA',
    size='H',
    color='PA',
    hover_data=['Name', 'Tm', 'H', 'PA'],
    color_continuous_scale='Viridis',
    title='Batting Average vs At-Bats: The Shrinkage Problem',
    labels={'AB': 'At-Bats (Sample Size)', 'BA': 'Batting Average'}
)

# Add smoothed trend line
from scipy.ndimage import gaussian_filter1d
ab_sorted = batter_stats_filtered.sort_values('AB')
ab_smooth = gaussian_filter1d(ab_sorted['BA'], sigma=10)

fig.add_trace(go.Scatter(
    x=ab_sorted['AB'],
    y=ab_smooth,
    mode='lines',
    line=dict(color='red', width=3, dash='dash'),
    name='Smoothed Trend',
    showlegend=True
))

fig.update_layout(
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1'),
    yaxis=dict(gridcolor='#e1e1e1'),
    
)

fig.show()

In [None]:

fig = px.scatter(
    batter_stats_filtered,
    x='PA',
    y='HR_Rate',
    size='HR',
    color='ISO',
    hover_data=['Name', 'Tm', 'HR', 'AB'],
    color_continuous_scale='Hot',
    title='Home Run Rate vs Plate Appearances: Rate Stability',
    labels={'PA': 'Plate Appearances (Exposure)', 'HR_Rate': 'HR Rate (%)'}
)

fig.update_layout(
    height=700,
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    xaxis=dict(gridcolor='#e1e1e1', title='Plate Appearances (PA)'),
    yaxis=dict(gridcolor='#e1e1e1', title='Home Run Rate (%)'),
    coloraxis_colorbar=dict(title="ISO")
)

fig.show()

## Bayesian hypothesis testing

### 1. H: On-Base Percentage has a larger positive effect on team wins than Batting Average

The main hypothesis tested in this work is that On-Base Percentage has a stronger association with team wins than Batting Average. This hypothesis is formalized by comparing the posterior distributions of the corresponding regression coefficients. 

$$ H_0: \beta_{OBP} > \beta_{BA} $$

## Regression Models

## Hirerchical vs unpooled models

## Model Comparison (WAIC and LOO)