# Calculation of MLB Statistics (wOBA, wRAA, wRC)
This file contains the code to calculate wOBA, wRAA, and wRC manually, since the FanGraphs values for these stats are hidden behind a paywall. Note that the data is not restricted to qualified batters; it includes every batter with at least 1 PA. The reason is that we want to see how these batters do on an aggregate basis rather than on average, since we're looking at wOBA, wRAA, and wRC.

## Aggregating MLB Data from StatCast

In [1]:
import pandas as pd

# Read one StatCast export per season and stack them into a single frame.
# The seasons are driven by one range, so adding a year means changing only
# the upper bound instead of adding a read_csv line and a list entry.
SEASONS = range(2015, 2026)  # 2015 through 2025 inclusive

# Concatenate data (row labels repeat per file here; they are rebuilt when the
# frame is sorted later in this cell)
df_mlb_stats = pd.concat(
    [
        pd.read_csv(f"data/statcast-major-leagues/mlb-stats-{season}.csv")
        for season in SEASONS
    ]
)


# Column name fixing: raw StatCast header -> short stat abbreviation used in
# the rest of the notebook
STATCAST_COLUMN_NAMES = {
    # Player / season identity
    "last_name, first_name": "Name",
    "player_id": "StatCast_ID",
    "year": "Year",
    "player_age": "Age",
    # Counting stats
    "ab": "AB",
    "pa": "PA",
    "single": "1B",
    "double": "2B",
    "triple": "3B",
    "home_run": "HR",
    "strikeout": "K",
    "walk": "BB",
    # Rate stats
    "k_percent": "K%",
    "bb_percent": "BB%",
    "batting_avg": "BA",
    "slg_percent": "SLG",
    "on_base_percent": "OBP",
    "on_base_plus_slg": "OPS",
    "isolated_power": "ISO",
    "babip": "BABIP",
    # Baserunning and miscellaneous events
    "r_total_caught_stealing": "CS",
    "r_total_stolen_base": "SB",
    "b_hit_by_pitch": "HBP",
    "b_intent_walk": "IBB",
    "b_sac_fly": "SF",
    "woba": "wOBA",
    "sprint_speed": "Speed",
}

# Rename in place so df_mlb_stats keeps pointing at the same frame
df_mlb_stats.rename(columns=STATCAST_COLUMN_NAMES, inplace=True)


# Fix name format from "last_name, first_name" to "first_name last_name"
def fix_name(name: str) -> str:
    """Reorder a "Last, First" name into "First Last".

    Only the first comma is treated as the separator, so any later commas stay
    inside the first-name part. Surrounding whitespace is stripped from both
    parts.

    Args:
        name: Player name, normally formatted as "last_name, first_name".

    Returns:
        The name as "first_name last_name". A name without a comma has nothing
        to reorder and is returned stripped instead of raising.
    """
    last, separator, first = name.partition(",")
    if not separator:
        # No comma present: unpacking a two-part split would fail here.
        return name.strip()
    return f"{first.strip()} {last.strip()}"


# Rewrite every name into "first last" order
df_mlb_stats["Name"] = df_mlb_stats["Name"].map(fix_name)

# Sort by name, then by season within each name (both ascending), and rebuild
# the row index as 0..n-1
df_mlb_stats = df_mlb_stats.sort_values(["Name", "Year"]).reset_index(drop=True)

# Display
df_mlb_stats

Unnamed: 0,Name,StatCast_ID,Year,Age,AB,PA,1B,2B,3B,HR,...,OPS,ISO,BABIP,CS,SB,HBP,IBB,SF,wOBA,Speed
0,A.J. Burnett,150359,2015,38,42,52,4,0,0,1,...,0.306,0.071,0.182,0,0,0,0,1,0.130,21.8
1,A.J. Cole,595918,2017,25,14,16,2,0,0,0,...,0.393,0.000,0.222,0,0,0,0,0,0.196,
2,A.J. Ellis,454560,2015,34,181,217,27,9,0,7,...,0.758,0.165,0.265,0,0,1,1,0,0.336,24.9
3,A.J. Ellis,454560,2016,35,171,196,27,8,0,2,...,0.599,0.082,0.252,1,2,2,2,1,0.266,24.6
4,A.J. Ellis,454560,2017,36,143,163,19,5,0,6,...,0.669,0.161,0.222,0,0,6,0,0,0.294,24.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,Óscar Mercado,640458,2019,24,438,482,75,25,3,15,...,0.761,0.174,0.300,4,15,5,0,4,0.321,29.5
7363,Óscar Mercado,640458,2020,25,86,93,9,1,0,1,...,0.348,0.046,0.169,0,3,0,0,1,0.159,28.1
7364,Óscar Mercado,640458,2021,26,214,238,30,11,1,6,...,0.669,0.145,0.253,1,7,2,0,0,0.294,28.9
7365,Óscar Mercado,640458,2022,27,121,128,14,6,1,4,...,0.614,0.165,0.236,2,2,1,0,1,0.266,28.4


## Manually calculating wOBA for more precision
The wOBA values from StatCast are limited to only 3 decimal places, so they're not as precise as they could be. To fix this, we can manually calculate wOBA and get a more precise value for each player.

Formula: $\frac{(wBB*uBB)+(wHBP*HBP)+(w1B*1B)+(w2B*2B)+(w3B*3B)+(wHR*HR)}{AB+BB-IBB+SF+HBP}$, where $uBB = BB - IBB$ (unintentional walks)

In [2]:
# Get woba values: the per-season table that the calculations below filter by
# its "Season" column
WOBA_VALUES_PATH = "data/fixed-values/annual-woba-values.csv"
woba_values = pd.read_csv(WOBA_VALUES_PATH)


def calculate_woba(
    bb: int,
    hbp: int,
    single: int,
    double: int,
    triple: int,
    homerun: int,
    ab: int,
    ibb: int,
    sf: int,
    year: int,
) -> float:
    """Compute a single-season wOBA from raw counting stats.

    The weights come from the row of the module-level ``woba_values`` table
    whose ``Season`` equals ``year``.

    Args:
        bb: Walks, including intentional walks.
        hbp: Hit by pitches.
        single: Singles.
        double: Doubles.
        triple: Triples.
        homerun: Home runs.
        ab: At bats.
        ibb: Intentional walks (removed from both numerator and denominator).
        sf: Sacrifice flies.
        year: Season used to select the weights.

    Returns:
        The weighted numerator divided by ``AB + BB - IBB + SF + HBP``, or NaN
        when that denominator is zero.

    Raises:
        IndexError: If ``woba_values`` has no row for ``year``.
    """
    # Get weights with one filtered lookup instead of re-scanning the table
    # once per weight.
    weights = woba_values.loc[
        woba_values["Season"] == year,
        ["wBB", "wHBP", "w1B", "w2B", "w3B", "wHR"],
    ].iloc[0]

    # Numerator: only unintentional walks (bb - ibb) earn walk credit
    num = (
        (weights["wBB"] * (bb - ibb))
        + (weights["wHBP"] * hbp)
        + (weights["w1B"] * single)
        + (weights["w2B"] * double)
        + (weights["w3B"] * triple)
        + (weights["wHR"] * homerun)
    )

    # Denominator
    denom = ab + bb - ibb + sf + hbp

    # A line made up only of intentional walks (or other uncounted events)
    # has a zero denominator; report that explicitly as NaN rather than
    # relying on how the numeric types happen to handle division by zero.
    if denom == 0:
        return float("nan")

    # Final value
    return num / denom


# Calculate wOBA for every player-season row.
# The columns are listed in the positional order calculate_woba expects.
WOBA_ARGUMENT_COLUMNS = ["BB", "HBP", "1B", "2B", "3B", "HR", "AB", "IBB", "SF", "Year"]


def _woba_for_row(row):
    """Feed one row's counting stats and season to calculate_woba."""
    return calculate_woba(*(row[column] for column in WOBA_ARGUMENT_COLUMNS))


df_mlb_stats["wOBA_calculated"] = df_mlb_stats.apply(_woba_for_row, axis=1)

# Display data
df_mlb_stats

Unnamed: 0,Name,StatCast_ID,Year,Age,AB,PA,1B,2B,3B,HR,...,ISO,BABIP,CS,SB,HBP,IBB,SF,wOBA,Speed,wOBA_calculated
0,A.J. Burnett,150359,2015,38,42,52,4,0,0,1,...,0.071,0.182,0,0,0,0,1,0.130,21.8,0.129977
1,A.J. Cole,595918,2017,25,14,16,2,0,0,0,...,0.000,0.222,0,0,0,0,0,0.196,,0.196250
2,A.J. Ellis,454560,2015,34,181,217,27,9,0,7,...,0.165,0.265,0,0,1,1,0,0.336,24.9,0.335967
3,A.J. Ellis,454560,2016,35,171,196,27,8,0,2,...,0.082,0.252,1,2,2,2,1,0.266,24.6,0.266288
4,A.J. Ellis,454560,2017,36,143,163,19,5,0,6,...,0.161,0.222,0,0,6,0,0,0.294,24.1,0.294143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,Óscar Mercado,640458,2019,24,438,482,75,25,3,15,...,0.174,0.300,4,15,5,0,4,0.321,29.5,0.320583
7363,Óscar Mercado,640458,2020,25,86,93,9,1,0,1,...,0.046,0.169,0,3,0,0,1,0.159,28.1,0.159337
7364,Óscar Mercado,640458,2021,26,214,238,30,11,1,6,...,0.145,0.253,1,7,2,0,0,0.294,28.9,0.293747
7365,Óscar Mercado,640458,2022,27,121,128,14,6,1,4,...,0.165,0.236,2,2,1,0,1,0.266,28.4,0.265594


## Calculate wRAA

Formula: $(\frac{wOBA - lgwOBA}{wOBAScale})* PA$


In [3]:
def calculate_raa(woba: float, pa: float, year: int) -> float:
    """Compute wRAA: ((wOBA - league wOBA) / wOBAScale) * PA.

    The league wOBA and wOBAScale are read from the row of the module-level
    ``woba_values`` table whose ``Season`` equals ``year``.
    """
    season_constants = woba_values.loc[
        woba_values["Season"] == year, ["wOBA", "wOBAScale"]
    ].iloc[0]
    above_average = woba - season_constants["wOBA"]
    return (above_average / season_constants["wOBAScale"]) * pa


# Calculate wRAA from the manually calculated wOBA of each row
def _wraa_for_row(row):
    """Pass one row's calculated wOBA, PA and season to calculate_raa."""
    return calculate_raa(row["wOBA_calculated"], row["PA"], row["Year"])


df_mlb_stats["wRAA"] = df_mlb_stats.apply(_wraa_for_row, axis=1)

# Display data
df_mlb_stats

Unnamed: 0,Name,StatCast_ID,Year,Age,AB,PA,1B,2B,3B,HR,...,BABIP,CS,SB,HBP,IBB,SF,wOBA,Speed,wOBA_calculated,wRAA
0,A.J. Burnett,150359,2015,38,42,52,4,0,0,1,...,0.182,0,0,0,0,1,0.130,21.8,0.129977,-7.607681
1,A.J. Cole,595918,2017,25,14,16,2,0,0,0,...,0.222,0,0,0,0,0,0.196,,0.196250,-1.684388
2,A.J. Ellis,454560,2015,34,181,217,27,9,0,7,...,0.265,0,0,1,1,0,0.336,24.9,0.335967,3.983908
3,A.J. Ellis,454560,2016,35,171,196,27,8,0,2,...,0.252,1,2,2,2,1,0.266,24.6,0.266288,-8.362673
4,A.J. Ellis,454560,2017,36,143,163,19,5,0,6,...,0.222,0,0,6,0,0,0.294,24.1,0.294143,-3.694274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,Óscar Mercado,640458,2019,24,438,482,75,25,3,15,...,0.300,4,15,5,0,4,0.321,29.5,0.320583,0.242940
7363,Óscar Mercado,640458,2020,25,86,93,9,1,0,1,...,0.169,0,3,0,0,1,0.159,28.1,0.159337,-12.608998
7364,Óscar Mercado,640458,2021,26,214,238,30,11,1,6,...,0.253,1,7,2,0,0,0.294,28.9,0.293747,-3.986975
7365,Óscar Mercado,640458,2022,27,121,128,14,6,1,4,...,0.236,2,2,1,0,1,0.266,28.4,0.265594,-4.514694


## Calculate wRC

Formula: $(\frac{wOBA - lgwOBA}{wOBAScale} + \frac{lgRun}{PA}) * PA$

In [4]:
def calculate_wrc(woba: float, pa: int, year: int) -> float:
    """Compute wRC: (((wOBA - league wOBA) / wOBAScale) + league R/PA) * PA.

    The league wOBA, wOBAScale and R/PA are read from the row of the
    module-level ``woba_values`` table whose ``Season`` equals ``year``.
    """
    season_constants = woba_values.loc[
        woba_values["Season"] == year, ["wOBA", "wOBAScale", "R/PA"]
    ].iloc[0]
    runs_above_average_per_pa = (
        woba - season_constants["wOBA"]
    ) / season_constants["wOBAScale"]
    return (runs_above_average_per_pa + season_constants["R/PA"]) * pa


# Calculate wRC from the manually calculated wOBA of each row
def _wrc_for_row(row):
    """Pass one row's calculated wOBA, PA and season to calculate_wrc."""
    return calculate_wrc(row["wOBA_calculated"], row["PA"], row["Year"])


df_mlb_stats["wRC"] = df_mlb_stats.apply(_wrc_for_row, axis=1)

# Display data
df_mlb_stats

Unnamed: 0,Name,StatCast_ID,Year,Age,AB,PA,1B,2B,3B,HR,...,CS,SB,HBP,IBB,SF,wOBA,Speed,wOBA_calculated,wRAA,wRC
0,A.J. Burnett,150359,2015,38,42,52,4,0,0,1,...,0,0,0,0,1,0.130,21.8,0.129977,-7.607681,-1.783681
1,A.J. Cole,595918,2017,25,14,16,2,0,0,0,...,0,0,0,0,0,0.196,,0.196250,-1.684388,0.267612
2,A.J. Ellis,454560,2015,34,181,217,27,9,0,7,...,0,0,1,1,0,0.336,24.9,0.335967,3.983908,28.287908
3,A.J. Ellis,454560,2016,35,171,196,27,8,0,2,...,1,2,2,2,1,0.266,24.6,0.266288,-8.362673,14.765327
4,A.J. Ellis,454560,2017,36,143,163,19,5,0,6,...,0,0,6,0,0,0.294,24.1,0.294143,-3.694274,16.191726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,Óscar Mercado,640458,2019,24,438,482,75,25,3,15,...,4,15,5,0,4,0.321,29.5,0.320583,0.242940,60.974940
7363,Óscar Mercado,640458,2020,25,86,93,9,1,0,1,...,0,3,0,0,1,0.159,28.1,0.159337,-12.608998,-0.983998
7364,Óscar Mercado,640458,2021,26,214,238,30,11,1,6,...,1,7,2,0,0,0.294,28.9,0.293747,-3.986975,24.811025
7365,Óscar Mercado,640458,2022,27,121,128,14,6,1,4,...,2,2,1,0,1,0.266,28.4,0.265594,-4.514694,10.077306


## TODO: Calculate wRC+

Formula: $\frac{(wRAA/PA + lgRun/PA) + (lgRun/PA - (ParkFactor * lgRun/PA))}{wRC^*/PA} * 100$

where $\frac{wRC^*}{PA}$ = AL or NL $\frac{wRC}{PA}$ excluding pitchers

In [5]:
def calculate_wrc_plus(wraa: float, pa: int, year: int) -> float:
    """Placeholder for the wRC+ calculation, which is not implemented yet.

    All three arguments are currently ignored and 0 is always returned.
    """
    # TODO: replace this stub with the real wRC+ computation.
    return 0

## Calculate Aggregate Stats in the MLB
Calculate the total career counting stats for the players in the MLB. This way, we will know how much a player is contributing to their team across their career which is the point of creating a minor league batting prediction model - to assess future total potential and contribution of a player.

### Career wOBA

In [6]:
def calculate_woba_numerator(
    bb: int,
    hbp: int,
    single: int,
    double: int,
    triple: int,
    homerun: int,
    ab: int,
    ibb: int,
    sf: int,
    year: int,
) -> float:
    """Compute only the weighted numerator of the wOBA formula for one season.

    The weights come from the row of the module-level ``woba_values`` table
    whose ``Season`` equals ``year``. ``ab`` and ``sf`` are accepted but not
    used, because they only appear in the wOBA denominator.
    """
    season_weights = woba_values.loc[
        woba_values["Season"] == year,
        ["wBB", "wHBP", "w1B", "w2B", "w3B", "wHR"],
    ].iloc[0]

    # Walk credit applies to unintentional walks only
    unintentional_walks = bb - ibb

    return (
        (season_weights["wBB"] * unintentional_walks)
        + (season_weights["wHBP"] * hbp)
        + (season_weights["w1B"] * single)
        + (season_weights["w2B"] * double)
        + (season_weights["w3B"] * triple)
        + (season_weights["wHR"] * homerun)
    )


# Calculate numerator wOBA for every player-season row.
# The columns are listed in the positional order calculate_woba_numerator
# expects.
NUMERATOR_ARGUMENT_COLUMNS = [
    "BB",
    "HBP",
    "1B",
    "2B",
    "3B",
    "HR",
    "AB",
    "IBB",
    "SF",
    "Year",
]


def _woba_numerator_for_row(row):
    """Feed one row's counting stats and season to calculate_woba_numerator."""
    return calculate_woba_numerator(
        *(row[column] for column in NUMERATOR_ARGUMENT_COLUMNS)
    )


df_mlb_stats["numerator_woba"] = df_mlb_stats.apply(_woba_numerator_for_row, axis=1)
df_mlb_stats

Unnamed: 0,Name,StatCast_ID,Year,Age,AB,PA,1B,2B,3B,HR,...,SB,HBP,IBB,SF,wOBA,Speed,wOBA_calculated,wRAA,wRC,numerator_woba
0,A.J. Burnett,150359,2015,38,42,52,4,0,0,1,...,0,0,0,1,0.130,21.8,0.129977,-7.607681,-1.783681,5.589
1,A.J. Cole,595918,2017,25,14,16,2,0,0,0,...,0,0,0,0,0.196,,0.196250,-1.684388,0.267612,3.140
2,A.J. Ellis,454560,2015,34,181,217,27,9,0,7,...,0,1,1,0,0.336,24.9,0.335967,3.983908,28.287908,71.561
3,A.J. Ellis,454560,2016,35,171,196,27,8,0,2,...,2,2,2,1,0.266,24.6,0.266288,-8.362673,14.765327,50.861
4,A.J. Ellis,454560,2017,36,143,163,19,5,0,6,...,0,6,0,0,0.294,24.1,0.294143,-3.694274,16.191726,47.357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,Óscar Mercado,640458,2019,24,438,482,75,25,3,15,...,15,5,0,4,0.321,29.5,0.320583,0.242940,60.974940,152.277
7363,Óscar Mercado,640458,2020,25,86,93,9,1,0,1,...,3,0,0,1,0.159,28.1,0.159337,-12.608998,-0.983998,14.659
7364,Óscar Mercado,640458,2021,26,214,238,30,11,1,6,...,7,2,0,0,0.294,28.9,0.293747,-3.986975,24.811025,69.618
7365,Óscar Mercado,640458,2022,27,121,128,14,6,1,4,...,2,1,0,1,0.266,28.4,0.265594,-4.514694,10.077306,33.996


In [7]:
# Aggregating counting stats: one row per (Name, StatCast_ID). Counting stats
# are summed, Speed is averaged, and the season wOBA numerators are summed so
# career wOBA can be taken as sum(numerator) / (AB + BB - IBB + HBP + SF).
summed_columns = [
    "AB",
    "PA",
    "1B",
    "2B",
    "3B",
    "HR",
    "K",
    "BB",
    "CS",
    "SB",
    "HBP",
    "IBB",
    "SF",
]
career_aggregations = {column: "sum" for column in summed_columns}
career_aggregations["Speed"] = "mean"
career_aggregations["numerator_woba"] = "sum"

df_career_stats = (
    df_mlb_stats.groupby(["Name", "StatCast_ID"])
    .agg(career_aggregations)
    .reset_index()
)

# Career totals reused by several rate stats
at_bats = df_career_stats["AB"]
plate_appearances = df_career_stats["PA"]
singles = df_career_stats["1B"]
doubles = df_career_stats["2B"]
triples = df_career_stats["3B"]
home_runs = df_career_stats["HR"]
strikeouts = df_career_stats["K"]
walks = df_career_stats["BB"]
hit_by_pitches = df_career_stats["HBP"]
intentional_walks = df_career_stats["IBB"]
sac_flies = df_career_stats["SF"]
hits = singles + doubles + triples + home_runs

# Calculating rate stats (all rounded to 3 decimals)
df_career_stats["K%"] = ((strikeouts / plate_appearances) * 100).round(3)
df_career_stats["BB%"] = ((walks / plate_appearances) * 100).round(3)
df_career_stats["BA"] = (hits / at_bats).round(3)
df_career_stats["SLG"] = (
    (singles + doubles * 2 + triples * 3 + home_runs * 4) / at_bats
).round(3)
df_career_stats["OBP"] = (
    (hits + walks + hit_by_pitches)
    / (at_bats + walks + hit_by_pitches + sac_flies)
).round(3)
# OPS is the sum of the already-rounded OBP and SLG columns
df_career_stats["OPS"] = (df_career_stats["OBP"] + df_career_stats["SLG"]).round(3)
df_career_stats["ISO"] = (
    (doubles * 1 + triples * 2 + home_runs * 3) / at_bats
).round(3)
df_career_stats["BABIP"] = (
    (singles + doubles + triples)
    / (at_bats - strikeouts - home_runs + sac_flies)
).round(3)
df_career_stats["wOBA"] = (
    df_career_stats["numerator_woba"]
    / (at_bats + walks - intentional_walks + hit_by_pitches + sac_flies)
).round(3)

# Remove numerator_wOBA from df_mlb_stats; it was only needed for the
# aggregation above
df_mlb_stats.drop(columns="numerator_woba", inplace=True)

# Reorder (this selection also leaves numerator_woba out of the career frame)
col_order = [
    "Name",
    "StatCast_ID",
    "AB",
    "PA",
    "1B",
    "2B",
    "3B",
    "HR",
    "K",
    "BB",
    "K%",
    "BB%",
    "BA",
    "SLG",
    "OBP",
    "OPS",
    "ISO",
    "BABIP",
    "CS",
    "SB",
    "HBP",
    "IBB",
    "SF",
    "wOBA",
    "Speed",
]
df_career_stats = df_career_stats[col_order]

# Display
df_career_stats

Unnamed: 0,Name,StatCast_ID,AB,PA,1B,2B,3B,HR,K,BB,...,OPS,ISO,BABIP,CS,SB,HBP,IBB,SF,wOBA,Speed
0,A.J. Burnett,150359,42,52,4,0,0,1,20,0,...,0.306,0.071,0.182,0,0,0,0,1,0.130,21.800
1,A.J. Cole,595918,14,16,2,0,0,0,5,2,...,0.393,0.000,0.222,0,0,0,0,0,0.196,
2,A.J. Ellis,454560,646,759,105,30,0,16,135,89,...,0.688,0.121,0.271,1,2,10,4,3,0.306,24.425
3,A.J. Jimenez,543362,12,13,1,0,0,0,7,0,...,0.166,0.000,0.200,0,0,0,0,0,0.073,
4,A.J. Pierzynski,150229,654,695,125,39,1,11,66,25,...,0.686,0.113,0.283,2,1,10,3,6,0.297,24.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,Zack Short,670097,506,594,54,18,0,15,172,70,...,0.567,0.125,0.220,2,10,2,1,9,0.255,27.650
1972,Zack Wheeler,554430,208,226,28,7,0,1,91,3,...,0.413,0.048,0.299,0,0,2,0,1,0.181,24.550
1973,Ángel Pagán,434636,1007,1092,203,45,8,15,159,74,...,0.691,0.105,0.304,8,27,1,1,9,0.302,27.900
1974,Óscar Hernández,591712,42,47,5,1,0,1,15,3,...,0.501,0.095,0.231,0,0,1,0,0,0.227,26.600


## Save data

In [9]:
# Write the per-season frame and the career frame to CSV without the row index
for frame, destination in (
    (df_mlb_stats, "data/full-mlb-stats.csv"),
    (df_career_stats, "data/mlb-career-stats.csv"),
):
    frame.to_csv(destination, index=False)