In [None]:
import numpy as np
import math
import os
import pandas as pd
import pickle
import psycopg2
import scipy
import sklearn
import xgboost as xgb #v1.6.0
import zipfile

import matplotlib.pyplot as plt
import seaborn as sns

from dotenv import load_dotenv
from pathlib import Path
from scipy import stats

In [None]:
# Hex colours shared by the figures in this notebook.
pl_white = '#FEFEFE'
pl_background = '#162B50'
pl_text = '#72a3f7'
pl_line_color = '#293a6b'

# Style keys are grouped by the colour they share so the theme reads at a glance.
sns.set_theme(
    style={
        # Foreground elements: labels, ticks and free text.
        **dict.fromkeys(
            ['axes.labelcolor', 'xtick.color', 'ytick.color', 'text.color'],
            pl_white,
        ),
        # Everything painted in the background colour (the grid is therefore invisible).
        **dict.fromkeys(
            ['axes.facecolor', 'figure.facecolor', 'grid.color', 'legend.facecolor'],
            pl_background,
        ),
        'axes.edgecolor': pl_line_color,
        'grid.linestyle': '-',
    }
)

In [None]:
# Load the database credentials from the local .env file into the process environment.
dotenv_path = Path('pitcherlist_datascience.env')
load_dotenv(dotenv_path=dotenv_path)

# Credentials are passed as keyword arguments instead of being interpolated into a
# quoted DSN string, so a password containing a quote or a space cannot corrupt it.
conn = psycopg2.connect(
    dbname=os.environ.get('PL_DB_DATABASE'),
    user=os.environ.get('PL_DB_USER'),
    host=os.environ.get('PL_DB_HOST'),
    password=os.environ.get('PL_DB_PASSWORD'),
)
try:
    # The cursor context manager closes the cursor; the finally clause closes the
    # connection even when the query or the fetch raises.
    with conn.cursor() as cursor:
        cursor.execute("Select * FROM plv_inputs where year_played >= '2019'")
        colnames = [desc[0] for desc in cursor.description]
        data = cursor.fetchall()
finally:
    conn.close()

# Supplying the column names at construction time also works for an empty result set,
# where assigning `.columns` afterwards would fail with a length mismatch.
db_data = pd.DataFrame(data, columns=colnames)

In [None]:
# Inspect the column names that came back from the query.
db_data.columns.values

In [None]:
# Collapse the raw pitch descriptions into a small set of outcome labels.
# The mapping is written outcome -> raw descriptions and flattened into the
# {raw: outcome} dict that Series.replace expects. Descriptions that are not
# listed are left untouched by replace().
db_data['cleaned_description'] = db_data['pitch_description'].replace({
    raw_description: outcome
    for outcome, raw_descriptions in {
        'ball': [
            'Ball',
            'Dirt Ball',
            'Pitchout',
            'Enforced Ball',
            'Intentional Walk',
        ],
        'called_strike': [
            'Strike Looking',
        ],
        'swinging_strike': [
            'Strike Swinging',
            'Foul Tip',
            'Strike Swinging - Adv 1st',
            'Strike Swinging - Adv 2nd',
            'Strike Swinging - Adv 3rd',
            'Strike Swinging - Out at 2nd',
        ],
        'foul_strike': [
            'Foul Ball',
        ],
        'hit_by_pitch': [
            'Hit By Pitch',
        ],
        'single': [
            'Single',
            'Single - Adv 2nd',
            'Single - Adv 3rd',
            'Single - Adv Home',
            'Single - Out at 2nd',
            'Single - Out at 3rd',
            'Single - Out at Home',
            'Single - Tagged out at 1st',
        ],
        'double': [
            'Double',
            'Double - Adv 3rd',
            'Double - Adv Home',
            'Double - Out at 3rd',
            'Double - Out at Home',
            'Double - Tagged out at 2nd',
        ],
        'triple': [
            'Triple',
            'Triple - Adv Home',
            'Triple - Out at Home',
        ],
        'home_run': [
            'Homerun',
        ],
        # Reached-on-error, fielder's choice and sacrifices are all scored as outs here.
        'out': [
            'Ground Out',
            'Pop Out',
            'Fly Out',
            'Line Out',
            'Fielders Choice',
            'Fielders Choice - Adv 2nd',
            'Fielders Choice - Adv 3rd',
            'Fielders Choice - Out at 2nd',
            'Fielders Choice - Out at 3rd',
            'Reached On Error',
            'Reached On Error - Adv 2nd',
            'Reached On Error - Adv 3rd',
            'Reached On Error - Adv Home',
            'Reached On Error - Out at 2nd',
            'Reached On Error - Out at 3rd',
            'Sacrifice Fly',
            'Sacrifice Fly - Adv 1st',
            'Sacrifice Fly - Adv 2nd',
            'Sacrifice Fly - Adv 3rd',
            'Sacrifice Bunt',
            'Sacrifice Bunt - Adv 1st',
            'Sacrifice Bunt - Adv 2nd',
            'Sacrifice Bunt - Adv 3rd',
            'Sacrifice Bunt - Out at 2nd',
        ],
    }.items()
    for raw_description in raw_descriptions
})

In [None]:
# Re-inspect the columns after adding `cleaned_description`.
db_data.columns.values

In [None]:
# 1 when the cleaned outcome is a called/swinging/foul strike or a ball put in play
# (out or any hit), 0 otherwise.
db_data['strike'] = (
    db_data['cleaned_description']
    .isin([
        'called_strike', 'swinging_strike', 'foul_strike',
        'out', 'single', 'double', 'triple', 'home_run',
    ])
    .astype(int)
)

In [None]:
# Cast the two batted-ball measurement columns to float so the numeric comparisons
# used to build `icr` work; assumes missing values convert to NaN -- TODO confirm
# the raw dtype returned by the query.
db_data[['launch_speed','launch_angle']] = db_data[['launch_speed','launch_angle']].astype('float')

In [None]:
def _icr_region_mask(launch_speed, launch_angle):
    """Return a boolean Series that is True where a batted ball falls in any `icr` region.

    Parameters
    ----------
    launch_speed, launch_angle : pandas.Series of float
        Aligned per-pitch measurements. Rows where either value is NaN fail every
        comparison below and therefore come back False.

    The six regions are bands of launch speed, each bounded by linear
    speed/angle constraints; a row qualifies if it satisfies all constraints
    of at least one region.
    """
    ev, la = launch_speed, launch_angle
    return (
        # launch_speed >= 95, launch_angle <= 30
        ((ev + la * 2.4 >= 98) & (ev - la >= 76) & (la <= 30) & (ev >= 95))
        # 86 <= launch_speed <= 95, launch_angle <= 20
        | ((ev + la * 2.4 >= 98) & (la <= 20) & (ev >= 86) & (ev <= 95))
        # 72 <= launch_speed <= 86
        | ((ev + la * 1.3 <= 112) & (ev + la * 1.55 >= 92) & (ev >= 72) & (ev <= 86))
        # 59 <= launch_speed <= 72, launch_angle <= 41
        | ((ev * 2 - la >= 87) & (ev * 2 + la <= 175) & (ev + la * 1.3 >= 89)
           & (ev >= 59) & (ev <= 72) & (la <= 41))
        # launch_speed >= 95, 0 <= launch_angle <= 52
        | ((ev * 1.5 - la >= 111) & (ev + la >= 119) & (ev >= 95) & (la >= 0) & (la <= 52))
        # launch_speed >= 98, 4 <= launch_angle <= 50
        | ((ev * 1.5 - la >= 117) & (ev + la >= 124) & (ev >= 98) & (la >= 4) & (la <= 50))
    )


# `icr` is 1 inside any region and 0 everywhere else, including pitches with no
# launch data. The former intermediate step that re-wrote 0 onto rows with launch
# data was a no-op (the column was already all zeros) and has been dropped.
db_data['icr'] = _icr_region_mask(db_data['launch_speed'], db_data['launch_angle']).astype(int)

In [None]:
# Sanity check: number of pitches flagged icr=1 versus icr=0.
db_data['icr'].value_counts()

In [None]:
# Any pitch flagged icr is also counted as a strike before the metrics are derived.
db_data.loc[db_data['icr'].eq(1), 'strike'] = 1

# Pitch-level metrics: plain difference, icr double-weighted, and icr per strike
# (NaN on pitches where strike is 0, since icr is then 0 as well).
db_data['str-icr'] = db_data['strike'] - db_data['icr']
db_data['wstr-icr'] = db_data['strike'] - 2 * db_data['icr']
db_data['icr/str'] = db_data['icr'] / db_data['strike']

In [None]:
# Distribution of the pitch-level str-icr values (strike minus icr).
db_data['str-icr'].value_counts()

In [None]:
#Jonah Pemstein
#Cronbach's alpha to measure reliability of different stats

#Import modules
# NOTE(review): `os` is already imported in the first cell; `random` and `timeit`
# are only imported here, so this cell must run before time()/calculate() are used.
import random
import timeit
import os
print("Modules all imported")

#Start time
# Reference timestamp; calculate() passes it to time() to report elapsed time.
stime = timeit.default_timer() #start time

#Define functions
def time(start, msg="   "):
    """Print `msg` followed by the time elapsed since `start` as HH:MM:SS.s.

    Parameters
    ----------
    start : float
        A previous reading of ``timeit.default_timer()``.
    msg : str
        Text printed before the elapsed time (separated by a single space).

    Returns
    -------
    None. The line is written to stdout.
    """
    elapsed = timeit.default_timer() - start
    # Round once, to whole tenths of a second, *before* splitting into fields so
    # the rounding can carry: 59.96 s prints as 00:01:00.0 rather than 00:00:60.0,
    # and 9.97 s prints as 10.0 rather than the mis-padded 010.0.
    tenths = int(round(elapsed * 10))
    hours, tenths = divmod(tenths, 36000)
    minutes, tenths = divmod(tenths, 600)
    print(msg, f"Time elapsed: {hours:02d}:{minutes:02d}:{tenths / 10:04.1f}")

def alpha(prepped):
    """Compute Cronbach's alpha for a table of sampled events.

    Parameters
    ----------
    prepped : pandas.DataFrame
        One column per subject (player-season) and one row per "test item"
        (sampled event), so ``prepped.shape[0]`` is the number of items k.

    Returns
    -------
    float
        ``k / (k - 1) * (1 - sum(item variances) / variance(total scores))``.

    Notes
    -----
    Both variances use the sample estimator (ddof=1). The earlier version took
    the total-score variance with ``np.std`` (ddof=0) and the item variances with
    pandas ``.std()`` (ddof=1); mixing the two inflated the ratio by n/(n-1) for
    n subjects, so even perfectly consistent items scored below 1.

    Raises ZeroDivisionError when there is a single item (k == 1).
    """
    k = prepped.shape[0]  # number of items (rows)
    # Variance, across subjects, of each subject's total score.
    total_score_var = prepped.sum(axis=0).var(ddof=1)
    # Variance of each item across subjects, summed over items.
    item_var_sum = prepped.var(axis=1, ddof=1).sum()
    return (k / (k - 1)) * (1 - item_var_sum / total_score_var)

def calculate(statlist, data, playeridtype, yearcolumn, denom_name, yearrange, playertype, path, maxdenom, increment, extradenom=[]):
    """Estimate Cronbach's alpha at increasing sample sizes and write the results to CSV.

    For every stat in `statlist` and every sample size, each player-season with at
    least that many events contributes a random sample (without replacement) of
    that size; alpha, the mean and SD of the per-player means, and the number of
    qualifying player-seasons are recorded.

    Parameters
    ----------
    statlist : list of str
        Columns of `data` to analyse.
    data : pandas.DataFrame
        One row per event.
    playeridtype, yearcolumn : str
        Columns identifying the player and the season.
    denom_name : str
        Name of the sample-size column in the output files.
    yearrange : iterable
        Seasons to consider; values are compared against `yearcolumn`.
    playertype : str
        Unused by this function.
    path : str
        Output prefix; writes ``<path>_alpha.csv``, ``_mean.csv``, ``_sd.csv``
        and ``_count.csv`` and is echoed in progress messages.
    maxdenom, increment : number
        Sample sizes are ``increment, 2*increment, ...`` for multipliers
        ``1 .. int(maxdenom/increment) - 1`` (so `maxdenom` itself is excluded
        when it is a multiple of `increment`).
    extradenom : list
        Additional sample sizes merged into the grid. Only read, never mutated,
        so the shared default list is safe.

    Notes
    -----
    Relies on the module-level `alpha`, `time` and `stime`. Sampling uses the
    global `random` state, so results vary between runs unless it is seeded.
    """
    # Grid of sample sizes to evaluate.
    statnum = [i * increment for i in range(1, int(maxdenom / increment))]
    statnum.extend(extradenom)
    statnum.sort()
    stat_dict = {denom_name: statnum}

    # Four independent result frames, each starting with the sample-size column.
    alpha_df, mean_df, sd_df, count_df = (pd.DataFrame(stat_dict) for _ in range(4))

    playerlist = list(data[playeridtype].unique())  # every player ID present

    # Group once instead of boolean-filtering the whole frame for every
    # player x year x stat combination. Row order inside each group is preserved.
    grouped = data.groupby([playeridtype, yearcolumn], sort=False)

    for stat in statlist:
        alpha_list, mean_list, sd_list, count_list = [], [], [], []

        events_by_key = {key: values.tolist() for key, values in grouped[stat]}
        # Same keys and insertion order as a nested player/year scan, with an empty
        # list for combinations that have no rows, so the random draws below happen
        # in the same sequence.
        nums_dict = {}
        for i in playerlist:
            for y in yearrange:
                nums_dict[str(i)+str(y)] = events_by_key.get((i, y), [])

        for samplesize in stat_dict[denom_name]:
            # One column per player-season that has enough events.
            prepped = pd.DataFrame({
                str(key): random.sample(events, samplesize)
                for key, events in nums_dict.items()
                if len(events) >= samplesize
            })

            if prepped.shape[1] >= 5:
                column_means = prepped.mean()
                alpha_list.append(alpha(prepped))
                mean_list.append(np.mean(column_means))
                sd_list.append(np.std(column_means))
                count_list.append(prepped.shape[1])
            else:
                # Too few qualifying player-seasons; larger sizes cannot do better.
                break

        # Trim the result frames to the sizes actually computed for this stat.
        # NOTE(review): the trim is cumulative across stats -- a later stat that
        # yields more rows than an earlier one fails on assignment, and one that
        # yields fewer truncates the earlier stat's rows. Unchanged from before.
        n_done = len(alpha_list)
        alpha_df, mean_df, sd_df, count_df = (
            frame.iloc[:n_done].copy() for frame in (alpha_df, mean_df, sd_df, count_df)
        )
        alpha_df[stat], mean_df[stat], sd_df[stat], count_df[stat] = (alpha_list, mean_list, sd_list, count_list)
        time(stime,msg="Completed "+stat+" for "+path+".")

    alpha_df.to_csv(path+"_alpha.csv",index=False)
    mean_df.to_csv(path+"_mean.csv",index=False)
    sd_df.to_csv(path+"_sd.csv",index=False)
    count_df.to_csv(path+"_count.csv",index=False)
    print("Completed", path)

# Confirms that the definitions cell ran to completion.
print("Functions all defined")

In [None]:
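# Kept commented out. Running it writes Str-ICR_alpha.csv, Str-ICR_mean.csv, Str-ICR_sd.csv
# and Str-ICR_count.csv; Str-ICR_alpha.csv is the file the stability plot reads.
# calculate(['str-icr','wstr-icr'], db_data, "pitchername", "year_played", "Pitches", range(2019, 2024), "Pitchers", "Str-ICR", 1600, 25, [5])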

In [None]:
# calculate(['icr'], db_data.sort_values(['pitch_id','game_played']).loc[(db_data['cleaned_description'].isin(['out','single','double','triple','home_run']))],
#           "pitchername", "year_played", "BBE", range(2019, 2024), "Pitchers", "ICR-BBE", 30, 5, [2,3,4])

In [None]:
# Alpha by sample size for wstr-icr, as written by calculate() to Str-ICR_alpha.csv.
stability_df = pd.read_csv('Str-ICR_alpha.csv')[['Pitches','wstr-icr']]

threshold = 0.7
# Smallest listed sample size whose alpha reaches the threshold. Fail with a clear
# message instead of an opaque IndexError when the curve never gets there.
reached = stability_df.loc[stability_df['wstr-icr'] >= threshold, 'Pitches']
if reached.empty:
    raise ValueError(f"'wstr-icr' alpha never reaches {threshold} in Str-ICR_alpha.csv")
sample_needed = float(reached.iloc[0])

fig, ax = plt.subplots(figsize=(6,4))
sns.lineplot(data=stability_df,
             x='Pitches',
             y='wstr-icr',
             linewidth=3,
             legend=False,
             ax=ax  # draw on this axes explicitly rather than relying on the current axes
            )
ax.set(xlim=(0,1.5*sample_needed), xlabel='Pitches',ylim=(0,0.8),ylabel='Alpha', title='Str-ICR Stabilizations (Per-Pitch)')

# Dashed guides from each axis to the threshold point. axvline/axhline take their
# extents as axes fractions, hence the division by the upper axis limits (the
# lower limits are 0). The vertical guide starts at y=0.15 to leave room for the label.
ax.axvline(sample_needed,
           ymin=0.15/ax.get_ylim()[1],
           ymax=threshold/ax.get_ylim()[1],
          color='w',
          linestyle='--')
ax.axhline(threshold,
           xmax=sample_needed/ax.get_xlim()[1],
          color='w',
          linestyle='--')

# Two stacked stars: a larger white one acts as an outline for the coloured one.
ax.plot(sample_needed,threshold,
        marker='*',markersize=25,
        color='w')
ax.plot(sample_needed,threshold,
        marker='*',markersize=15,
        color=sns.color_palette('vlag',n_colors=1000)[-1])
ax.text(sample_needed,0.075,
       f'{sample_needed:.0f}\nPitches',
       ha='center',va='center')
sns.despine()

In [None]:
fg_whip_df = pd.read_csv('WHIP Leaderboard.csv')

# The first digit after the decimal point of IP counts thirds of an inning
# (e.g. 180.1 -> 180 1/3). Convert numerically: taking the last *character* of the
# string form breaks when the column parses as integers (185 would add 5/3).
_ip_raw = fg_whip_df['IP'].astype('float')
_ip_whole = np.floor(_ip_raw)
fg_whip_df['IP'] = _ip_whole + _ip_raw.sub(_ip_whole).mul(10).round().div(3)

# Hits per batter faced, backed out of WHIP: (walks + hits) = IP * WHIP, minus the
# walks implied by TBF * BB%. Assumes BB% is stored as a fraction -- TODO confirm.
fg_whip_df['Hit%'] = fg_whip_df['IP'].mul(fg_whip_df['WHIP']).round(0).sub(fg_whip_df['TBF'].mul(fg_whip_df['BB%']).round(0)).div(fg_whip_df['TBF'])
fg_whip_df.sample(10)

In [None]:
# Order each pitcher's seasons chronologically so the following row is the candidate
# "next season".
fg_whip_df = fg_whip_df.sort_values(['MLBAMID','Season'])

# Next-season WHIP: keep the shifted value only when the following row belongs to
# the same pitcher and to the immediately following season; otherwise NaN.
# (Uses .where, so no NaN constant is needed -- the `np.NaN` alias was removed in NumPy 2.0.)
fg_whip_df['WHIP_y+1'] = fg_whip_df['WHIP'].shift(-1).where(
    fg_whip_df['MLBAMID'].eq(fg_whip_df['MLBAMID'].shift(-1)) &
    fg_whip_df['Season'].add(1).eq(fg_whip_df['Season'].shift(-1))
)

In [None]:
# One row per pitcher-season: pitch count plus the mean of each pitch-level metric.
# NOTE(review): a `.query('pitch_id>=750')` filter used to sit before reset_index()
# and is disabled, although later chart titles mention a 750-pitch minimum -- confirm.
season_df = (
    db_data
    .groupby(['year_played', 'pitcher_mlb_id', 'pitchername'])
    .agg({
        'pitch_id': 'count',
        **dict.fromkeys(['strike', 'icr', 'str-icr', 'wstr-icr', 'icr/str'], 'mean'),
    })
    .reset_index()
)
season_df.sample(10)

In [None]:
# Attach TBF and WHIP from the leaderboard to each pitcher-season; the inner join
# keeps only seasons present in both tables.
regress_df = season_df[
    ['year_played', 'pitcher_mlb_id', 'pitchername', 'pitch_id', 'strike', 'icr']
].merge(
    fg_whip_df[['Season', 'MLBAMID', 'TBF', 'WHIP']],
    how='inner',
    left_on=['year_played', 'pitcher_mlb_id'],
    right_on=['Season', 'MLBAMID'],
)

# Season-level metrics rebuilt from the season-level strike and icr rates.
regress_df['str-icr'] = regress_df['strike'] - regress_df['icr']
regress_df['wstr-icr'] = regress_df['strike'] - 2 * regress_df['icr']
regress_df.sample(10)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

target='WHIP'
feats=['strike','icr']

# Repeat every pitcher-season once per batter faced so seasons with more batters
# carry proportionally more weight in the fit.
test_data = regress_df.loc[regress_df.index.repeat(regress_df.TBF)].reset_index(drop=True)
# Rows with a missing feature or target are dropped; nothing is left to impute.
_model_data = test_data[feats+[target]].dropna()

# A fixed random_state makes the shuffled folds, and therefore the coefficients
# reported below, reproducible across kernel restarts.
kf = KFold(10, shuffle=True, random_state=0)
# NOTE(review): no fold is scored and only the model from the final fold survives
# the loop -- consider a single fit on all rows, or evaluating each held-out fold.
for train_idx, _holdout_idx in kf.split(_model_data):
    train_data = _model_data.iloc[train_idx]

    model = LinearRegression()
    model.fit(train_data.loc[:,feats],
              train_data.loc[:,target])

list(model.coef_), list(model.feature_names_in_)

In [None]:
# Intercept of the most recently fitted `model`.
print(model.intercept_)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

target='WHIP'
feats=['wstr-icr']

# Repeat every pitcher-season once per pitch thrown so seasons with more pitches
# carry proportionally more weight in the fit.
test_data = regress_df.loc[regress_df.index.repeat(regress_df.pitch_id)].reset_index(drop=True)
# Rows with a missing feature or target are dropped; nothing is left to impute.
_model_data = test_data[feats+[target]].dropna()

# A fixed random_state makes the shuffled folds, and therefore the coefficient and
# intercept reused by later cells, reproducible across kernel restarts.
kf = KFold(10, shuffle=True, random_state=0)
# NOTE(review): no fold is scored and only the model from the final fold survives
# the loop -- consider a single fit on all rows, or evaluating each held-out fold.
for train_idx, _holdout_idx in kf.split(_model_data):
    train_data = _model_data.iloc[train_idx]

    model = LinearRegression()
    model.fit(train_data.loc[:,feats],
              train_data.loc[:,target])

list(model.coef_), list(model.feature_names_in_)

In [None]:
# Reciprocal of the first fitted coefficient of the current `model`; dividing a
# feature value by this is the same as multiplying it by the coefficient.
weighted_val = 1/model.coef_[0]
weighted_val

In [None]:
# Intercept of the current `model`, kept under its own name for the prediction step.
whip_intercept = model.intercept_
whip_intercept

In [None]:
# Linear prediction of WHIP from wstr-icr: intercept + wstr-icr / weighted_val.
regress_df['WHIP_pred'] = whip_intercept + regress_df['wstr-icr'].div(weighted_val)
regress_df = regress_df.sort_values(['pitcher_mlb_id','year_played'])

# True where the following row is the same pitcher in the immediately following
# season. Computed once because it does not depend on the stat being shifted.
_has_next_season = (
    regress_df['pitcher_mlb_id'].eq(regress_df['pitcher_mlb_id'].shift(-1)) &
    regress_df['year_played'].add(1).eq(regress_df['year_played'].shift(-1))
)

# Next-season value of each stat, NaN where there is no consecutive season.
# (Uses .where, so no NaN constant is needed -- the `np.NaN` alias was removed in NumPy 2.0.)
for stat in ['wstr-icr','WHIP','WHIP_pred']:
    regress_df[f'{stat}_y+1'] = regress_df[f'{stat}'].shift(-1).where(_has_next_season)

In [None]:
def r2_calc(x, y):
    """Return the squared Pearson correlation coefficient between `x` and `y`."""
    pearson_r, _p_value = stats.pearsonr(x, y)
    return pearson_r ** 2

In [None]:
# Same-season comparison: observed WHIP against the WHIP predicted from wstr-icr.
fig, ax = plt.subplots(figsize=(4,4))
sns.regplot(
    data=regress_df,
    x='WHIP',
    y='WHIP_pred',
    ax=ax,
    scatter_kws={'edgecolor': 'w', 'linewidth': 0.5},
    line_kws={'color': 'w', 'linestyle': '--'},
)

# Fit summary shown on the chart.
r2_val = r2_calc(regress_df['WHIP'], regress_df['WHIP_pred'])
mae_val = (regress_df['WHIP'] - regress_df['WHIP_pred']).abs().mean()
ax.text(0.6, 1.7, f'r^2 = {r2_val:.2f}\nMAE = {mae_val:.2f}', size=12)

# Both axes share the observed-WHIP range so the square plot compares like with like.
ax.set(
    xlim=(regress_df['WHIP'].min()-0.1, regress_df['WHIP'].max()+0.1),
    ylim=(regress_df['WHIP'].min()-0.1, regress_df['WHIP'].max()+0.1),
    aspect=1,
)
fig.suptitle('WHIP vs Str-ICR WHIP\n(Min 750 pitches & 60 IP)')
sns.despine()

In [None]:
# Year-over-year comparison: WHIP against the same pitcher's next-season WHIP.
fig, ax = plt.subplots(figsize=(4,4))
chart_df = fg_whip_df.dropna(subset=['WHIP_y+1']).copy()
sns.regplot(
    data=chart_df,
    x='WHIP',
    y='WHIP_y+1',
    ax=ax,
    scatter_kws={'edgecolor': 'w', 'linewidth': 0.5},
    line_kws={'color': 'w', 'linestyle': '--'},
)

# Fit summary shown on the chart.
r2_val = r2_calc(chart_df['WHIP'], chart_df['WHIP_y+1'])
mae_val = (chart_df['WHIP'] - chart_df['WHIP_y+1']).abs().mean()
ax.text(1.4, 0.5, f'r^2 = {r2_val:.2f}\nMAE = {mae_val:.2f}', size=12)

# Both axes use the next-season range so the plot stays square.
ax.set(
    xlim=(chart_df['WHIP_y+1'].min()-0.2, chart_df['WHIP_y+1'].max()+0.2),
    ylim=(chart_df['WHIP_y+1'].min()-0.2, chart_df['WHIP_y+1'].max()+0.2),
    aspect=1,
)
fig.suptitle('WHIP vs Next Year WHIP\n(Min 60 IP both seasons)')
sns.despine()

In [None]:
# Year-over-year comparison: season wstr-icr against next-season wstr-icr.
fig, ax = plt.subplots(figsize=(4,4))
chart_df = regress_df.dropna(subset=['wstr-icr_y+1']).copy()
sns.regplot(data=chart_df,
            x='wstr-icr',
            y='wstr-icr_y+1',
            ax=ax,
            scatter_kws=dict(edgecolor='w',linewidth=0.5),
            line_kws=dict(color='w', linestyle='--'))
r2_val = r2_calc(chart_df['wstr-icr'],chart_df['wstr-icr_y+1'])
mae_val = chart_df['wstr-icr'].sub(chart_df['wstr-icr_y+1']).abs().mean()
# Anchor the annotation in axes coordinates (top-left corner). The season-level
# wstr-icr is a strike rate minus twice an icr rate, so it cannot exceed 1 and the
# y-limit below tops out at max + 0.15; a data-coordinate anchor of y=1.4 therefore
# always fell outside the plotted area.
ax.text(0.05,0.95,f'r^2 = {r2_val:.2f}\nMAE = {mae_val:.2f}',size=12,
        ha='left',va='top',transform=ax.transAxes)
ax.set(xlim=(chart_df['wstr-icr'].min()-0.15,chart_df['wstr-icr'].max()+0.15),
       ylim=(chart_df['wstr-icr'].min()-0.15,chart_df['wstr-icr'].max()+0.15),
       aspect=1)
fig.suptitle('Str-ICR WHIP vs Next Year Str-ICR WHIP\n(Min 750 pitches both seasons)')
sns.despine()

In [None]:
# Predictive comparison: wstr-icr-based WHIP prediction against next-season WHIP.
fig, ax = plt.subplots(figsize=(4,4))
chart_df = regress_df.dropna(subset=['WHIP_y+1']).copy()
sns.regplot(
    data=chart_df,
    x='WHIP_pred',
    y='WHIP_y+1',
    ax=ax,
    scatter_kws={'edgecolor': 'w', 'linewidth': 0.5},
    line_kws={'color': 'w', 'linestyle': '--'},
)

# Fit summary shown on the chart.
r2_val = r2_calc(chart_df['WHIP_pred'], chart_df['WHIP_y+1'])
mae_val = (chart_df['WHIP_pred'] - chart_df['WHIP_y+1']).abs().mean()
ax.text(1.4, 0.5, f'r^2 = {r2_val:.2f}\nMAE = {mae_val:.2f}', size=12)

# Both axes use the next-season WHIP range so the plot stays square.
ax.set(
    xlim=(chart_df['WHIP_y+1'].min()-0.2, chart_df['WHIP_y+1'].max()+0.2),
    ylim=(chart_df['WHIP_y+1'].min()-0.2, chart_df['WHIP_y+1'].max()+0.2),
    aspect=1,
)
fig.suptitle('Str-ICR WHIP vs Next-Season WHIP\n(Min 750 pitches & 60 IP)')
sns.despine()

In [None]:
# Per pitcher / pitch type table for seasons from 2023 on: pitch count plus the mean
# of each pitch-level metric, limited to pitch types thrown at least 200 times and
# ranked by str-icr (highest first). Displayed only; nothing is saved.
(
    db_data
    .loc[db_data['year_played'] >= 2023]
    .groupby(['year_played', 'pitcher_mlb_id', 'pitchername', 'pitchtype'])
    .agg({
        'pitch_id': 'count',
        **dict.fromkeys(['strike', 'icr', 'str-icr', 'wstr-icr', 'icr/str'], 'mean'),
    })
    .query('pitch_id>=200')
    .sort_values('str-icr', ascending=False)
    .reset_index()
)

In [None]:
def shaded_kdeplot(df,stat,quantiles=[0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1],title='',sort_order='ascending'):
    """Draw a KDE of ``df[stat]`` with the area between quantiles shaded in 'vlag' colours.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    stat : str
        Column of `df` whose distribution is drawn.
    quantiles : list of float
        Quantile break points (any order). The list is not modified.
    title : str
        Figure title.
    sort_order : str
        'ascending' uses the reversed palette ('vlag_r'); any other value uses 'vlag'.

    Returns
    -------
    None. The figure is created on a new matplotlib figure and left for display.
    """
    # Work on a sorted copy, highest quantile first. Sorting in place would reorder
    # the caller's list and the shared default list.
    quantiles = sorted(quantiles, reverse=True)

    fig, ax = plt.subplots(figsize=(8,3))

    # Single colour for the KDE outline, the median line and the label box.
    outline_color = 'k'
    # Half-width of the x-axis, applied around the mean when the limits are set.
    # NOTE(review): 50 looks like an assumed midpoint for a stat on a 0-100 scale -- confirm.
    xlim_adj = df[stat].sub(50).abs().max() * 1.05

    # Outline of the density; cut=0 stops the curve at the data min/max.
    sns.kdeplot(df[stat], ax=ax, color=outline_color, legend=False, cut=0)

    # Curve coordinates, reused for the fills and the end lines.
    x = ax.lines[-1].get_xdata()
    y = ax.lines[-1].get_ydata()

    # One colour per band, evenly spaced across the palette.
    palette = 'vlag'
    n_colors = (len(quantiles)-2)*1000+1
    quant_colors = sns.color_palette(f'{palette}_r' if sort_order=='ascending' else palette,
                                     n_colors=n_colors)[::1000]

    # Paint from the widest band down: each fill covers everything left of its
    # threshold, so later (lower) bands overpaint the earlier ones.
    for quant in range(len(quantiles)-1):
        thresh = df[stat].max()+1 if quant==0 else df[stat].quantile(quantiles[quant])
        ax.fill_between(x, 0, y,
                        where=x < thresh,
                        color=quant_colors[quant],
                        alpha=1)

    # Close the shape with vertical lines at the data minimum and maximum.
    for end_quantile in (0, 1):
        end_val = df[stat].quantile(end_quantile)
        ax.vlines(end_val,
                  0,
                  np.interp(end_val, x, y),
                  linestyle='-', color=outline_color, alpha=1, linewidth=2)

    # Median: dashed line plus a boxed label showing the value to one decimal.
    val = df[stat].quantile(0.5)
    ax.axvline(val,
               ymax=0.9,
               linestyle='--',
               color=outline_color,
               linewidth=2)
    props = dict(boxstyle='Round',
                 facecolor='w',
                 alpha=1,
                 edgecolor=outline_color,
                 linewidth=2)
    y_max = ax.get_ylim()[1]
    ax.text(val,
            y_max*1.15,
            f'{val:.1f}',
            ha='center',
            va='top',
            color=outline_color,
            fontsize=16,
            fontweight='bold',
            bbox=props)

    # Symmetric x-range around the mean; extra headroom on y for the median label.
    ax.set(xlim=(df[stat].mean()-xlim_adj,df[stat].mean()+xlim_adj),
           ylim=(0,y_max*1.3),
           xlabel=None,
           ylabel=None,
          )
    ax.set_yticklabels([])
    ax.tick_params(left=False)

    fig.suptitle(title, size=14)

    sns.despine(left=True,bottom=True)

In [None]:
# Recompute the pitch-level wstr-icr multiplied by 100 for the distribution plots.
# This overwrites the unscaled column created earlier in the notebook.
db_data['wstr-icr'] = db_data['strike'].sub(db_data['icr'].mul(2)).mul(100)

In [None]:
# Minimum pitches in a game for a pitcher's appearance to be included.
pitch_thresh = 50

# One row per pitcher per game (2020 onwards): pitch count and mean of each metric.
graph_data = db_data.loc[db_data['year_played']>=2020].groupby(['game_played','pitchername'])[['pitch_id','strike','icr','str-icr','wstr-icr']].agg({
    'pitch_id':'count',
    'str-icr':'mean',
    'strike':'mean',
    'icr':'mean',
    'wstr-icr':'mean'
}).query(f'pitch_id >={pitch_thresh}')

# Title typo fixed ("Weidghted" -> "Weighted").
shaded_kdeplot(graph_data,
               'wstr-icr',
               title=f'Weighted Strike-ICR, per Appearance\n(2020-2023; min {pitch_thresh} pitches)')

In [None]:
# Rank the aggregated rows from lowest to highest wstr-icr.
graph_data.sort_values('wstr-icr')

In [None]:
# Minimum pitches in a season for a pitcher-season to be included.
pitch_thresh = 1000

# One row per pitcher per season (2020 onwards): pitch count and mean of each metric.
graph_data = (
    db_data
    .loc[db_data['year_played'] >= 2020]
    .groupby(['year_played', 'pitchername'])
    .agg({
        'pitch_id': 'count',
        'str-icr': 'mean',
        'wstr-icr': 'mean',
    })
    .query(f'pitch_id >={pitch_thresh}')
)

shaded_kdeplot(
    graph_data,
    'wstr-icr',
    title=f'Weighted Strike-ICR, per Season\n(2020-2023; min {pitch_thresh} pitches)',
)

In [None]:
# Rank the aggregated rows from lowest to highest wstr-icr.
graph_data.sort_values('wstr-icr')