In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import sys

# reference main directory in existing folder
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

import pandas as pd 
import sys
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from src.paths import RAW_DATA_DIR, CLEANED_DATA_DIR, QUADRANT_DATA_DIR
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# import z score library
from scipy.stats import zscore

# show all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# import data
#savant = pd.read_csv(RAW_DATA_DIR / 'savant_pitch_level.csv')
#data_dictionary = pd.read_excel(RAW_DATA_DIR / 'data_dictionary.xlsx')
#fangraph = pd.read_csv(RAW_DATA_DIR / 'fangraphs_season_level.csv')
fangraph = pd.read_csv(CLEANED_DATA_DIR / 'fangraphs_engineered_merged.csv')

# Import PCA quadrant data
q1_all_pitchers = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant1_SP_RP_EQUAL.xlsx')
q2_all_pitchers = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant2_SP_MAJORITY.xlsx')
q3_all_pitchers = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant3_RP_MAJORITY.xlsx')
q4_all_pitchers = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant4_RP_MAJORITY.xlsx')

# Rename columns to 'Stats'
q1_all_pitchers.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)
q2_all_pitchers.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)
q3_all_pitchers.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)
q4_all_pitchers.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)


## Utilizing Z-Scores for Pitcher Categorization
Z-Scores are a great way to normalize these metrics, allowing us to compare pitchers across different metrics effectively. By transforming data into a uniform scale, Z-scores enable a fair assessment of a pitcher's performance relative to the average, regardless of the specific metric. This normalization is crucial because it removes the inherent biases and variances that exist in raw statistical data. Additionally, Z-scores facilitate the identification of exceptional performances, both positive and negative, by highlighting how far an individual's performance deviates from the norm. Finally, this method simplifies the process of combining multiple metrics into a single performance score, ensuring that each metric contributes equally to the final evaluation without any one metric dominating due to scale differences.

In [3]:
# define starting pitchers from Throws column
starting_pitchers = fangraph[fangraph['Role'] == 'SP']
relief_pitchers = fangraph[fangraph['Role'] == 'RP']

In [4]:
starting_pitchers.shape

(1143, 326)

In [5]:
# Import feature loadings
q1 = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant1_SP_RP_EQUAL.xlsx')
q2 = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant2_SP_MAJORITY.xlsx')
q3 = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant3_RP_MAJORITY.xlsx')
q4 = pd.read_excel(QUADRANT_DATA_DIR / 'quadrant4_RP_MAJORITY.xlsx')

# Rename columns to 'Stats'
q1.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)
q2.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)
q3.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)
q4.rename(columns={'Unnamed: 0': 'Stats'}, inplace=True)

In [6]:
import pandas as pd
from scipy.stats import zscore
from src.const import quadrant1_weights, quadrant2_weights, quadrant3_weights, quadrant4_weights

def categorize_pitchers(pitchers_df, pitcher_type, weights):
    """
    Categorizes pitchers into elite, strong, average, and suboptimal groups based on weighted z-scores of given metrics.
    
    Parameters:
    - pitchers_df (pd.DataFrame): DataFrame containing pitcher data.
    - pitcher_type (str): Type of pitcher ('SP' for starting pitchers or 'RP' for relief pitchers).
    - weights (dict): Dictionary of weights for different metrics.

    Returns:
    - Tuple of DataFrames: (elite, strong, average, suboptimal)
    """
    
    # Validate pitcher type
    pitcher_type = pitcher_type.upper()
    if pitcher_type not in ['SP', 'RP']:
        raise ValueError("pitcher_type must be 'SP' or 'RP'")

    # Check if the DataFrame is empty
    if pitchers_df.empty:
        raise ValueError("The provided DataFrame is empty.")
    
    # Check if the 'IP' column exists
    if 'IP' not in pitchers_df.columns:
        raise ValueError("'IP' column not found in DataFrame.")

    # Filter out pitchers below the mean IP
    mean_ip = pitchers_df['IP'].mean()
    pitchers_df = pitchers_df[pitchers_df['IP'] >= mean_ip]

    # Prepare for z-score calculation
    metric_cols = [col for col in weights.keys() if col in pitchers_df.columns]
    missing_cols = set(weights.keys()) - set(metric_cols)
    for col in missing_cols:
        print(f"Warning: Column {col} not found in DataFrame.")

    # Standardize the metric columns
    standard_scaler = StandardScaler()
    pitchers_df[metric_cols] = standard_scaler.fit_transform(pitchers_df[metric_cols])

    # Calculate z-scores for existing metrics
    z_scores = pitchers_df[metric_cols].apply(zscore)

    # Calculate weighted scores
    for col in metric_cols:
        z_scores[col] *= weights[col]
    pitchers_df['z_total_score'] = z_scores.sum(axis=1)

    # Define thresholds for categorization
    elite_threshold = pitchers_df['z_total_score'].quantile(.70)
    strong_threshold = pitchers_df['z_total_score'].quantile(.40)
    average_threshold = pitchers_df['z_total_score'].quantile(.10)

    # Categorize pitchers
    elite = pitchers_df[pitchers_df['z_total_score'] >= elite_threshold]
    strong = pitchers_df[(pitchers_df['z_total_score'] < elite_threshold) & (pitchers_df['z_total_score'] >= strong_threshold)]
    average = pitchers_df[(pitchers_df['z_total_score'] < strong_threshold) & (pitchers_df['z_total_score'] >= average_threshold)]
    suboptimal = pitchers_df[pitchers_df['z_total_score'] < average_threshold]

    return elite, strong, average, suboptimal

concatenated_weights = {**quadrant1_weights, **quadrant2_weights, **quadrant3_weights, **quadrant4_weights}


elite_sp, strong_sp, average_sp, suboptimal_sp = categorize_pitchers(starting_pitchers, 'SP', concatenated_weights)
elite_rp, strong_rp, average_rp, suboptimal_rp = categorize_pitchers(relief_pitchers, 'RP', concatenated_weights)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitchers_df[metric_cols] = standard_scaler.fit_transform(pitchers_df[metric_cols])
  pitchers_df['z_total_score'] = z_scores.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitchers_df['z_total_score'] = z_scores.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitchers_df[metr

In [7]:
"""# sort by year and total score
elite_sp = elite_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
strong_sp = strong_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
average_sp = average_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
suboptimal_sp = suboptimal_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])

elite_rp = elite_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
strong_rp = strong_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
average_rp = average_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
suboptimal_rp = suboptimal_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])

# Create a unique id for each pitcher
elite_sp['id'] = elite_sp['Name'] + elite_sp['Season'].astype(str)
strong_sp['id'] = strong_sp['Name'] + strong_sp['Season'].astype(str)
average_sp['id'] = average_sp['Name'] + average_sp['Season'].astype(str)
suboptimal_sp['id'] = suboptimal_sp['Name'] + suboptimal_sp['Season'].astype(str)

elite_rp['id'] = elite_rp['Name'] + elite_rp['Season'].astype(str)
strong_rp['id'] = strong_rp['Name'] + strong_rp['Season'].astype(str)
average_rp['id'] = average_rp['Name'] + average_rp['Season'].astype(str)
suboptimal_rp['id'] = suboptimal_rp['Name'] + suboptimal_rp['Season'].astype(str)
"""


"# sort by year and total score\nelite_sp = elite_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\nstrong_sp = strong_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\naverage_sp = average_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\nsuboptimal_sp = suboptimal_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\n\nelite_rp = elite_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\nstrong_rp = strong_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\naverage_rp = average_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\nsuboptimal_rp = suboptimal_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])\n\n# Create a unique id for each pitcher\nelite_sp['id'] = elite_sp['Name'] + elite_sp['Season'].astype(str)\nstrong_sp['id'] = strong_sp['Name'] + strong_sp['Season'].astype(str)\naverage_sp['id'] = average_sp['Name'] + average_s

In [8]:
fangraph['id'] = fangraph['Name'] + fangraph['Season'].astype(str)

In [9]:
fangraph['Classification'] = 'unclassified'
fangraph['z_total_score'] = float('nan')

In [10]:
fangraph.head()

Unnamed: 0,PlayerId,pitcher,Name,NameASCII,Throws,Season,Age,Team,Role,G,GS,IP,TBF,W,L,CG,ShO,SV,BS,HLD,SD,MD,Pulls,ERA,R,ER,H,HR,SO,BB,IBB,HBP,WP,BK,Events,GB,LD,FB,IFFB,BU,IFH,BUH,Balls,Strikes,Pitches,RS,RS_per_9,K_pct,BB_pct,K_minus_BB_pct,K_per_9,BB_per_9,K_to_BB,H_per_9,HR_per_9,AVG,WHIP,BABIP,LOB_pct,GB_pct,LD_pct,FB_pct,IFFB_pct,GB_to_FB,HR_to_FB,IFH_pct,BUH_pct,FIP,xFIP,SIERA,tERA,kwERA,E_minus_F,RAR,WAR,Dollars,WPA,RE24,REW,pLI,inLI,gmLI,exLI,WPA_to_LI,Clutch,OSwing_pct,ZSwing_pct,Swing_pct,OContact_pct,ZContact_pct,Contact_pct,Zone_pct,FStrike_pct,SwStr_pct,CStr_pct,CSW_pct,ERA_minus,FIP_minus,xFIP_minus,Pace,RA9_WAR,BIP_Wins,LOB_Wins,FDP_Wins,Pull_pct,Cent_pct,Oppo_pct,Soft_pct,Med_pct,Hard_pct,TTO_pct,FRM,K_pct_plus,BB_pct_plus,K_per_9_plus,BB_per_9_plus,K_to_BB_plus,H_per_9_plus,HR_per_9_plus,AVG_plus,WHIP_plus,BABIP_plus,LOB_pct_plus,GB_pct_plus,LD_pct_plus,FB_pct_plus,HR_to_FB_pct_plus,Pull_pct_plus,Cent_pct_plus,Oppo_pct_plus,Med_pct_plus,Hard_pct_plus,EV,LA,Barrels,Barrel_pct,maxEV,HardHit,HardHit_pct,FA_pct,FAv,SL_pct,SLv,CT_pct,CTv,CB_pct,CBv,CH_pct,CHv,SF_pct,SFv,KN_pct,KNv,XX_pct,PO_pct,wFB,wSL,wCT,wCB,wCH,wSF,wKN,wFB_per_c,wSL_per_c,wCT_per_c,wCB_per_c,wCH_per_c,wSF_per_c,wKN_per_c,FA_pct_sc,FC_pct_sc,FS_pct_sc,FO_pct_sc,SI_pct_sc,SL_pct_sc,CU_pct_sc,KC_pct_sc,EP_pct_sc,CH_pct_sc,SC_pct_sc,KN_pct_sc,UN_pct_sc,vFA_sc,vFC_sc,vFS_sc,vFO_sc,vSI_sc,vSL_sc,vCU_sc,vKC_sc,vEP_sc,vCH_sc,vSC_sc,vKN_sc,FA_X_sc,FC_X_sc,FS_X_sc,FO_X_sc,SI_X_sc,SL_X_sc,CU_X_sc,KC_X_sc,EP_X_sc,CH_X_sc,SC_X_sc,KN_X_sc,FA_Z_sc,FC_Z_sc,FS_Z_sc,FO_Z_sc,SI_Z_sc,SL_Z_sc,CU_Z_sc,KC_Z_sc,EP_Z_sc,CH_Z_sc,SC_Z_sc,KN_Z_sc,wFA_sc,wFC_sc,wFS_sc,wFO_sc,wSI_sc,wSL_sc,wCU_sc,wKC_sc,wEP_sc,wCH_sc,wSC_sc,wKN_sc,wFA_per_c_sc,wFC_per_c_sc,wFS_per_c_sc,wFO_per_c_sc,wSI_per_c_sc,wSL_per_c_sc,wCU_per_c_sc,wKC_per_c_sc,wEP_per_c_sc,wCH_per_c_sc,wSC_per_c_sc,wKN_per_c_sc,OSwing_pct_sc,ZSwing_pct_sc,Swing_pct_sc,OContact_pct_sc,ZContact_pct_sc,Contact_pct_sc,Zone_pct_sc,botERA,botOvr_CH,botStf_CH,botCmd_CH,botOvr_CU,botStf_CU,botCmd_CU,botOvr_FA,botStf_FA,botCmd_FA,botOvr_SI,botStf_SI,botCmd_SI,botOvr_SL,botStf_SL,botCmd_SL,botOvr_KC,botStf_KC,botCmd_KC,botOvr_FC,botStf_FC,botCmd_FC,botOvr_FS,botStf_FS,botCmd_FS,botOvr,botStf,botCmd,botxRV100,Stf_plus_CH,Loc_plus_CH,Pit_plus_CH,Stf_plus_CU,Loc_plus_CU,Pit_plus_CU,Stf_plus_FA,Loc_plus_FA,Pit_plus_FA,Stf_plus_SI,Loc_plus_SI,Pit_plus_SI,Stf_plus_SL,Loc_plus_SL,Pit_plus_SL,Stf_plus_KC,Loc_plus_KC,Pit_plus_KC,Stf_plus_FC,Loc_plus_FC,Pit_plus_FC,Stf_plus_FS,Loc_plus_FS,Pit_plus_FS,Stf_plus_FO,Loc_plus_FO,Pit_plus_FO,Stuff_plus,Location_plus,Pitching_plus,player_name,most_common_inning_SP,most_common_inning_RP,average_batters_faced_per_appearance_SP,average_innings_pitched_per_appearance_SP,average_batters_faced_per_appearance_RP,average_innings_pitched_per_appearance_RP,average_inning_SP,average_inning_RP,avg_inning_entry_SP,avg_inning_exit_SP,avg_inning_entry_RP,avg_inning_exit_RP,bad_walk_rate_SP,leadoff_walk_rate_SP,bad_walk_rate_RP,leadoff_walk_rate_RP,classified_role,id,Classification,z_total_score
0,18,491703,NeftalA­ Feliz,Neftali Feliz,R,2021,33,- - -,RP,5,0,4.0,19,0,1,0,0,0,1,0,0,2,4,9.000002,4,4,5,1,3,1,0,1,0,0,14,5,4,5,1,0,0,0,20,47,67,0,0.0,0.1579,0.0526,0.1053,6.750002,2.250001,3.0,11.250003,2.250001,0.294118,1.5,0.307692,0.5357,0.357143,0.285714,0.357143,0.2,1.0,0.2,0.0,0.0,6.420031,5.375481,4.519107,7.421088,4.889752,2.579971,-1.885527,-0.17499,-1.39992,-0.875182,-4.499,-0.420531,1.072752,0.60916,1.06128,0.26395,-0.137091,-0.678738,0.5,0.7619,0.5821,0.7391,0.9375,0.8205,0.3134,0.6842,0.1045,0.1194,0.2239,213.324493,148.609662,126.817213,25.587,-0.395077,-0.001708,-0.218379,-0.220087,0.4286,0.4286,0.1429,0.2143,0.5,0.2857,0.2632,0.0,67.733131,59.330599,75.367393,66.017851,76.108133,136.696882,183.26212,123.424821,115.998542,107.010757,74.114072,82.038249,137.175994,100.214061,145.942786,107.747591,121.632996,57.166409,97.430746,88.985942,84.847499,15.615007,3,0.2143,108.634,4,0.2857,0.7015,95.0851,0.0746,82.8,0.1045,85.7143,,,0.1194,87.0,,,,,,,-0.57793,-0.395471,0.447244,,-1.042168,,,-1.229638,-7.90942,6.3892,,-13.0271,,,0.7015,,,,,0.1791,,,,0.1194,,,,94.989364,,,,,84.35,,,,86.887501,,,-7.174255,,,,,2.258333,,,,-8.9075,,,9.762127,,,,,2.4075,,,,5.435,,,-0.594629,,,,,0.041559,,,,-1.037477,,,-1.265168,,,,,0.346322,,,,-12.968469,,,0.375,0.7714,0.5821,0.8333,0.8148,0.8205,0.5224,5.299472,65.310105,45.922311,66.765125,,,,37.391871,49.884001,34.194912,,,,50.821306,43.923645,53.032684,,,,,,,,,,41.998539,46.695011,43.20721,0.690299,83.1125,116.225,115.0375,,,,106.865957,90.140426,90.834043,,,,108.466667,100.575,102.833333,,,,,,,,,,,,,104.316418,95.123881,95.873134,Neftali Feliz,,7.0,0.0,0.0,3.8,1.0,,7.19403,,,7.2,7.2,,,0.0,0.0,Setup Pitcher,NeftalA­ Feliz2021,unclassified,
1,1157,488984,Tommy Hunter,Tommy Hunter,R,2021,34,NYM,RP,3,0,6.0,26,0,0,0,0,0,0,0,1,0,3,0.0,0,0,4,0,5,3,0,1,1,0,17,7,5,4,1,1,0,0,34,58,92,4,6.0,0.1923,0.1154,0.0769,7.5,4.5,1.6667,6.0,0.0,0.181818,1.166667,0.235294,1.0,0.4375,0.3125,0.25,0.25,1.75,0.0,0.0,0.0,3.503364,4.679604,4.692691,4.712296,5.229833,-3.503364,0.56693,0.057611,0.460886,0.200408,2.882,0.303443,0.715007,0.599233,0.602467,0.720867,0.141754,0.138534,0.3269,0.675,0.4783,0.7647,0.963,0.8864,0.4348,0.6923,0.0543,0.1413,0.1957,0.0,83.446871,110.270416,22.4127,0.276844,0.077289,0.141945,0.219233,0.5294,0.1765,0.2941,0.1765,0.5882,0.2353,0.3077,0.045,82.49548,130.070928,83.741526,132.035668,63.423444,72.904985,0.0,76.29898,90.221066,81.831755,138.346268,100.496855,150.036244,70.149843,0.0,133.099965,50.084175,117.695548,114.624408,73.28254,87.655624,10.628437,1,0.0588,106.499,3,0.1765,0.413,92.3947,,,0.4239,89.7692,0.163,83.2,,,,,,,,,1.43093,,-0.135306,-0.259368,,,,3.765605,,-0.346938,-1.72912,,,,0.0652,0.4239,,,0.3478,,0.163,,,,,,,93.116669,89.712822,,,92.159378,,83.166667,,,,,,-4.543333,1.561282,,,-7.43375,,6.086667,,,,,,9.463333,6.312051,,,7.821875,,-2.172,,,,,,0.487117,-0.133101,,,0.912673,,-0.255797,,,,,,8.118621,-0.341285,,,2.852103,,-1.705311,,,,,,0.2857,0.6977,0.4783,0.7143,0.9667,0.8864,0.4674,4.86869,,,,37.529548,52.418043,28.792931,80.0,28.134397,80.0,44.474413,32.160524,57.734526,,,,,,,44.556693,51.760282,29.303649,,,,46.471041,42.106337,42.211218,0.403113,,,,98.68,80.52,83.893333,76.933333,118.766667,117.133333,66.08125,90.26875,87.01875,,,,,,,100.992308,101.076923,103.1,,,,,,,86.285556,94.884444,95.112222,Tommy Hunter,1.0,6.0,7.0,2.0,8.0,2.0,1.470588,5.021739,1.0,2.0,4.333333,5.333333,0.0,0.0,0.0,0.0,Middle Reliever,Tommy Hunter2021,unclassified,
2,1246,456713,Matt Bush,Matt Bush,R,2021,35,TEX,RP,4,0,4.0,17,0,0,0,0,0,0,2,0,0,3,6.75,3,3,4,3,5,1,0,0,0,0,11,3,0,8,0,0,1,0,28,41,69,1,2.25,0.2941,0.0588,0.2353,11.25,2.25,5.0,9.0,6.75,0.25,1.25,0.125,1.0,0.272727,0.0,0.727273,0.0,0.375,0.375,0.333333,0.0,11.17003,4.94875,3.579965,9.213614,3.329381,-4.42003,-2.37387,-0.224945,-1.79956,0.018444,-0.917,-0.090483,0.742871,0.6165,0.6165,1.23867,-0.151255,0.176083,0.3043,0.6522,0.4203,0.5,0.8667,0.6897,0.3333,0.5294,0.1304,0.1739,0.3043,152.453755,252.136193,115.365787,20.75,-0.063366,0.099915,0.061665,0.16158,0.6364,0.0,0.3636,0.1818,0.3636,0.4545,0.5294,0.06,127.605936,69.178598,127.182313,68.948823,184.458967,106.544183,526.19272,102.688806,96.068125,42.866351,139.035647,64.556792,0.0,195.158745,278.841493,158.422763,0.0,144.946078,70.597484,141.414392,90.590271,19.960727,2,0.1818,108.558,4,0.3636,0.5362,95.2432,0.2899,86.3,,,0.1739,78.9167,,,,,,,,,-1.17706,-0.841425,,0.149604,,,,-3.181243,-4.207125,,1.2467,,,,0.5362,,,,,0.2899,0.1739,,,,,,,95.148649,,,,,86.140002,78.916667,,,,,,-4.331351,,,,,3.207,6.499166,,,,,,11.064865,,,,,1.112,-6.1675,,,,,,-1.135764,,,,,-0.841986,0.149768,,,,,,-3.069632,,,,,-4.209929,1.248063,,,,,,0.3095,0.5926,0.4203,0.4615,0.875,0.6897,0.3913,4.203017,,,,65.676804,43.547036,68.670525,56.85852,60.852911,47.748304,,,,37.270995,56.392415,28.254164,,,,,,,,,,53.382242,55.093864,46.290163,-0.04067,,,,116.85,116.066667,111.291667,124.778378,97.17027,105.164865,,,,129.79,89.87,92.68,,,,,,,,,,,,,124.852174,98.34058,102.611594,Matt Bush,,8.0,0.0,0.0,4.25,1.0,,7.623188,,,7.5,7.5,,,0.0,0.0,Setup Pitcher,Matt Bush2021,unclassified,
3,1247,493603,Adam Ottavino,Adam Ottavino,R,2021,35,BOS,RP,69,0,62.0,276,7,3,0,0,11,6,22,35,12,54,4.209677,31,29,55,5,71,35,2,7,4,0,163,64,33,63,6,3,8,1,441,705,1146,36,5.225806,0.2572,0.1268,0.1304,10.306452,5.080645,2.0286,7.983871,0.725806,0.235043,1.451613,0.316456,0.7333,0.4,0.20625,0.39375,0.095238,1.015873,0.079365,0.125,0.333333,3.960353,4.704783,4.232883,4.405717,4.587693,0.249325,5.72853,0.575616,4.60492,2.0225,6.566,0.662273,1.81092,1.46913,1.67914,1.47384,0.271568,0.845268,0.2589,0.6106,0.4031,0.56,0.8432,0.7359,0.4101,0.5543,0.1065,0.2103,0.3168,91.757912,91.912456,109.726285,25.7287,0.65277,-0.315426,0.392581,0.077155,0.3742,0.3681,0.2577,0.2086,0.5583,0.2331,0.4022,0.01,111.60896,149.135022,116.515409,155.690891,74.837638,94.515001,56.579862,96.545031,111.562984,108.522408,101.959474,94.683296,100.667703,105.660164,59.014072,93.165358,105.947037,102.70719,108.386628,72.529099,86.054999,11.843938,8,0.0491,118.388,48,0.2945,0.4843,95.0072,0.4843,80.6126,0.0175,87.2,,,0.014,88.0625,,,,,0.0035,,1.55849,-1.06935,-1.54758,,0.178853,,,0.280809,-0.192676,-7.7379,,1.117831,,,0.2063,0.0174,0.0017,,0.2776,0.4613,0.0218,,,0.0139,,,,95.467085,87.184998,93.150002,,94.584639,80.634154,78.816001,,,88.018753,,,-3.731477,1.5095,-7.995,,-9.394483,10.013056,9.1448,,,-9.5075,,,8.39481,3.8445,3.665,,4.19536,-0.336358,-4.8584,,,1.631875,,,4.764778,-1.536283,-0.133056,,-2.947103,-1.6067,0.543392,,,0.179081,,,2.010455,-7.681416,-6.652782,,-0.923857,-0.303151,2.173569,,,1.119256,,,0.2143,0.5806,0.4021,0.4667,0.8275,0.7338,0.5126,3.760607,33.992684,57.361722,23.745311,29.040932,60.903863,23.285248,52.565306,64.096603,43.878738,50.630324,63.112632,48.859646,63.3282,65.8673,51.180623,,,,25.858131,60.704361,25.532024,20.0,51.80598,20.0,57.975465,67.552835,48.063697,-0.33561,94.6,76.68125,83.01875,109.790909,85.509091,93.295455,110.75443,99.293671,99.5,104.252351,97.196552,98.21442,129.091132,92.866981,107.81283,,,,54.627778,72.25,66.583333,,,,,,,116.269527,94.625394,102.238617,Adam Ottavino,,8.0,0.0,0.0,4.028986,1.130435,,7.979058,,,7.855072,7.985507,,,0.0,0.0,Setup Pitcher,Adam Ottavino2021,unclassified,
4,1943,425844,Zack Greinke,Zack Greinke,R,2021,37,HOU,RP,1,0,2.1,9,0,0,0,0,0,0,1,1,0,1,7.714297,2,2,2,1,3,0,0,0,0,0,6,4,1,1,0,0,0,0,9,19,28,2,7.714297,0.3333,0.0,0.3333,11.571446,0.0,3.0,7.714297,3.857149,0.222222,0.857144,0.2,0.0,0.666667,0.166667,0.166667,0.0,4.0,1.0,0.0,0.0,6.170035,1.354753,1.353345,6.182322,2.15291,1.544262,-0.672134,-0.064591,-0.516731,0.11346,-0.77,-0.079218,1.31551,1.30787,1.73,2.1148,0.067716,0.018532,0.375,0.8333,0.5714,0.3333,0.9,0.6875,0.4286,0.5556,0.1786,0.1071,0.2857,179.886308,145.603051,32.287565,25.2105,-0.093767,0.034719,-0.063894,-0.029175,0.5,0.1667,0.3333,0.0,0.6667,0.3333,0.4444,0.00232,144.620061,0.0,130.816285,0.0,,91.323719,300.681995,91.278939,65.875383,68.586162,0.0,157.805492,81.347639,44.723879,743.577316,124.475028,47.970464,132.867239,129.42872,103.703888,96.761332,1.550577,0,0.0,103.903,4,0.6667,0.3214,89.2222,0.1429,85.25,,,0.0714,69.0,0.4643,86.6923,,,,,,,-0.02073,0.206203,,-0.086155,-0.289702,,,-0.230336,5.155075,,-4.30777,-2.228477,,,0.2857,,,,0.0357,0.1429,0.0714,,,0.4643,,,,89.037498,,,,91.400002,85.275002,68.900002,,,86.653846,,,-1.36375,,,,-7.74,3.925,6.2,,,-7.57,,,10.6825,,,,5.78,3.755,-6.93,,,2.31,,,-0.020151,,,,0.0,0.205336,-0.085507,,,-0.26333,,,-0.251883,,,,0.0,5.133402,-4.275334,,,-2.025618,,,0.2727,0.7647,0.5714,0.0,0.8462,0.6875,0.6071,3.61824,47.81874,53.05141,37.92299,62.09333,31.20845,69.99561,80.0,38.51428,80.0,80.0,25.64894,80.0,20.0,23.11389,29.84702,,,,,,,,,,59.45348,42.84874,62.58997,-0.43052,165.5,87.8,96.2,65.3,111.9,114.5,73.1,124.4,118.8,90.2,167.1,160.0,112.3,86.7,93.5,,,,,,,,,,,,,121.653571,102.653571,105.857143,Zack Greinke,2.0,7.0,9.172414,6.068966,9.0,3.0,3.564927,7.75,1.0,6.068966,7.0,9.0,0.0,0.001451,0.0,0.0,Setup Pitcher,Zack Greinke2021,unclassified,


In [11]:
import pandas as pd
from scipy.stats import zscore
from src.const import quadrant1_weights, quadrant2_weights, quadrant3_weights, quadrant4_weights

def categorize_pitchers(pitchers_df, pitcher_type, weights):
    """
    Categorizes pitchers into elite, strong, average, and suboptimal groups based on weighted z-scores of given metrics.
    
    Parameters:
    - pitchers_df (pd.DataFrame): DataFrame containing pitcher data.
    - pitcher_type (str): Type of pitcher ('SP' for starting pitchers or 'RP' for relief pitchers).
    - weights (dict): Dictionary of weights for different metrics.

    Returns:
    - Tuple of DataFrames: (elite, strong, average, suboptimal)
    """
    
    # Validate pitcher type
    pitcher_type = pitcher_type.upper()
    if pitcher_type not in ['SP', 'RP']:
        raise ValueError("pitcher_type must be 'SP' or 'RP'")

    # Check if the DataFrame is empty
    if pitchers_df.empty:
        raise ValueError("The provided DataFrame is empty.")
    
    # Check if the 'IP' column exists
    if 'IP' not in pitchers_df.columns:
        raise ValueError("'IP' column not found in DataFrame.")

    # Filter out pitchers below the mean IP
    #mean_ip = pitchers_df['IP'].mean()
    #pitchers_df = pitchers_df[pitchers_df['IP'] >= mean_ip]

    # Prepare for z-score calculation
    metric_cols = [col for col in weights.keys() if col in pitchers_df.columns]
    missing_cols = set(weights.keys()) - set(metric_cols)
    for col in missing_cols:
        print(f"Warning: Column {col} not found in DataFrame.")

    # Standardize the metric columns
    standard_scaler = StandardScaler()
    pitchers_df[metric_cols] = standard_scaler.fit_transform(pitchers_df[metric_cols])

    # Calculate z-scores for existing metrics
    z_scores = pitchers_df[metric_cols].apply(zscore)

    # Calculate weighted scores
    for col in metric_cols:
        z_scores[col] *= weights[col]
    pitchers_df['z_total_score'] = z_scores.sum(axis=1)

    # Define thresholds for categorization
    elite_threshold = pitchers_df['z_total_score'].quantile(.70)
    strong_threshold = pitchers_df['z_total_score'].quantile(.40)
    average_threshold = pitchers_df['z_total_score'].quantile(.10)

    # Categorize pitchers
    elite = pitchers_df[pitchers_df['z_total_score'] >= elite_threshold]
    strong = pitchers_df[(pitchers_df['z_total_score'] < elite_threshold) & (pitchers_df['z_total_score'] >= strong_threshold)]
    average = pitchers_df[(pitchers_df['z_total_score'] < strong_threshold) & (pitchers_df['z_total_score'] >= average_threshold)]
    suboptimal = pitchers_df[pitchers_df['z_total_score'] < average_threshold]

    return elite, strong, average, suboptimal

concatenated_weights = {**quadrant1_weights, **quadrant2_weights, **quadrant3_weights, **quadrant4_weights}


elite_sp, strong_sp, average_sp, suboptimal_sp = categorize_pitchers(starting_pitchers, 'SP', concatenated_weights)
elite_rp, strong_rp, average_rp, suboptimal_rp = categorize_pitchers(relief_pitchers, 'RP', concatenated_weights)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitchers_df[metric_cols] = standard_scaler.fit_transform(pitchers_df[metric_cols])


  pitchers_df['z_total_score'] = z_scores.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitchers_df['z_total_score'] = z_scores.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitchers_df[metric_cols] = standard_scaler.fit_transform(pitchers_df[metric_cols])
  pitchers_df['z_total_score'] = z_scores.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

In [12]:
# sort by year and total score
elite_sp = elite_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
strong_sp = strong_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
average_sp = average_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
suboptimal_sp = suboptimal_sp.sort_values(['Season', 'z_total_score'], ascending=[False, False])

elite_rp = elite_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
strong_rp = strong_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
average_rp = average_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])
suboptimal_rp = suboptimal_rp.sort_values(['Season', 'z_total_score'], ascending=[False, False])

# Create a unique id for each pitcher
elite_sp['id'] = elite_sp['Name'] + elite_sp['Season'].astype(str)
strong_sp['id'] = strong_sp['Name'] + strong_sp['Season'].astype(str)
average_sp['id'] = average_sp['Name'] + average_sp['Season'].astype(str)
suboptimal_sp['id'] = suboptimal_sp['Name'] + suboptimal_sp['Season'].astype(str)

elite_rp['id'] = elite_rp['Name'] + elite_rp['Season'].astype(str)
strong_rp['id'] = strong_rp['Name'] + strong_rp['Season'].astype(str)
average_rp['id'] = average_rp['Name'] + average_rp['Season'].astype(str)
suboptimal_rp['id'] = suboptimal_rp['Name'] + suboptimal_rp['Season'].astype(str)


  elite_sp['id'] = elite_sp['Name'] + elite_sp['Season'].astype(str)
  strong_sp['id'] = strong_sp['Name'] + strong_sp['Season'].astype(str)
  average_sp['id'] = average_sp['Name'] + average_sp['Season'].astype(str)
  suboptimal_sp['id'] = suboptimal_sp['Name'] + suboptimal_sp['Season'].astype(str)
  elite_rp['id'] = elite_rp['Name'] + elite_rp['Season'].astype(str)
  strong_rp['id'] = strong_rp['Name'] + strong_rp['Season'].astype(str)
  average_rp['id'] = average_rp['Name'] + average_rp['Season'].astype(str)
  suboptimal_rp['id'] = suboptimal_rp['Name'] + suboptimal_rp['Season'].astype(str)


In [13]:
# Helper function to update fangraph DataFrame
def update_fangraph_classification(df, classification):
    # Create a temporary dictionary from the categorized DataFrame
    temp_dict = df.set_index('id')['z_total_score'].to_dict()
    
    # Update fangraph based on the unique id
    for pid, score in temp_dict.items():
        if pid in fangraph['id'].values:
            fangraph.loc[fangraph['id'] == pid, 'Classification'] = classification
            fangraph.loc[fangraph['id'] == pid, 'z_total_score'] = score

# Step 4: Update the fangraph DataFrame for each group
update_fangraph_classification(elite_sp, 'elite_sp')
update_fangraph_classification(strong_sp, 'strong_sp')
update_fangraph_classification(average_sp, 'average_sp')
update_fangraph_classification(suboptimal_sp, 'suboptimal_sp')

update_fangraph_classification(elite_rp, 'elite_rp')
update_fangraph_classification(strong_rp, 'strong_rp')
update_fangraph_classification(average_rp, 'average_rp')
update_fangraph_classification(suboptimal_rp, 'suboptimal_rp')




In [14]:
fangraph.isnull().sum()

PlayerId                                        0
pitcher                                         0
Name                                            0
NameASCII                                       0
Throws                                          0
Season                                          0
Age                                             0
Team                                            0
Role                                            0
G                                               0
GS                                              0
IP                                              0
TBF                                             0
W                                               0
L                                               0
CG                                              0
ShO                                             0
SV                                              0
BS                                              0
HLD                                             0


In [15]:
# save the fangraph DataFrame
fangraph.to_csv(CLEANED_DATA_DIR / 'fangraphs_zscores_merged.csv', index=False)