In [28]:
# Imports and File Paths
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px

# Relative path to the raw batter CSV that the load cell reads with pd.read_csv.
# NOTE(review): the file name suggests seasons 2015–2024 with the COVID season
# removed — confirm against the data source.
INPUT_FILE = "../MLBDATA/Raw/BatterData/BattingData2015-2024NoCovid.csv"


In [29]:
# Load and Initial Cleanup
df = pd.read_csv(INPUT_FILE)

print(df.columns)

# Columns kept out of the feature set: player age, stolen-base pct, the
# swing-speed / blast / squared-up / swing-length / swords columns, and the
# raw counting stats.
to_drop = [
    'player_age', 'r_stolen_base_pct',
    'avg_swing_speed', 'fast_swing_rate', 'blasts_contact', 'blasts_swing',
    'squared_up_contact', 'squared_up_swing', 'avg_swing_length', 'swords',
    'ab', 'pa', 'hit', 'single', 'double', 'triple', 'home_run',
    'strikeout', 'walk', 'batted_ball',
]

# Keep only the pre-ban seasons 2016–2022, then discard the unwanted columns
# and finally `year` itself, which is only needed for the season filter.
# errors='ignore' tolerates names in to_drop that are absent from the CSV.
df = (
    df.loc[df['year'].between(2016, 2022)]
      .drop(columns=to_drop, errors='ignore')
      .drop(columns=['year'])
      .copy()
)

# Independent snapshot of the cleaned frame for the cells that follow.
# NOTE(review): nothing here removes rows with missing values — this is a
# plain copy. Add an explicit dropna() if incomplete rows must be excluded.
initialDFClean = df.copy()

initialDFClean.head()

Index(['last_name, first_name', 'player_id', 'year', 'player_age', 'ab', 'pa',
       'hit', 'single', 'double', 'triple', 'home_run', 'strikeout', 'walk',
       'k_percent', 'bb_percent', 'batting_avg', 'slg_percent',
       'on_base_percent', 'on_base_plus_slg', 'isolated_power', 'babip',
       'b_rbi', 'xba', 'xslg', 'woba', 'xwoba', 'xobp', 'xiso',
       'avg_swing_speed', 'fast_swing_rate', 'blasts_contact', 'blasts_swing',
       'squared_up_contact', 'squared_up_swing', 'avg_swing_length', 'swords',
       'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent',
       'barrel_batted_rate', 'solidcontact_percent', 'flareburner_percent',
       'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent',
       'hard_hit_percent', 'whiff_percent', 'pull_percent',
       'straightaway_percent', 'opposite_percent', 'groundballs_percent',
       'flyballs_percent', 'linedrives_percent', 'popups_percent',
       'sprint_speed'],
      dtype='object')


Unnamed: 0,"last_name, first_name",player_id,k_percent,bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,babip,...,hard_hit_percent,whiff_percent,pull_percent,straightaway_percent,opposite_percent,groundballs_percent,flyballs_percent,linedrives_percent,popups_percent,sprint_speed
311,"Ortiz, David",120074,13.7,12.8,0.315,0.62,0.401,1.021,0.305,0.312,...,47.2,19.4,47.6,35.8,16.6,33.8,28.4,29.0,8.7,23.6
312,"Beltré, Adrian",134181,10.3,7.5,0.3,0.521,0.358,0.879,0.221,0.293,...,40.4,15.2,36.9,37.5,25.6,41.2,24.0,25.2,9.6,25.9
313,"Beltrán, Carlos",136860,17.0,5.9,0.295,0.513,0.337,0.85,0.218,0.315,...,38.5,19.6,43.7,34.1,22.2,42.4,22.2,26.6,8.8,25.7
314,"Werth, Jayson",150029,22.9,11.7,0.244,0.417,0.335,0.752,0.173,0.288,...,44.6,20.9,41.1,35.7,23.2,41.3,26.5,25.3,6.9,25.8
315,"Pierzynski, A.J.",150229,11.2,2.3,0.219,0.304,0.243,0.547,0.085,0.237,...,33.9,18.3,33.0,41.2,25.8,51.6,16.7,26.7,5.0,24.5


In [30]:
# Compute Player-Level Averages (2016–2022)

# The two identifier columns define the groups; every other column is a feature
player_keys = ['player_id', 'last_name, first_name']
feature_cols = [col for col in initialDFClean.columns if col not in player_keys]

# Collapse to one row per player: the mean of each feature over that player's
# rows. as_index=False keeps the identifiers as ordinary leading columns.
player_feats = initialDFClean.groupby(player_keys, as_index=False)[feature_cols].mean()

# Inspect
player_feats.head()


Unnamed: 0,player_id,"last_name, first_name",k_percent,bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,babip,...,hard_hit_percent,whiff_percent,pull_percent,straightaway_percent,opposite_percent,groundballs_percent,flyballs_percent,linedrives_percent,popups_percent,sprint_speed
0,120074,"Ortiz, David",13.7,12.8,0.315,0.62,0.401,1.021,0.305,0.312,...,47.2,19.4,47.6,35.8,16.6,33.8,28.4,29.0,8.7,23.6
1,134181,"Beltré, Adrian",14.566667,8.2,0.295,0.495667,0.356333,0.852,0.200667,0.308667,...,36.766667,19.1,35.233333,38.166667,26.2,41.166667,24.666667,27.166667,6.966667,25.0
2,136860,"Beltrán, Carlos",18.5,6.2,0.263,0.448,0.31,0.758,0.185,0.289,...,34.75,19.7,42.75,35.5,21.5,43.15,23.25,24.75,8.85,25.5
3,150029,"Werth, Jayson",23.4,11.9,0.235,0.405,0.3285,0.7335,0.17,0.279,...,42.65,21.4,40.4,36.35,23.3,40.75,24.65,25.4,9.15,25.7
4,150229,"Pierzynski, A.J.",11.2,2.3,0.219,0.304,0.243,0.547,0.085,0.237,...,33.9,18.3,33.0,41.2,25.8,51.6,16.7,26.7,5.0,24.5


In [31]:
# Standardize & PCA
id_cols = ['player_id', 'last_name, first_name']

# The clustering and export cells attach 'cluster' and 'Archetype' to
# player_feats in place. Exclude them here so that re-running this cell without
# restarting the kernel does not feed a cluster label into the PCA (or fail on
# the string-valued 'Archetype' column). On a fresh top-to-bottom run neither
# column exists yet, so the feature set is unchanged.
label_cols = ['cluster', 'Archetype']
non_feature_cols = set(id_cols) | set(label_cols)
feature_cols = [c for c in player_feats.columns if c not in non_feature_cols]

X = player_feats[feature_cols].copy()

# Standardize features to zero mean/unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA: keep enough components to explain 80% of variance
pca = PCA(n_components=0.80, random_state=42)
X_pca_full = pca.fit_transform(X_scaled)

# Print explained variance per PC, labelled PC1..PCn for the retained components
explained = pd.Series(
    pca.explained_variance_ratio_,
    index=[f'PC{i+1}' for i in range(pca.n_components_)]
)
print("Explained variance per PC:\n", explained)


Explained variance per PC:
 PC1    0.343576
PC2    0.222845
PC3    0.106312
PC4    0.064414
PC5    0.050566
PC6    0.044030
dtype: float64


In [32]:
# Examine PCA Loadings
# Transposed component matrix: one row per input feature, one column per
# retained principal component.
loadings = pd.DataFrame(pca.components_.T, index=feature_cols, columns=explained.index)

# For each of the first three PCs, report the five largest and the five
# smallest (most negative) feature loadings.
for pc, weights in loadings.iloc[:, :3].items():
    strongest_pos = weights.sort_values(ascending=False).head(5)
    strongest_neg = weights.sort_values(ascending=True).head(5)
    print(f"\n{pc} — Top 5 Positive Loadings:\n{strongest_pos}")
    print(f"\n{pc} — Top 5 Negative Loadings:\n{strongest_neg}")



PC1 — Top 5 Positive Loadings:
xiso              0.273643
xslg              0.270155
isolated_power    0.262971
slg_percent       0.257939
xwoba             0.256710
Name: PC1, dtype: float64

PC1 — Top 5 Negative Loadings:
poorlytopped_percent   -0.173069
groundballs_percent    -0.169813
poorlyweak_percent     -0.127403
opposite_percent       -0.104301
straightaway_percent   -0.070418
Name: PC1, dtype: float64

PC2 — Top 5 Positive Loadings:
batting_avg            0.292219
xba                    0.279945
babip                  0.241115
on_base_percent        0.226275
groundballs_percent    0.214783
Name: PC2, dtype: float64

PC2 — Top 5 Negative Loadings:
poorlyunder_percent   -0.266326
launch_angle_avg      -0.257847
popups_percent        -0.247618
pull_percent          -0.235216
flyballs_percent      -0.222724
Name: PC2, dtype: float64

PC3 — Top 5 Positive Loadings:
whiff_percent          0.361030
k_percent              0.330301
hard_hit_percent       0.263418
groundballs_percent 

In [33]:
# Determine Optimal K & Cluster
# Use just the first 3 PC dimensions
X_pca = X_pca_full[:, :3]

# Silhouette scores for k = 2 through 7. This is a diagnostic only: the value
# it selects (best_k) is printed but is NOT what the final model uses.
sil_scores = []
for k in range(2, 8):
    trial_labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_pca)
    sil_scores.append((k, silhouette_score(X_pca, trial_labels)))
best_k = max(sil_scores, key=lambda pair: pair[1])[0]
print("Silhouette scores:", sil_scores)
print("Best k by silhouette:", best_k)

# Number of clusters for the final model, pinned by hand rather than taken from
# best_k. The archetype label maps in the plotting and export cells define
# exactly three labels (cluster ids 0, 1, 2), so changing this value requires
# updating those maps as well.
# NOTE(review): the recorded run prefers k=2 by silhouette — confirm that
# overriding it with 3 is intentional.
N_CLUSTERS = 3

# Fit the final K-Means model and store each player's cluster id
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42).fit(X_pca)
player_feats['cluster'] = kmeans.labels_


Silhouette scores: [(2, np.float64(0.2827345653162998)), (3, np.float64(0.2605321391872537)), (4, np.float64(0.23160803245912473)), (5, np.float64(0.22597670730597508)), (6, np.float64(0.23450770026662565)), (7, np.float64(0.24398265616012668))]
Best k by silhouette: 2


In [34]:
# Cluster Summaries
# Axis labels given to the three retained principal components
pc_labels = ['Power (PC1)', 'Quality Of Contact (PC2)', 'Swing Control (PC3)']

# PC scores per player, plus the cluster id assigned by K-Means
pc_df = pd.DataFrame(X_pca, columns=pc_labels)
pc_df['cluster'] = player_feats['cluster']

# Mean PC score of each cluster, and the number of players in each cluster
cluster_means = pc_df.groupby('cluster')[pc_labels].mean()
cluster_counts = pc_df['cluster'].value_counts().sort_index()

print("\nCluster Means:\n", cluster_means)
print("\nCluster Sizes:\n", cluster_counts)



Cluster Means:
          Power (PC1)  Quality Of Contact (PC2)  Swing Control (PC3)
cluster                                                            
0          -3.091149                  1.521547            -0.104278
1           3.499754                  1.442026             0.265681
2           0.165805                 -2.547283            -0.109419

Cluster Sizes:
 cluster
0    233
1    194
2    249
Name: count, dtype: int64


In [35]:
# 3D PCA Scatter Plot with Updated Archetypes
# Human-readable name for each K-Means cluster id.
# NOTE(review): cluster ids are arbitrary K-Means labels; re-check this mapping
# against the cluster means whenever the data, seed or k changes.
cluster_map = {
    0: 'Contact Grinders',
    1: 'Power-Contact Hitters',
    2: 'Fly-Ball Sluggers'
}
pc_df['Archetype'] = pc_df['cluster'].map(cluster_map)

# Sample up to 20 points per archetype.
# Each group is sampled explicitly and the pieces are concatenated, instead of
# using groupby(...).apply(...): apply's handling of the grouping column is
# deprecated in recent pandas, and once the grouping column is excluded the
# 'Archetype' column used for colouring below would be lost. Groups are visited
# in sorted key order and each uses the same random_state, so the selected rows
# and their order match the apply-based version.
max_per_archetype = 20
sampled = pd.concat(
    [
        grp.sample(n=min(len(grp), max_per_archetype), random_state=2)
        for _, grp in pc_df.groupby('Archetype')
    ],
    ignore_index=True,
)

fig = px.scatter_3d(
    sampled,
    x='Power (PC1)', y='Quality Of Contact (PC2)', z='Swing Control (PC3)',
    color='Archetype',
    title='3D PCA Scatter of Hitters by Archetype',
    # Display names shown on the axes and legend in place of the column names
    labels={
        'Power (PC1)':               'Slugging',
        'Quality Of Contact (PC2)':  'Contact',
        'Swing Control (PC3)':       'Swing Control',
        'Archetype':                 'Hitter Archetype'
    }
)
fig.show()






In [None]:
# Attach Archetype Labels & Export Player-Level Archetypes

# Map numeric clusters to interpreted archetypes.
# The ids follow the recorded cluster means: cluster 1 has the highest mean PC1
# together with a positive mean PC2 (power plus contact), while cluster 2 has a
# strongly negative mean PC2, whose negative loadings are launch angle,
# pop-ups, pull and fly balls. Labels 1 and 2 were previously swapped here,
# which mislabelled those players in the exported file.
# NOTE(review): cluster ids are arbitrary K-Means labels; re-check this mapping
# against the cluster means whenever the data, seed or k changes.
cluster_map = {
    0: 'Contact & Speed Grinders',
    1: 'Power-Contact Hitters',
    2: 'Fly-Ball Sluggers'
}
player_feats['Archetype'] = player_feats['cluster'].map(cluster_map)

# Reorder columns so identifiers and cluster info come first
cols = ['player_id', 'last_name, first_name', 'cluster', 'Archetype'] + \
       [c for c in player_feats.columns if c not in ('player_id','last_name, first_name','cluster','Archetype')]
player_archetypes = player_feats[cols]

# Export to CSV (without the DataFrame index) and report how many players were written
output_file = "../MLBDATA/Processed/BatterData/PlayerArchetypesClusters2016-2022NoCovid.csv"
player_archetypes.to_csv(output_file, index=False)
print(f"Saved {len(player_archetypes)} players with archetypes to '{output_file}'")

Saved 676 players with archetypes to '../MLBDATA/Processed/BatterData/PlayerArchetypesClusters2016-2022NoCovid.csv'


: 