# MLB Pitcher Clustering

Import modules needed

In [23]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

Read from csv file

In [24]:
pitch = pd.read_csv("pitching.csv")

  pitch = pd.read_csv("pitching.csv")


Filter valid rows

In [25]:
pitch = pitch[pitch['stattype'].isin(['value', 'official'])]

In [26]:
pitch['season'] = pitch['date'].astype(str).str[:4].astype(int)


In [27]:
agg = pitch.groupby(['id', 'season']).agg({
    'p_ipouts': 'sum',
    'p_bfp': 'sum',
    'p_h': 'sum',
    'p_hr': 'sum',
    'p_w': 'sum',
    'p_iw': 'sum',
    'p_k': 'sum',
    'p_er': 'sum',
    'p_r': 'sum',
    'p_hbp': 'sum',
    'p_gs': 'sum',
    'p_gf': 'sum',
    'save': 'sum'
}).reset_index()


In [28]:
# print(pitch.columns.tolist())

In [29]:
# Innings pitched
agg['IP'] = agg['p_ipouts'] / 3

# Take the pitchers that have played at least 1 inning
agg = agg[agg['IP'] > 0]


In [30]:
# Compute strikeouts per 9 innings
agg['K_9'] = 9 * agg['p_k'] / agg['IP']

# Compute bases on balls per 9 innings
agg['BB_9'] = 9 * (agg['p_w'] - agg['p_iw']) / agg['IP']

# Compute homeruns 9 innings
agg['HR_9'] = 9 * agg['p_hr'] / agg['IP']

# Compute batters hit per 9 innings
agg['HBP_9'] = 9 * agg['p_hbp'] / agg['IP']

# Compute walks and hits per inning pitched
agg['WHIP'] = (agg['p_h'] + agg['p_w']) / agg['IP']

# Compute the earned run average
agg['ERA']  = 9 * agg['p_er'] / agg['IP']

In [31]:
# Compute starting percentage
agg['starter_pct'] = agg['p_gs'] / (agg['p_gs'] + agg['p_gf'] + 1e-6)

# Compute closing percentage
agg['closer_pct']  = agg['p_gf'] / (agg['p_gs'] + agg['p_gf'] + 1e-6)

# Compute the save percentage
agg['save_pct']    = agg['save'] / (agg['p_gs'] + agg['p_gf'] + 1e-6)


In [32]:
features = [
    'K_9', 'BB_9', 'HR_9',
    'WHIP', 'ERA',
    'starter_pct', 'closer_pct', 'save_pct'
]

In [33]:
X = agg[features].replace([np.inf, -np.inf], np.nan).fillna(0)


In [34]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [35]:
kmeans = KMeans(n_clusters=6, random_state=42, n_init='auto')
labels = kmeans.fit_predict(X_scaled)

In [36]:
agg['cluster'] = labels

In [None]:
score = silhouette_score(X_scaled, labels)
print("Silhouette Score:", score)

In [None]:
agg.groupby('cluster')[features].mean()

In [None]:
agg['role'] = agg['cluster'].map({
    0: 'starter',
    1: 'reliever',
    2: 'starter',
    3: 'middle_relief',
    4: 'closer',
    5: 'noise'
})

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Run PCA
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

agg['PCA1'] = pca_components[:, 0]
agg['PCA2'] = pca_components[:, 1]

# Plot
plt.figure(figsize=(10,8))
for cluster in sorted(agg['cluster'].unique()):
    subset = agg[agg['cluster'] == cluster]
    plt.scatter(subset['PCA1'], subset['PCA2'], s=30, alpha=0.7, label=f"Cluster {cluster}")

plt.title("PCA Visualization of Pitcher Clusters")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# from sklearn.manifold import TSNE

# tsne = TSNE(
#     n_components=2,
#     perplexity=30,
#     learning_rate=200,
#     random_state=42,
#     init='pca'
# )

# tsne_results = tsne.fit_transform(X_scaled)

# agg['TSNE1'] = tsne_results[:, 0]
# agg['TSNE2'] = tsne_results[:, 1]

# plt.figure(figsize=(10,8))
# for cluster in sorted(agg['cluster'].unique()):
#     subset = agg[agg['cluster'] == cluster]
#     plt.scatter(subset['TSNE1'], subset['TSNE2'], s=30, alpha=0.7, label=f"Cluster {cluster}")

# plt.title("t-SNE Visualization of Pitcher Clusters")
# plt.xlabel("t-SNE Component 1")
# plt.ylabel("t-SNE Component 2")
# plt.legend()
# plt.grid(True)
# plt.show()


In [None]:
import numpy as np

loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC1', 'PC2'],
    index=features
)

loadings


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# ------------------------------------------------
# 1. Run PCA to 3 components
# ------------------------------------------------
pca_3d = PCA(n_components=3)
pca_components_3d = pca_3d.fit_transform(X_scaled)

agg['PC1'] = pca_components_3d[:, 0]
agg['PC2'] = pca_components_3d[:, 1]
agg['PC3'] = pca_components_3d[:, 2]

# ------------------------------------------------
# 2. Prepare colors
# ------------------------------------------------
unique_clusters = sorted(agg['cluster'].unique())
palette = sns.color_palette("tab10", n_colors=len(unique_clusters))

cluster_colors = {cluster: palette[i] for i, cluster in enumerate(unique_clusters)}

# ------------------------------------------------
# 3. 3D scatter plot
# ------------------------------------------------
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

for cluster in unique_clusters:
    subset = agg[agg['cluster'] == cluster]
    ax.scatter(
        subset['PC1'], subset['PC2'], subset['PC3'],
        s=40,
        color=cluster_colors[cluster],
        alpha=0.75,
        label=f"Cluster {cluster}"
    )

ax.set_title("3D PCA Visualization of Pitcher Clusters", fontsize=15)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")

ax.legend()
plt.show()


In [None]:
# %matplotlib notebook