# MLB Playoff Clustering (2007–2024)
This notebook uses K-Means clustering and PCA to analyze MLB playoff teams from 2007 through 2024. Advanced stats such as OPS+, xFIP, Bullpen WPA, and Contact % are used to group teams into archetypes and to compare how often each archetype has gone on to win the World Series.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
# Upload the Excel workbook through the Colab file picker.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the picker; fail with an actionable message
    # rather than an IndexError/StopIteration on the filename lookup below.
    raise ValueError('No file was uploaded; upload the Excel workbook and re-run this cell.')
# Only the first uploaded file is used; any additional uploads are ignored.
file_name = next(iter(uploaded))

# Read the stat sheet and the Results sheet in one pass over the workbook
# (a list for `sheet_name` returns a dict of DataFrames keyed by sheet name).
# NOTE(review): the stat sheet is named '2024' while the notebook title covers
# 2007–2024 — confirm this sheet actually holds every season being analyzed.
sheets = pd.read_excel(file_name, sheet_name=['2024', 'Results'])
df = sheets['2024']
results_df = sheets['Results']

## Clean and Prepare Data

In [None]:
# Strip stray whitespace from the column headers of both sheets so the
# column names referenced below match exactly.
df.columns = df.columns.str.strip()
results_df.columns = results_df.columns.str.strip()

# Stat columns used as clustering features
stats_cols = ['OPS+', 'OPS In High Leverage', 'Contact %', 'Bullpen WPA',
              'K-BB %', 'xFIP', 'DRS', 'DIFF', 'BsR', 'RE24', 'RP LOB %']

# Coerce every feature column to numeric; values that cannot be parsed
# become NaN so they are removed by the dropna step below.
df[stats_cols] = df[stats_cols].apply(pd.to_numeric, errors='coerce')

# Keep only rows with a complete feature vector (copy so later column
# assignments do not write into a view of `df`).
df_clean = df.dropna(subset=stats_cols).copy()

# Attach each team's playoff result by (Year, Team). A left join keeps every
# team row; teams with no match in the Results sheet get NaN in 'Result'.
# validate='many_to_one' raises if the Results sheet repeats a (Year, Team)
# key, which would otherwise silently duplicate team rows and inflate the
# cluster sizes and win counts computed later.
df_clean = df_clean.merge(
    results_df[['Year', 'Team', 'Result']],
    on=['Year', 'Team'],
    how='left',
    validate='many_to_one',
)

## K-Means Clustering and PCA

In [None]:
# Standardize the features so stats on different scales contribute
# comparably to the distance computations in K-Means and to PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_clean[stats_cols])

# Cluster the teams into 4 groups. n_init is pinned explicitly because its
# default changed across scikit-learn releases (10 -> 'auto'); leaving it
# unset makes the cluster labels depend on the installed version even with a
# fixed random_state.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df_clean['Cluster'] = kmeans.fit_predict(X_scaled)

# Project the same standardized features onto 2 principal components.
# This is used only for plotting; the clustering above ran on all features.
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)
df_clean['PCA1'] = components[:, 0]
df_clean['PCA2'] = components[:, 1]

# Scatter the teams in PCA space, colored by their K-Means cluster label
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_clean, x='PCA1', y='PCA2', hue='Cluster', palette='tab10')
plt.title('K-Means Clusters of 2007–2024 Playoff Teams')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.grid(True)
plt.show()

## Analyze Playoff Results by Cluster

In [None]:
# Count how many teams in each cluster ended with each playoff result.
# Rows are clusters, columns are the distinct 'Result' values; combinations
# that never occur are filled with 0.
result_counts = df_clean.groupby('Cluster')['Result'].value_counts()
cluster_results = result_counts.unstack().fillna(0).astype(int)

# Share of each cluster's counted teams whose result is 'WS WIN'.
# Despite the '%' in the column name this is a 0–1 fraction rounded to two
# decimals; if no 'WS WIN' column exists the share is 0 for every cluster.
teams_per_cluster = cluster_results.sum(axis=1)
ws_wins = cluster_results.get('WS WIN', 0)
cluster_results['WS Win %'] = (ws_wins / teams_per_cluster).round(2)
cluster_results