<a href="https://colab.research.google.com/github/Benk01/Baseball-statistical-analysis/blob/main/neural_network_pitch_classifcation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classifying Baseball Pitches with PyTorch Neural Networks

This notebook uses Statcast-level metrics to train a neural network to identify pitches such as fastballs, sliders, and curveballs.

### Features considered:
- Release speed
- Spin rate
- Horizontal/Vertical break (`pfx_x`, `pfx_z`)
- Release extension


In [None]:
# Install required packages
!pip install pybaseball --quiet

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from pybaseball import statcast
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pull pitch-level Statcast rows for a ten-day window through pybaseball
data = statcast(start_dt="2025-05-01", end_dt="2025-05-10")

# Model inputs, in the column order used downstream
features = [
    'release_speed',      # pitch speed at time of release
    'release_spin_rate',  # pitch spin rate
    'pfx_x',              # horizontal break
    'pfx_z',              # vertical break
    'release_extension',  # release extension
]
label_col = 'pitch_type'

# Pitch-type abbreviations -> full names; only these codes are kept for modelling
pitch_type_names = dict(
    FF='Four-Seam Fastball',
    SL='Slider',
    CU='Curveball',
    CH='Changeup',
    SI='Sinker',
    FC='Cutter',
    FS='Splitter',
)
valid_pitch_types = set(pitch_type_names)

# Discard rows missing any feature or the label, then keep only the supported
# pitch types and the columns the model needs
wanted_cols = features + [label_col]
data = data.dropna(subset=wanted_cols)
is_supported = data[label_col].isin(valid_pitch_types)
df = data.loc[is_supported, wanted_cols].copy()


In [None]:
# Encode pitch-type codes as integer class ids so they can be used by the network.
# The encoded labels go straight into `y` instead of being written back into
# `df`: overwriting the column made this cell fail on a second run, because the
# encoder was then fit on integers and the pitch_type_names lookup below raised
# KeyError. Leaving `df` untouched keeps the cell safe to re-run.
le = LabelEncoder()
y = le.fit_transform(df[label_col])
# Human-readable class names, in the same order as the encoder's class ids
target_names = [pitch_type_names[code] for code in le.classes_]

# Standardize each feature to mean 0 / standard deviation 1 so that features
# measured on very different scales contribute comparably.
# NOTE(review): the scaler is fit on every row before the train/test split that
# happens later in the notebook, so test-set statistics influence the scaling.
scaler = StandardScaler()
X = scaler.fit_transform(df[features])

# Convert to tensors: float32 inputs, int64 (long) class ids for the loss
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

In [None]:
# Dataset wrapper pairing each feature row with its pitch-type label
class PitchDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel feature and label containers.

    Args:
        X: Indexable, sized container of feature rows (a tensor in this notebook).
        y: Indexable, sized container of labels, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        # Fail fast: a mismatch would otherwise only surface later as an
        # indexing error (or silently ignore the extra labels).
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the input data (X_tensor) and target labels (y_tensor) in a dataset
dataset = PitchDataset(X_tensor, y_tensor)

# Split into training and test sets (80/20).
# The split is drawn from a seeded generator so the same rows land in the test
# set on every run; without it the reported metrics change from run to run.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_set, test_set = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are reshuffled every epoch, using a seeded generator so the
# batch order is repeatable; the test loader keeps its order.
train_loader = DataLoader(
    train_set,
    batch_size=64,
    shuffle=True,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
test_loader = DataLoader(test_set, batch_size=64)


In [None]:
# Custom neural network class
class PitchClassifier(nn.Module):
    """Fully connected classifier: Linear layers separated by ReLU activations.

    The ReLU activations (x if x > 0 else 0) are what make the model
    non-linear; a stack of only Linear layers collapses to a single linear map.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output classes (pitch types).
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(20, 10)`` gives input -> 20 -> 10 -> output.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(20, 10)):
        super().__init__()
        layers = []
        in_width = input_dim
        # One Linear + ReLU pair per hidden layer, chained width to width
        for width in hidden_dims:
            layers.append(nn.Linear(in_width, width))
            layers.append(nn.ReLU())
            in_width = width
        # Final projection to one score per class; no activation is applied
        # here, so the network outputs raw scores
        layers.append(nn.Linear(in_width, output_dim))
        # Sequential numbers its children 0, 1, 2, ..., so the default
        # configuration keeps the parameter names net.0, net.2 and net.4
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return one score per class."""
        return self.net(x)

# Seed torch's global RNG before building the model so the randomly
# initialized weights are identical on every run. Later draws from the global
# RNG in this session become repeatable as well.
torch.manual_seed(42)
# One input per feature column, one output per encoded pitch type
model = PitchClassifier(input_dim=X.shape[1], output_dim=len(le.classes_))


In [None]:
# Cross-entropy compares the model's class scores with the true class ids;
# it is the usual objective for multi-class classification
loss_fn = nn.CrossEntropyLoss()

# Adam updates the weights to reduce the loss, extending plain gradient
# descent with momentum and adaptive learning rates
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for X_batch, y_batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        # Forward pass: push the inputs through every layer of the network
        logits = model(X_batch)
        # How far the predictions are from the true labels for this batch
        batch_loss = loss_fn(logits, y_batch)
        # Backward pass: gradient of the loss with respect to each parameter.
        # A positive gradient means raising that weight raises the loss, so it
        # should go down; a negative gradient means the weight should go up.
        batch_loss.backward()
        # Apply the gradients and update the weights
        optimizer.step()
        batch_losses.append(batch_loss.item())
    # Reported value is the sum of the per-batch losses for the epoch
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


In [None]:
# Switch the model to evaluation mode before scoring the test set
model.eval()
all_preds = []
all_targets = []

# Turn off gradient tracking for faster evaluation
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Forward pass through the model
        out = model(X_batch)
        # Predicted class = index of the highest score in each row
        preds = torch.argmax(out, dim=1)
        # Accumulate predictions and true labels as plain Python ints
        all_preds.extend(preds.tolist())
        all_targets.extend(y_batch.tolist())

# Per-class precision / recall / F1. Passing `labels` explicitly keeps every
# encoded class in the report even if it is absent from the test split.
labels = list(range(len(le.classes_)))
print(classification_report(all_targets, all_preds, labels=labels, target_names=target_names))

# Confusion matrix: rows are true pitch types, columns are predicted ones
cm = confusion_matrix(all_targets, all_preds, labels=labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=target_names, yticklabels=target_names, cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
# The class count comes from the data, so the title stays correct when the
# downloaded window contains fewer than seven of the supported pitch types
plt.title(f"Pitch Type Confusion Matrix ({len(target_names)} Pitch Types)")
plt.show()
