In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

In [None]:
# --- Configuration ---
FILE_PATH = "ohtani_2024_pa_results.csv" # Replace with your actual file path
TARGET_COLUMN = 'target'
TEST_SIZE = 0.2
RANDOM_STATE = 42

# --- Load Data ---
# Raise instead of calling exit(): an exception stops this cell (and "Run All")
# with a traceback, whereas exit() ends the whole interpreter/kernel session.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Error: File not found at {FILE_PATH}") from err
print("Data loaded successfully.")

# --- Define Target Variable ---
# Target: 1 if hit (single, double, triple, home_run) or walk, 0 otherwise.
# Any non-string value in 'events' (e.g. NaN) is labelled 0; matching is case-insensitive.
positive_events = ['single', 'double', 'triple', 'home_run', 'walk']
df[TARGET_COLUMN] = df['events'].apply(
    lambda x: 1 if isinstance(x, str) and x.lower() in positive_events else 0
)

# --- Feature Engineering ---
# 'opponent_team' is the away team when LAD is the home team, otherwise the home team.
# Series.where performs this selection for all rows at once (no per-row Python call).
lad_is_home = df['home_team'] == 'LAD'
df['opponent_team'] = df['away_team'].where(lad_is_home, df['home_team'])

# Select features (Dropping columns not used for training, including the target itself)
# Note: Consider carefully which columns to drop. 'bb_type' might be useful if handled correctly.
features_to_drop = ['events', 'launch_speed', 'launch_angle', 'hit_distance_sc', 'bb_type', 'home_team', 'away_team', TARGET_COLUMN]
X = df.drop(columns=features_to_drop)
y = df[TARGET_COLUMN]

print(f"Initial features: {X.columns.tolist()}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

# --- Handle Categorical Features ---
# One-Hot Encode nominal features
categorical_cols_ohe = ['pitch_name', 'opponent_team']
X = pd.get_dummies(X, columns=categorical_cols_ohe, dummy_na=False, dtype=int) # dummy_na=False: no extra indicator column for NaN

# Label Encode 'pitcher' (assuming high cardinality makes OHE less suitable)
# NOTE(review): the resulting integer codes are later picked up as a numerical
# feature and scaled, which imposes an arbitrary ordering on pitchers - confirm
# this is intended.
if 'pitcher' in X.columns:
    le = LabelEncoder()
    # Cast to str first so mixed types / NaNs become ordinary string categories.
    X['pitcher'] = le.fit_transform(X['pitcher'].astype(str))
    print("Label encoded 'pitcher'.")
else:
    print("Warning: 'pitcher' column not found for label encoding.")


# --- Identify Numerical Features (After OHE) ---
# The one-hot columns were created with dtype=int, so they are included here.
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
print(f"Numerical columns for imputation/scaling: {numerical_cols}")

# --- Handle Missing Numerical Values (Imputation with Mean) ---
# NOTE(review): the means are computed over ALL rows, i.e. before the
# train/test split, so statistics of rows that later land in the test set
# influence the training features. Consider fitting the imputation on the
# training partition only.
for col in numerical_cols:
    if X[col].isnull().any():
        mean_val = X[col].mean()
        X[col] = X[col].fillna(mean_val)
        print(f"Imputed NaNs in '{col}' with mean ({mean_val:.2f}).")

# Verify no remaining NaNs (non-numerical columns are not imputed above, and a
# column that is entirely NaN has a NaN mean, so either can still trigger this)
if X.isnull().sum().sum() > 0:
    print("Warning: Remaining NaNs detected after imputation!")
    print(X.isnull().sum())
else:
    print("No NaNs remaining in features.")

In [None]:
# --- Train/Test Split ---
# Stratifying on y keeps the class proportions of the two partitions close to
# those of the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# --- Scale Numerical Features ---
# The scaler learns its statistics from the training partition only; the very
# same fitted transformation is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("Numerical features scaled.")

In [None]:
# --- Reproducibility ---
# Seed the Python, NumPy and backend RNGs before any layer is created so that
# weight initialisation, dropout masks and batch shuffling repeat when the
# notebook is re-run. (Some GPU kernels may still be non-deterministic.)
keras.utils.set_random_seed(RANDOM_STATE)

# --- Define Model Architecture ---
# Incorporating Batch Normalization and L2 Regularization
INPUT_DIM = X_train.shape[1] # One input unit per feature column
L2_REG = 0.001 # Regularization factor
DROPOUT_RATE = 0.5 # Fraction of units dropped after each hidden block

model = models.Sequential([
    # Declare the input explicitly rather than via `input_shape=` on the first
    # Dense layer, which newer Keras releases warn about.
    keras.Input(shape=(INPUT_DIM,)),

    # Hidden block 1: Dense -> BatchNorm -> ReLU -> Dropout
    layers.Dense(64, kernel_regularizer=l2(L2_REG)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(DROPOUT_RATE), # Dropout helps prevent overfitting

    # Hidden block 2: same layout with half the units
    layers.Dense(32, kernel_regularizer=l2(L2_REG)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(DROPOUT_RATE),

    layers.Dense(1, activation='sigmoid') # Sigmoid for binary classification output
])

# --- Compile Model ---
# The metric order here (accuracy, auc) determines the order of the values
# returned by model.evaluate after the loss.
model.compile(optimizer='adam', # Adam is a common default choice
              loss='binary_crossentropy', # Standard loss for binary classification
              metrics=['accuracy', keras.metrics.AUC(name='auc')]) # Include AUC metric

model.summary() # Print model architecture

In [None]:
# --- Training Configuration ---
EPOCHS = 100
BATCH_SIZE = 32
EARLY_STOPPING_PATIENCE = 15 # Increased patience
LR_REDUCTION_PATIENCE = 5
VALIDATION_SIZE = 0.2 # Fraction of the training partition held out for validation

# --- Validation Split ---
# Early stopping, best-weight restoration and LR scheduling all react to the
# validation loss. Monitoring the test set for that would tune the model on the
# very data used for the final report, so a separate validation partition is
# carved out of the training data and the test set stays untouched until
# evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=VALIDATION_SIZE,
                                              random_state=RANDOM_STATE,
                                              stratify=y_train)
print(f"Fit set shape: {X_fit.shape}")
print(f"Validation set shape: {X_val.shape}")

# --- Callbacks ---
# Early Stopping: Stop training if validation loss doesn't improve
early_stopping = EarlyStopping(monitor='val_loss',
                             patience=EARLY_STOPPING_PATIENCE,
                             restore_best_weights=True, # Keep the best model weights found
                             verbose=1)

# Reduce Learning Rate on Plateau: Reduce LR if validation loss stagnates
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.2, # Reduce LR by a factor of 5
                              patience=LR_REDUCTION_PATIENCE,
                              min_lr=0.00001, # Minimum learning rate
                              verbose=1)

# --- Class Weights (Handle Imbalance) ---
# Calculate weights to give more importance to the minority class during training.
# Weights are computed on the rows actually used for fitting and keyed by the
# real class labels (not by their position in the array).
class_labels = np.unique(y_fit)
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=class_labels,
                                                  y=y_fit)
class_weight_dict = {int(label): float(weight)
                     for label, weight in zip(class_labels, class_weights)}
print(f"Calculated class weights: {class_weight_dict}")


# --- Train the Model ---
history = model.fit(X_fit, y_fit,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(X_val, y_val), # Held-out slice of the training data
                    callbacks=[early_stopping, reduce_lr],
                    class_weight=class_weight_dict, # Apply class weights
                    verbose=1) # Set verbose=1 or 2 to see progress

In [None]:
# --- Evaluate on Test Set ---
# evaluate() returns the loss followed by the compiled metrics in order
# (accuracy, auc).
loss, accuracy, auc = model.evaluate(X_test, y_test, verbose=0)
print("\n--- Test Set Evaluation ---")
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC: {auc:.4f}")

# --- Predictions ---
DECISION_THRESHOLD = 0.5 # Probability above which a sample is labelled positive
y_pred_prob = model.predict(X_test) # Get probabilities
y_pred_binary = (y_pred_prob > DECISION_THRESHOLD).astype(int) # Convert probabilities to binary predictions

# --- Classification Report & Confusion Matrix ---
print("\n--- Classification Report (Threshold=0.5) ---")
print(classification_report(y_test, y_pred_binary))

print("\n--- Confusion Matrix (Threshold=0.5) ---")
print(confusion_matrix(y_test, y_pred_binary))

# --- ROC Curve ---
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# The Keras AUC metric is a thresholded approximation; the curve drawn below is
# exact, so its legend uses the exact area computed from the same scores.
# roc_auc_score is undefined when only one class is present, hence the guard.
if len(np.unique(y_test)) > 1:
    roc_auc = roc_auc_score(y_test, np.ravel(y_pred_prob))
else:
    roc_auc = float('nan')
print(f"Test ROC AUC (exact): {roc_auc:.4f}")

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ANN (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Chance') # Diagonal line
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend()
ax.grid(True)
plt.show()

# --- Plot Training History ---
def plot_training_history(history):
    """Plot per-epoch training curves from a Keras-style history object.

    Two panels are drawn side by side: the loss on the left and one metric on
    the right. The metric is ``'auc'`` when that key is recorded, otherwise
    ``'accuracy'``. A validation curve is added to a panel only when the
    matching ``val_<name>`` key exists, so histories produced without
    validation data are plotted too, and the train and validation curves in a
    panel always refer to the same metric.

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` attribute that maps metric names to
        per-epoch sequences (e.g. the return value of ``model.fit``).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If ``'loss'`` is missing, or if neither ``'auc'`` nor ``'accuracy'``
        is recorded.
    """
    records = history.history
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))

    # Plot Loss
    ax[0].plot(records['loss'], label='Train Loss')
    if 'val_loss' in records:
        ax[0].plot(records['val_loss'], label='Validation Loss')
    ax[0].set_title('Model Loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')
    ax[0].legend(loc='upper right')
    ax[0].grid(True)

    # Plot Accuracy / AUC
    metric_to_plot = 'auc' if 'auc' in records else 'accuracy'
    # Derive the validation key from the chosen metric so both curves match.
    val_metric_to_plot = f'val_{metric_to_plot}'
    metric_label = metric_to_plot.capitalize()

    ax[1].plot(records[metric_to_plot], label=f'Train {metric_label}')
    if val_metric_to_plot in records:
        ax[1].plot(records[val_metric_to_plot], label=f'Validation {metric_label}')
    ax[1].set_title(f'Model {metric_label}')
    ax[1].set_ylabel(metric_label)
    ax[1].set_xlabel('Epoch')
    ax[1].legend(loc='lower right')
    ax[1].grid(True)

    plt.tight_layout()
    plt.show()

# Draw the loss and metric curves recorded in `history` by model.fit.
print("\n--- Training History ---")
plot_training_history(history)