In [None]:
# Mount Google Drive so that the paths under /content/drive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read File

In [None]:
import pandas as pd
import glob
import os
import numpy as np

# Full NCAA dataset, read from the mounted Drive folder
ncaa_dataset_path = "/content/drive/MyDrive/USC Baseball/Datasets/22_23_NCAA_Dataset.csv"
df = pd.read_csv(ncaa_dataset_path)

  df = pd.read_csv("/content/drive/MyDrive/USC Baseball/Datasets/22_23_NCAA_Dataset.csv")


# Data Processing



In [None]:
# Work on a copy so the frame loaded from disk stays untouched
df2 = df.copy()

# Team codes whose home games are kept
validTeam = [
    'SOU_TRO', 'UCLA', 'IOW_HAW', 'PEN_NIT', 'ARI_WIL',
    'VAN_COM', 'ARK_RAZ', 'WAK_DEA', 'DUK_BLU', 'WM_TRI',
    'MIN_GOL', 'TUL_GRE', 'TCU_HFG', 'TEN_VOL', 'LSU_TIG',
]

# Keep only the rows whose HomeTeam is one of the codes above
is_valid_home_team = df2['HomeTeam'].isin(validTeam)
filtered_df = df2.loc[is_valid_home_team]

In [None]:
# Drop pitches whose tag is unusable: undefined/other tags, multi-tag entries
# (containing a comma), knuckleballs and one-seam fastballs.
# na=True treats a missing tag as "excluded", which is what the previous
# `.str.contains(...) == False` comparison did implicitly.
excluded_tags = "Undefined|Other|,|Knuckleball|OneSeamFastBall"
has_excluded_tag = filtered_df['TaggedPitchType'].str.contains(excluded_tags, na=True)
cleandf = filtered_df[~has_excluded_tag]

# Keep D1 pitches only
cleanD1 = cleandf[cleandf['Level'] == "D1"]

# Require the four core measurements. The explicit .copy() makes df3 an
# independent frame, so the .loc assignments below (and later column
# additions on df3) no longer write into a slice of filtered_df and no longer
# raise SettingWithCopyWarning.
df3 = cleanD1.dropna(subset=['RelSpeed', 'SpinRate', 'InducedVertBreak', 'HorzBreak']).copy()

# changes some tags to fit our definitions
# Order matters: the spin-rate rule runs last, so it overrides the two
# break-based rules for any pitch that matches both.
df3.loc[df3['InducedVertBreak'] < -10, 'TaggedPitchType'] = "Curveball"
df3.loc[df3['InducedVertBreak'] > 19, 'TaggedPitchType'] = "Fastball"
df3.loc[df3['SpinRate'] < 1400, 'TaggedPitchType'] = "ChangeUp"

In [None]:
# Collapse TaggedPitchType into integer pitch buckets:
#   0 = fastball  (sinkers, 4 seam, 2 seam, 1 seam)
#   1 = slider    (cutters count as sliders)
#   2 = changeup  (splitters count as changeups)
#   3 = curveball
#   4 = anything that matches none of the patterns

# One regex per bucket; the position in this list is the bucket code
bucket_patterns = [
    'Fastball|FourSeamFastBall|TwoSeamFastBall|OneSeamFastBall|Sinker',  # fastballs
    'Slider|Cutter',  # sliders
    'ChangeUp|Splitter',  # changeups
    'Curveball',  # curveballs
]

# Boolean mask per bucket, in the same order as the patterns
conditions = [df3['TaggedPitchType'].str.contains(pattern) for pattern in bucket_patterns]

# Bucket codes matching the masks one-to-one
values = list(range(len(bucket_patterns)))

# np.select takes the first matching mask for each row and falls back to 4
df3['pitchType'] = np.select(conditions, values, default=4)

## Identification of Important Variables

In [None]:
# Split the cleaned pitches by the pitcher's throwing hand
cleanRHP, cleanLHP = (df3[df3['PitcherThrows'] == hand] for hand in ("Right", "Left"))

In [None]:
# Right-handed pitches restricted to the tag plus the candidate predictors,
# with any row missing one of them removed.
df4 = cleanRHP[['TaggedPitchType', 'RelSpeed', 'VertRelAngle', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 'VertBreak', 'InducedVertBreak', 'HorzBreak', 'VertApprAngle', 'HorzApprAngle']].dropna()

# TaggedPitchType is a string column, so it has to be excluded explicitly:
# without numeric_only=True pandas 1.5 emits a FutureWarning and pandas >= 2.0
# raises when it tries to convert the tags to float.
df4.corr(numeric_only=True)

  df4.corr()


Unnamed: 0,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,VertBreak,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle
RelSpeed,1.0,-0.6149,-0.332697,0.077472,0.627208,0.887316,0.754476,0.626095,0.697987,0.413768
VertRelAngle,-0.6149,1.0,0.280765,0.053174,-0.444517,-0.690127,-0.661045,-0.421064,-0.093105,-0.230352
HorzRelAngle,-0.332697,0.280765,1.0,0.178088,-0.458409,-0.288598,-0.234566,-0.500811,-0.153384,0.284044
SpinRate,0.077472,0.053174,0.178088,1.0,-0.419533,-0.054818,-0.09303,-0.389104,-0.018405,-0.281399
SpinAxis,0.627208,-0.444517,-0.458409,-0.419533,1.0,0.700395,0.660691,0.896072,0.576124,0.608371
VertBreak,0.887316,-0.690127,-0.288598,-0.054818,0.700395,1.0,0.968041,0.640678,0.784459,0.467106
InducedVertBreak,0.754476,-0.661045,-0.234566,-0.09303,0.660691,0.968041,1.0,0.578626,0.764816,0.44368
HorzBreak,0.626095,-0.421064,-0.500811,-0.389104,0.896072,0.640678,0.578626,1.0,0.514411,0.687596
VertApprAngle,0.697987,-0.093105,-0.153384,-0.018405,0.576124,0.784459,0.764816,0.514411,1.0,0.440901
HorzApprAngle,0.413768,-0.230352,0.284044,-0.281399,0.608371,0.467106,0.44368,0.687596,0.440901,1.0


In [None]:
# Correlation matrix of the numeric predictors. numeric_only=True skips the
# string TaggedPitchType column, which otherwise triggers a FutureWarning on
# pandas 1.5 and an error on pandas >= 2.0.
df6 = df4.corr(numeric_only=True)

# to_csv returns None when it is given a path, so its result is not kept.
# Row labels are omitted (index=False); the rows follow the same order as the
# column headers because the matrix is square.
df6.to_csv("/content/drive/Shareddrives/USC Baseball/ML Stuff/PitchTypeCorr*.csv", index=False)

  df6 = df4.corr()


Determines the mean, standard deviation, median, 25th percentile, and 75th percentile of each variable, grouped by pitch type

In [None]:
# Left-handed pitches: pitch bucket plus the candidate predictors, complete rows only
lhp_stat_columns = [
    'pitchType',
    'RelSpeed', 'VertRelAngle', 'HorzRelAngle',
    'SpinRate', 'SpinAxis',
    'VertBreak', 'InducedVertBreak', 'HorzBreak',
    'VertApprAngle', 'HorzApprAngle',
]
df5 = cleanLHP.loc[:, lhp_stat_columns].dropna()

In [None]:
def compute_statistics_by_target(df5, target_column, predictor_columns):
    """Summarise every predictor column within each group of ``target_column``.

    Parameters
    ----------
    df5 : pandas.DataFrame
        Frame containing ``target_column`` and all of ``predictor_columns``.
    target_column : str
        Column whose distinct values define the groups.
    predictor_columns : list of str
        Numeric columns to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by the group value in groupby order. The
        columns are named ``<Stat>_<predictor>`` and are laid out statistic by
        statistic: all ``Mean_*`` columns, then ``Std_Dev_*``, ``Median_*``,
        ``Percentile_25_*`` and ``Percentile_75_*``. An input with no groups
        yields an empty frame that still carries these columns.

    Raises
    ------
    ValueError
        If any predictor column is not numeric.
    """
    predictor_columns = list(predictor_columns)
    stat_names = ['Mean', 'Std_Dev', 'Median', 'Percentile_25', 'Percentile_75']
    column_names = [f"{stat}_{col}" for stat in stat_names for col in predictor_columns]

    # A non-numeric predictor used to be dropped silently by numeric_only=True,
    # which left each row shorter than column_names and only failed later with
    # an opaque shape error. Reject it up front with a message naming the column.
    non_numeric = [col for col in predictor_columns
                   if not pd.api.types.is_numeric_dtype(df5[col])]
    if non_numeric:
        raise ValueError(f"predictor columns must be numeric, got non-numeric: {non_numeric}")

    labels = []
    rows = []
    for target_value, group in df5.groupby(target_column):
        values = group[predictor_columns]
        # Same order as stat_names; each entry is a Series ordered like predictor_columns
        per_stat = [
            values.mean(),
            values.std(),
            values.median(),
            values.quantile(0.25),
            values.quantile(0.75),
        ]
        rows.append([value for stat in per_stat for value in stat])
        # Record the label next to its row so index and data cannot drift apart
        labels.append(target_value)

    return pd.DataFrame(rows, index=labels, columns=column_names)

# Summarise the pitch-metric columns of df5 for each pitchType code
target_column_name = 'pitchType'
predictor_columns_list = [
    'RelSpeed', 'VertRelAngle', 'HorzRelAngle',
    'SpinRate', 'SpinAxis',
    'VertBreak', 'InducedVertBreak', 'HorzBreak',
    'VertApprAngle', 'HorzApprAngle',
]
statistics_table = compute_statistics_by_target(
    df5=df5,
    target_column=target_column_name,
    predictor_columns=predictor_columns_list,
)

print(statistics_table)


Length of column names: 50
Empty DataFrame
Columns: [Mean_RelSpeed, Mean_VertRelAngle, Mean_HorzRelAngle, Mean_SpinRate, Mean_SpinAxis, Mean_VertBreak, Mean_InducedVertBreak, Mean_HorzBreak, Mean_VertApprAngle, Mean_HorzApprAngle, Std_Dev_RelSpeed, Std_Dev_VertRelAngle, Std_Dev_HorzRelAngle, Std_Dev_SpinRate, Std_Dev_SpinAxis, Std_Dev_VertBreak, Std_Dev_InducedVertBreak, Std_Dev_HorzBreak, Std_Dev_VertApprAngle, Std_Dev_HorzApprAngle, Median_RelSpeed, Median_VertRelAngle, Median_HorzRelAngle, Median_SpinRate, Median_SpinAxis, Median_VertBreak, Median_InducedVertBreak, Median_HorzBreak, Median_VertApprAngle, Median_HorzApprAngle, Percentile_25_RelSpeed, Percentile_25_VertRelAngle, Percentile_25_HorzRelAngle, Percentile_25_SpinRate, Percentile_25_SpinAxis, Percentile_25_VertBreak, Percentile_25_InducedVertBreak, Percentile_25_HorzBreak, Percentile_25_VertApprAngle, Percentile_25_HorzApprAngle, Percentile_75_RelSpeed, Percentile_75_VertRelAngle, Percentile_75_HorzRelAngle, Percentile_75_S

In [None]:
# The index of statistics_table holds the pitchType code of each row, so it is
# written out (under a 'pitchType' header); with index=False the exported rows
# could not be matched back to a pitch type. to_csv returns None when given a
# path, so its result is not kept.
statistics_table.to_csv(
    "/content/drive/Shareddrives/USC Baseball/ML Stuff/PitchTypeClassLHPANOVA*.csv",
    index=True,
    index_label='pitchType',
)

#  Model Creation

In [None]:
# Split the cleaned pitches by the pitcher's throwing hand
cleanRHP = df3[df3['PitcherThrows'] == "Right"]
cleanLHP = df3[df3['PitcherThrows'] == "Left"]

# Model on right-handed pitches. Take an independent copy: the following cell
# adds the isFB/isSL/isCH/isCU columns to df, and doing that on a bare slice of
# df3 raises SettingWithCopyWarning and also alters cleanRHP as a side effect.
df = cleanRHP.copy()

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.model_selection import train_test_split
# pyplot is the module conventionally aliased as plt; the previous
# `import matplotlib as plt` bound the top-level package instead, so calls
# such as plt.plot would have failed.
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# One-vs-rest indicator columns, one per pitch family:
#   sinkers, 4 seam, 2 seam = fastball
#   cutters = sliders
#   splitter = changeup
# Each flag is 1 when TaggedPitchType is exactly one of the listed tags and 0
# otherwise (including for tags that appear in none of the lists).
pitch_family_tags = {
    'isFB': ['Fastball', 'Sinker', 'FourSeamFastBall', 'TwoSeamFastBall'],
    'isSL': ['Slider', 'Cutter'],
    'isCH': ['ChangeUp', 'Splitter'],
    'isCU': ['Curveball'],
}

for flag_column, family_tags in pitch_family_tags.items():
    df[flag_column] = df['TaggedPitchType'].isin(family_tags).astype(int)

In [None]:
# Class balance of each one-vs-rest pitch flag
for flag_column in ('isFB', 'isCH', 'isSL', 'isCU'):
    print(df[flag_column].value_counts())

1    7761
0    5300
Name: isFB, dtype: int64
0    11480
1     1581
Name: isCH, dtype: int64
0    10529
1     2532
Name: isSL, dtype: int64
0    11874
1     1187
Name: isCU, dtype: int64


# Old ML Model Code

In [None]:
# Split on the fastball flag: non-fastballs (0) versus fastballs (1)
fb_flag = df['isFB']
minority_class = df.loc[fb_flag.eq(0)]
majority_class = df.loc[fb_flag.eq(1)]

n_majority = len(majority_class)
n_minority = len(minority_class)

print(f"majority class is: {n_majority}")
print(f"minority class is: {n_minority}")

print(f"total is: {n_majority + n_minority}")

majority class is: 7761
minority class is: 5300
total is: 13061


In [None]:
# Columns fed to the model. The same list drives the NaN filter and the
# feature matrix; previously dropna checked 'HorzRelAngle' while the matrix
# used 'HorzApprAngle', so missing HorzApprAngle values could reach the model.
feature_columns = ['RelSpeed', 'SpinRate', 'SpinAxis', 'InducedVertBreak', 'HorzBreak', 'VertRelAngle', 'HorzApprAngle']
newDF = df.dropna(subset=feature_columns)

# Shuffle the data to avoid order effects.
# This used to sit in its own cell ahead of the definition of balanced_df,
# which is why it did not work on a fresh kernel (NameError). sample() returns
# a new frame, so newDF itself is left in its original order.
balanced_df = newDF.sample(frac=1, random_state=42)

rel_data = balanced_df[feature_columns].to_numpy()
classification = balanced_df['isFB'].to_numpy()

In [None]:
# Class sizes of the fastball flag within balanced_df
balanced_fb_flag = balanced_df['isFB']
minority = balanced_df.loc[balanced_fb_flag.eq(0)]
majority = balanced_df.loc[balanced_fb_flag.eq(1)]

majority_count = len(majority)
minority_count = len(minority)

print(f"majority class is: {majority_count}")
print(f"minority class is: {minority_count}")

print(f"total is: {majority_count + minority_count}")

majority class is: 7761
minority class is: 5300
total is: 13061


In [None]:
import numpy as np

# 70% of the rows go to training; the remainder is the test set
splitNum = (len(majority)+len(minority))*.7
print(splitNum)

# Row position at which the data is cut (splitNum is a float)
split_index = int(splitNum)

# return_inverse re-encodes the labels as positions in the sorted unique
# values, i.e. 0..k-1 class codes
_, encoded_labels = np.unique(classification, return_inverse=True)
y_train = encoded_labels[:split_index]
y_test = encoded_labels[split_index:]

# The features are split at the same position. The earlier
# `x = np.unique(classification, ...)` line was dropped: it re-encoded the
# labels a second time into a variable that was never read.
x_train = rel_data[:split_index]
x_test = rel_data[split_index:]

9142.699999999999


# New ML Model Code

In [None]:
# Split the cleaned pitches by throwing hand; the model below uses the right-handers
throwing_hand = df3['PitcherThrows']
cleanRHP = df3.loc[throwing_hand.eq("Right")]
cleanLHP = df3.loc[throwing_hand.eq("Left")]

df5 = cleanRHP

In [None]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# df5 must carry the integer 'pitchType' column
# (0 = fastball, 1 = slider, 2 = changeup, 3 = curveball).

# Resampling factor per class: the resampled class holds
# int(len(class) * factor) rows.
desired_ratios = {
    0: 1/3,    # Fastball: undersample to a third
    1: 3/5,    # Slider: undersample to three fifths
    2: 1,      # ChangeUp: keep the class size unchanged
    3: 1.5     # Curveball: oversample to 1.5x
}

# NOTE(review): classes are resampled BEFORE the train/test split. Because the
# curveball class is drawn with replacement, copies of one pitch can land in
# both the training and the test rows, which makes the test scores optimistic.
# Consider splitting first and resampling only the training rows.

# Resample every class to its target size and attach its example weight
undersampled_classes = []

for class_label, desired_ratio in desired_ratios.items():
    class_df = df5[df5['pitchType'] == class_label]
    target_size = len(class_df) * desired_ratio

    # Drawing more rows than the class holds is only possible with
    # replacement; factors up to 1 are plain undersampling without it.
    resampled_class = resample(class_df,
                               replace=desired_ratio > 1,
                               n_samples=int(target_size),
                               random_state=42)
    print(str(class_label) + " " + str(target_size))

    # Example weight is the inverse of the resampling factor. assign() returns
    # a new frame, so nothing is written into a slice of df5.
    undersampled_classes.append(resampled_class.assign(example_weight=1 / desired_ratio))

# Concatenate the resampled classes
balanced_df = pd.concat(undersampled_classes)

# Shuffle the data to avoid order effects
balanced_df = balanced_df.sample(frac=1, random_state=42)

# Split the dataset into features and target
feature_columns = ['RelSpeed', 'SpinRate', 'InducedVertBreak', 'HorzBreak', 'SpinAxis', 'VertRelAngle', 'HorzApprAngle']
rel_data = balanced_df[feature_columns].to_numpy()
classification = balanced_df['pitchType'].to_numpy()

# Determine the train-test split: the first 75% of the shuffled rows train
sampledTotal = len(balanced_df)
trainTotal = int(0.75 * sampledTotal)
print("Train Total: " + str(trainTotal))

# Split data into training and testing sets
x_train, x_test = rel_data[:trainTotal], rel_data[trainTotal:]
y_train, y_test = classification[:trainTotal], classification[trainTotal:]

# .iloc makes the positional cut explicit; the index of balanced_df is the
# shuffled original row labels, not 0..n-1.
example_weights = balanced_df['example_weight']
sample_weights_train = example_weights.iloc[:trainTotal]
sample_weights_test = example_weights.iloc[trainTotal:]

0 2587.0
1 1519.2
2 1581
3 1780.5
Train Total: 5600


# Actual Model

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import preprocessing
import seaborn as sns
from sklearn import metrics

# Display names, kept in step with `classifiers`: only the random forest is
# active, so it is the only name left enabled. Previously names[0] was
# "Nearest Neighbors" while classifiers[0] was the random forest.
names = [
    #"Nearest Neighbors",
    #"Decision Tree",
    "Random Forest",
]

# Candidate models; the commented-out entries are earlier experiments.
# random_state pins the forest's bootstrap and feature sampling so that
# re-running the notebook reproduces the same scores.
classifiers = [
    #KNeighborsClassifier(4),
    #DecisionTreeClassifier(max_depth=20),
    #DecisionTreeClassifier(max_depth=5),
    #RandomForestClassifier(max_depth=25, n_estimators=40, max_features=1),
    RandomForestClassifier(max_depth=None, n_estimators=50, random_state=42)
]

In [None]:
# Fit each candidate behind a StandardScaler and record its test accuracy.
# After the loop, clf and y_pred refer to the last classifier in the list.
accuracies = []
for base_model in classifiers:
    clf = make_pipeline(StandardScaler(), base_model).fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    accuracies.append(test_accuracy)

In [None]:
from sklearn.metrics import classification_report
target_names = ['fastball', 'slider', 'changeup', 'curveball']
# Pin the label codes that the names refer to (0..3, in pitchType order).
# Without `labels`, the names are matched against whatever classes happen to
# occur in y_test/y_pred, and a class missing from the test slice makes
# classification_report fail on the count mismatch.
print(classification_report(y_test, y_pred, labels=[0, 1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

    fastball       0.97      0.97      0.97       616
      slider       0.95      0.88      0.92       399
    changeup       0.94      0.94      0.94       409
   curveball       0.92      0.99      0.96       443

    accuracy                           0.95      1867
   macro avg       0.95      0.94      0.94      1867
weighted avg       0.95      0.95      0.95      1867



In [None]:
# Test-set accuracy of every classifier fitted in the loop, in list order
print(accuracies)

[0.980862464914519]


In [None]:
# Same print as the previous cell; the recorded outputs of the two cells differ,
# so they presumably come from separate training runs. TODO confirm and keep one.
print(accuracies)

[0.9480449919657205]


In [None]:
from sklearn.tree import DecisionTreeClassifier
import joblib
from joblib import dump

import os
# Switch into the shared models folder so the file name below resolves there
model_dir = '/content/drive/Shareddrives/USC Baseball/SabermetricModels'
os.chdir(model_dir)

# Persist the fitted pipeline; the call's return value is what the cell displays
dump(clf, 'UpdatedRHPPitchModel.joblib')

['LHP_CU_PitchModel.joblib']