## Modeling

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
import pickle
import imblearn as im

In [2]:
infile = open('dummy2.pkl','rb')
df = pickle.load(infile)
infile.close()

This notebook has simplified models for streamlit. To make it easy for people to use I included 5 features (speed, spin rate, location, horizontal movement, and vertical movement). The models are all random forests since random forests have the best accuracy for this data. There will a a streamlit app for each pitch in the data.

# 4 seam fastball

In [3]:
df1 = df.copy()

In [4]:
df1 = df1[df1.pitch_name == '4-Seam Fastball']

In [5]:
z = df1[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [6]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [7]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [8]:
y = z[['description']]

In [9]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [10]:
X_over, y_over = oversample.fit_resample(X, y)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [12]:
rf1 = RandomForestClassifier(n_estimators = 10)
rf1.fit(X_train, y_train)

  rf1.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [13]:
feature_names = ['release_speed', 'release_spin_rate','zone', 'pfx_x', 'pfx_z']

In [14]:
filename = 'model.sav'
pickle.dump(rf1, open(filename, 'wb'))

In [15]:
filename = 'list.pkl'
pickle.dump(feature_names, open(filename, 'wb'))

## Sinker

In [16]:
df2 = df.copy()

In [17]:
df2 = df2[df2.pitch_name == 'Sinker']

In [18]:
z = df2[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [19]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [20]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [21]:
y = z[['description']]

In [22]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [23]:
X_over, y_over = oversample.fit_resample(X, y)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [25]:
rf2 = RandomForestClassifier(n_estimators = 10)
rf2.fit(X_train, y_train)

  rf2.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [26]:
filename = 'model2.sav'
pickle.dump(rf2, open(filename, 'wb'))

## Slider

In [27]:
df3 = df.copy()

In [28]:
df3 = df3[df3.pitch_name == 'Slider']

In [29]:
z = df3[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [30]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [31]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [32]:
y = z[['description']]

In [33]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [34]:
X_over, y_over = oversample.fit_resample(X, y)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [36]:
rf3 = RandomForestClassifier(n_estimators = 10)
rf3.fit(X_train, y_train)

  rf3.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [37]:
filename = 'model3.sav'
pickle.dump(rf3, open(filename, 'wb'))

## Changeup 

In [38]:
df4 = df.copy()

In [39]:
df4 = df4[df4.pitch_name == 'Changeup']

In [40]:
z = df4[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [41]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [42]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [43]:
y = z[['description']]

In [44]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [45]:
X_over, y_over = oversample.fit_resample(X, y)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [47]:
rf4 = RandomForestClassifier(n_estimators = 10)
rf4.fit(X_train, y_train)

  rf4.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [48]:
filename = 'model4.sav'
pickle.dump(rf4, open(filename, 'wb'))

## Curveball 

In [49]:
df5 = df.copy()

In [50]:
df5 = df5[df5.pitch_name == 'Curveball']

In [51]:
z = df5[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [52]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [53]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [54]:
y = z[['description']]

In [55]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [56]:
X_over, y_over = oversample.fit_resample(X, y)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [58]:
rf5 = RandomForestClassifier(n_estimators = 10)
rf5.fit(X_train, y_train)

  rf5.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [59]:
filename = 'model5.sav'
pickle.dump(rf5, open(filename, 'wb'))

## Cutter

In [60]:
df6 = df.copy()

In [61]:
df6 = df6[df6.pitch_name == 'Cutter']

In [62]:
z = df6[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [63]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [64]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [65]:
y = z[['description']]

In [66]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [67]:
X_over, y_over = oversample.fit_resample(X, y)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [69]:
rf6 = RandomForestClassifier(n_estimators = 10)
rf6.fit(X_train, y_train)

  rf6.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [70]:
filename = 'model6.sav'
pickle.dump(rf6, open(filename, 'wb'))

## Knuckle Curve

In [71]:
df7 = df.copy()

In [72]:
df7 = df7[df7.pitch_name == 'Knuckle Curve']

In [73]:
z = df7[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [74]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [75]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [76]:
y = z[['description']]

In [77]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [78]:
X_over, y_over = oversample.fit_resample(X, y)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [80]:
rf7 = RandomForestClassifier(n_estimators = 10)
rf7.fit(X_train, y_train)

  rf7.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [81]:
filename = 'model7.sav'
pickle.dump(rf7, open(filename, 'wb'))

## Split-Finger Fastball

In [82]:
df8 = df.copy()

In [83]:
df8 = df8[df8.pitch_name == 'Split-Finger']

In [84]:
z = df8[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [85]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [86]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [87]:
y = z[['description']]

In [88]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [89]:
X_over, y_over = oversample.fit_resample(X, y)

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [91]:
rf8 = RandomForestClassifier(n_estimators = 10)
rf8.fit(X_train, y_train)

  rf8.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [92]:
filename = 'model8.sav'
pickle.dump(rf8, open(filename, 'wb'))

## Knuckleball

In [93]:
df9 = df.copy()

In [94]:
df9 = df9[df9.pitch_name == 'Knuckleball']

In [95]:
z = df9[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z','description']]

In [96]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [97]:
X = z[['release_speed', 'release_spin_rate', 'zone','pfx_x', 'pfx_z']]

In [98]:
y = z[['description']]

In [99]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [100]:
X_over, y_over = oversample.fit_resample(X, y)

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [102]:
rf9 = RandomForestClassifier(n_estimators = 10)
rf9.fit(X_train, y_train)

  rf9.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [103]:
filename = 'model9.sav'
pickle.dump(rf9, open(filename, 'wb'))