## Modeling

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
import pickle
import imblearn as im

In [2]:
infile = open('dummy2.pkl','rb')
df = pickle.load(infile)
infile.close()

Here are the gradient boosting and logistic regression models for all pitches. They are less accurate than the random forest model for all pitches.

# 4 seam fastball

In [3]:
df1 = df.copy()

In [4]:
df1 = df1[df1.pitch_name == '4-Seam Fastball']

In [5]:
z = df1[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [6]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [7]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [8]:
y = z[['description']]

In [10]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [11]:
X_over, y_over = oversample.fit_resample(X, y)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
gb1 = GradientBoostingClassifier(n_estimators = 10)
gb1.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [15]:
lr1 = LogisticRegression()
lr1.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [16]:
gb1.score(X_test, y_test)

0.6214187515867428

In [17]:
lr1.score(X_test, y_test)

0.6244365566365365

## Sinker

In [18]:
df2 = df.copy()

In [19]:
df2 = df2[df2.pitch_name == 'Sinker']

In [20]:
z = df2[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [21]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [22]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [23]:
y = z[['description']]

In [24]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [25]:
X_over, y_over = oversample.fit_resample(X, y)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [27]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
gb2 = GradientBoostingClassifier(n_estimators = 10)
gb2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [29]:
lr2 = LogisticRegression()
lr2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [30]:
gb2.score(X_test, y_test)

0.6257682077724017

In [31]:
lr2.score(X_test, y_test)

0.5992519903049229

## Slider

In [32]:
df3 = df.copy()

In [33]:
df3 = df3[df3.pitch_name == 'Slider']

In [34]:
z = df3[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [35]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [36]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [37]:
y = z[['description']]

In [38]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [39]:
X_over, y_over = oversample.fit_resample(X, y)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [41]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [42]:
gb3 = GradientBoostingClassifier(n_estimators = 10)
gb3.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [43]:
lr3 = LogisticRegression()
lr3.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [44]:
gb3.score(X_test, y_test)

0.6977524116147757

In [45]:
lr3.score(X_test, y_test)

0.6929352796985329

## Changeup 

In [46]:
df4 = df.copy()

In [47]:
df4 = df4[df4.pitch_name == 'Changeup']

In [48]:
z = df4[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [49]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [50]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [51]:
y = z[['description']]

In [52]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [53]:
X_over, y_over = oversample.fit_resample(X, y)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [55]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [56]:
gb4 = GradientBoostingClassifier(n_estimators = 10)
gb4.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [57]:
lr4 = LogisticRegression()
lr4.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [58]:
gb4.score(X_test, y_test)

0.62372545784524

In [59]:
lr4.score(X_test, y_test)

0.6345982511136776

## Curveball 

In [60]:
df5 = df.copy()

In [61]:
df5 = df5[df5.pitch_name == 'Curveball']

In [62]:
z = df5[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [63]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [64]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [65]:
y = z[['description']]

In [66]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [67]:
X_over, y_over = oversample.fit_resample(X, y)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [69]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [70]:
gb5 = GradientBoostingClassifier(n_estimators = 10)
gb5.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [71]:
lr5 = LogisticRegression()
lr5.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [72]:
gb5.score(X_test, y_test)

0.7092370147925704

In [73]:
lr5.score(X_test, y_test)

0.7066510955399844

## Cutter

In [74]:
df6 = df.copy()

In [75]:
df6 = df6[df6.pitch_name == 'Cutter']

In [76]:
z = df6[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [77]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [78]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [79]:
y = z[['description']]

In [80]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [81]:
X_over, y_over = oversample.fit_resample(X, y)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [83]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [84]:
gb6 = GradientBoostingClassifier(n_estimators = 10)
gb6.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [85]:
lr6 = LogisticRegression()
lr6.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [86]:
gb6.score(X_test, y_test)

0.6506138635375923

In [87]:
lr6.score(X_test, y_test)

0.6313016079965232

## Knuckle Curve

In [88]:
df7 = df.copy()

In [89]:
df7 = df7[df7.pitch_name == 'Knuckle Curve']

In [90]:
z = df7[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [91]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [92]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [93]:
y = z[['description']]

In [94]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [95]:
X_over, y_over = oversample.fit_resample(X, y)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [97]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [98]:
gb7 = GradientBoostingClassifier(n_estimators = 10)
gb7.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [99]:
lr7 = LogisticRegression()
lr7.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [100]:
gb7.score(X_test, y_test)

0.735921972764078

In [101]:
lr7.score(X_test, y_test)

0.7343577475156422

## Split-Finger Fastball

In [102]:
df8 = df.copy()

In [103]:
df8 = df8[df8.pitch_name == 'Split-Finger']

In [104]:
z = df8[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [105]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [106]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [107]:
y = z[['description']]

In [108]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [109]:
X_over, y_over = oversample.fit_resample(X, y)

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [111]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [112]:
gb8 = GradientBoostingClassifier(n_estimators = 10)
gb8.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [113]:
lr8 = LogisticRegression()
lr8.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [114]:
gb8.score(X_test, y_test)

0.6669545903257651

In [115]:
lr8.score(X_test, y_test)

0.680898321816387

## Knuckleball

In [116]:
df9 = df.copy()

In [117]:
df9 = df9[df9.pitch_name == 'Knuckleball']

In [118]:
z = df9[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone','pfx_x', 'pfx_z','description']]

In [119]:
z.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.dropna(inplace = True)


In [120]:
X = z[['spin_axis', 'release_speed', 'release_spin_rate', 'az', 'ay', 'ax', 'vz0', 'vy0', 'vx0', 'zone', 'pfx_x', 'pfx_z']]

In [121]:
y = z[['description']]

In [122]:
oversample = im.over_sampling.RandomOverSampler(sampling_strategy='minority')

In [123]:
X_over, y_over = oversample.fit_resample(X, y)

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, stratify =y_over, random_state =5)

In [125]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [126]:
gb9 = GradientBoostingClassifier(n_estimators = 10)
gb9.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(n_estimators=10)

In [127]:
lr9 = LogisticRegression()
lr9.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [128]:
gb9.score(X_test, y_test)

0.8793103448275862

In [129]:
lr9.score(X_test, y_test)

0.8103448275862069

See the README for conclusions and analysis