This notebook is a learning experiment in building a multiple linear regression model by hand, without auto-ML tools or similar programs.

Reference:
https://towardsdatascience.com/multiple-linear-regression-model-using-python-machine-learning-d00c78f1172a

Importing Libraries

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd

# Read the pitch-level CSV export into a DataFrame and preview its first rows.
csv_path = "Pitch Metrics and Outcomes (final) - 2020 - savant_data (Fixed Data).csv"
pitching = pd.read_csv(csv_path)
pitching.head()

Unnamed: 0,player_number,player_name,pitch_type,release_speed,release_spin_rate,zone,event,description
0,656061,"Abreu, Albert",CH,86.1,1950,7,single,hit_into_play
1,656061,"Abreu, Albert",FF,96.6,1907,5,field_out,hit_into_play
2,656061,"Abreu, Albert",FF,97.5,2262,4,home_run,hit_into_play
3,656061,"Abreu, Albert",FF,96.1,1974,5,single,hit_into_play
4,656061,"Abreu, Albert",FF,95.6,1901,1,strikeout,swinging_strike


Check for Null Values

In [None]:
# Column dtypes and non-null counts; a count below the row total means missing values.
# info() prints its report itself and returns None, so it is called bare instead of
# being wrapped in print(), which would emit a stray "None" line after the report.
pitching.info()

# Summary statistics of the numeric columns, to eyeball implausible min/max values.
# Left as the last expression so the notebook renders it as a table.
pitching.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39922 entries, 0 to 39921
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   player_number      39922 non-null  int64  
 1   player_name        39922 non-null  object 
 2   pitch_type         39922 non-null  object 
 3   release_speed      39922 non-null  float64
 4   release_spin_rate  39922 non-null  int64  
 5   zone               39922 non-null  int64  
 6   event              39922 non-null  object 
 7   description        39922 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 2.4+ MB
None
       player_number  release_speed  release_spin_rate          zone
count   39922.000000   39922.000000       39922.000000  39922.000000
mean   594327.046992      88.793515        2279.294299      6.765267
std     64726.883733       5.997927         346.543019      3.707571
min    424144.000000      38.800000          43.000000      1.000000


Data Prep

In [None]:
# Drop 'player_number': it is a numeric ID, and leaving it in would feed it to the
# regression as if it were a continuous feature. (It appears to identify the same
# pitcher as 'player_name' -- verify against the data source.)
# The frame is rebound instead of mutated in place, and errors='ignore' lets the
# cell be re-run after the column is already gone without raising KeyError.
pitching = pitching.drop(columns=['player_number'], errors='ignore')

pitching

Unnamed: 0,player_name,pitch_type,release_speed,release_spin_rate,zone,event,description
0,"Abreu, Albert",CH,86.1,1950,7,single,hit_into_play
1,"Abreu, Albert",FF,96.6,1907,5,field_out,hit_into_play
2,"Abreu, Albert",FF,97.5,2262,4,home_run,hit_into_play
3,"Abreu, Albert",FF,96.1,1974,5,single,hit_into_play
4,"Abreu, Albert",FF,95.6,1901,1,strikeout,swinging_strike
...,...,...,...,...,...,...,...
39917,"Zuber, Tyler",SL,85.5,2687,2,strikeout,swinging_strike
39918,"Zuber, Tyler",SL,83.8,2700,1,strikeout,called_strike
39919,"Zuber, Tyler",SL,86.6,2859,9,strikeout,swinging_strike
39920,"Zuber, Tyler",SL,84.2,2723,14,strikeout,swinging_strike


In [None]:
# One indicator column per distinct value of the integer 'zone' column.
# NOTE(review): no later cell visible here reads `zones`; confirm whether it is
# still needed.
zones = pd.get_dummies(data=pitching['zone'])

# Show the resulting indicator frame
zones

Unnamed: 0,1,2,3,4,5,6,7,8,9,11,12,13,14
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39917,0,1,0,0,0,0,0,0,0,0,0,0,0
39918,1,0,0,0,0,0,0,0,0,0,0,0,0
39919,0,0,0,0,0,0,0,0,1,0,0,0,0
39920,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# One-hot encode the string columns (player_name, pitch_type, event, description).
# 'zone' is an integer column, so get_dummies would pass it through unchanged as a
# numeric feature; it is removed before encoding instead. errors='ignore' keeps the
# cell re-runnable once 'zone' is already gone (the previous in-place drop raised
# KeyError on a second run).
# NOTE(review): the `zones` indicators built in the previous cell are never joined
# back, so pitch location is not part of the model -- confirm this is intended.
# NOTE(review): every category level gets its own column (drop_first defaults to
# False), which is perfectly collinear with an intercept term; consider
# drop_first=True if no downstream cell depends on the full set of column names.
pitching = pd.get_dummies(pitching.drop(columns=['zone'], errors='ignore'))

pitching

Unnamed: 0,release_speed,release_spin_rate,"player_name_Abreu, Albert","player_name_Abreu, Bryan","player_name_Adam, Jason","player_name_Adams, Austin","player_name_Adams, Chance","player_name_Adrianza, Ehire","player_name_Akin, Keegan","player_name_Alcala, Jorge","player_name_Alcantara, Sandy","player_name_Alexander, Scott","player_name_Alexander, Tyler","player_name_Allard, Kolby","player_name_Allen, Logan","player_name_Almonte, Yency","player_name_Altavilla, Dan","player_name_Alvarado, José","player_name_Alzolay, Adbert","player_name_Anderson, Brett","player_name_Anderson, Chase","player_name_Anderson, Drew","player_name_Anderson, Ian","player_name_Anderson, Nick","player_name_Anderson, Shaun","player_name_Anderson, Tyler","player_name_Andriese, Matt","player_name_Antone, Tejay","player_name_Archer, Chris","player_name_Arcia, Orlando","player_name_Arihara, Kohei","player_name_Armstrong, Shawn","player_name_Arrieta, Jake","player_name_Avilán, Luis","player_name_Bacus, Dakota","player_name_Baez, Michel","player_name_Bailey, Brandon","player_name_Bailey, Homer","player_name_Banda, Anthony","player_name_Baragar, Caleb",...,"player_name_Yacabonis, Jimmy","player_name_Yajure, Miguel","player_name_Yamaguchi, Shun","player_name_Yamamoto, Jordan","player_name_Yarbrough, Ryan","player_name_Yardley, Eric","player_name_Yates, Kirby","player_name_Ynoa, Huascar","player_name_Young, Alex","player_name_Zeuch, T.J.","player_name_Zimmer, Kyle","player_name_Zimmermann, Bruce","player_name_Zimmermann, Jordan","player_name_Zuber, Tyler","player_name_de Geus, Brett","player_name_deGrom, Jacob","player_name_Álvarez, 
José",pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_SL,event_double,event_double_play,event_field_out,event_fielders_choice,event_fielders_choice_out,event_force_out,event_grounded_into_double_play,event_home_run,event_sac_fly,event_single,event_strikeout,event_triple,event_walk,description_ball,description_called_strike,description_hit_into_play,description_swinging_strike
0,86.1,1950,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,96.6,1907,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,97.5,2262,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
3,96.1,1974,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,95.6,1901,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39917,85.5,2687,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
39918,83.8,2700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
39919,86.6,2859,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
39920,84.2,2723,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


Split Data

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG for any later NumPy-based randomness; the row assignment
# of the split itself is pinned by the explicit random_state passed below.
np.random.seed(0)

# 70 % of the rows go to training, the remaining 30 % to testing.
df_train, df_test = train_test_split(
    pitching,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)


In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Only the continuous columns are rescaled to [0, 1]; the 0/1 dummy columns are
# already on that scale.
num_vars = ['release_speed', 'release_spin_rate']

# train_test_split hands back a frame that pandas still tracks as a slice of
# `pitching`, so assigning into it raises SettingWithCopyWarning. Taking an explicit
# copy makes df_train an independent frame and the assignment unambiguous.
df_train = df_train.copy()

# The scaler is fitted on the training rows only, so its min/max come from training data.
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


Unnamed: 0,release_speed,release_spin_rate,"player_name_Abreu, Albert","player_name_Abreu, Bryan","player_name_Adam, Jason","player_name_Adams, Austin","player_name_Adams, Chance","player_name_Adrianza, Ehire","player_name_Akin, Keegan","player_name_Alcala, Jorge","player_name_Alcantara, Sandy","player_name_Alexander, Scott","player_name_Alexander, Tyler","player_name_Allard, Kolby","player_name_Allen, Logan","player_name_Almonte, Yency","player_name_Altavilla, Dan","player_name_Alvarado, José","player_name_Alzolay, Adbert","player_name_Anderson, Brett","player_name_Anderson, Chase","player_name_Anderson, Drew","player_name_Anderson, Ian","player_name_Anderson, Nick","player_name_Anderson, Shaun","player_name_Anderson, Tyler","player_name_Andriese, Matt","player_name_Antone, Tejay","player_name_Archer, Chris","player_name_Arcia, Orlando","player_name_Arihara, Kohei","player_name_Armstrong, Shawn","player_name_Arrieta, Jake","player_name_Avilán, Luis","player_name_Bacus, Dakota","player_name_Baez, Michel","player_name_Bailey, Brandon","player_name_Bailey, Homer","player_name_Banda, Anthony","player_name_Baragar, Caleb",...,"player_name_Yacabonis, Jimmy","player_name_Yajure, Miguel","player_name_Yamaguchi, Shun","player_name_Yamamoto, Jordan","player_name_Yarbrough, Ryan","player_name_Yardley, Eric","player_name_Yates, Kirby","player_name_Ynoa, Huascar","player_name_Young, Alex","player_name_Zeuch, T.J.","player_name_Zimmer, Kyle","player_name_Zimmermann, Bruce","player_name_Zimmermann, Jordan","player_name_Zuber, Tyler","player_name_de Geus, Brett","player_name_deGrom, Jacob","player_name_Álvarez, 
José",pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_SL,event_double,event_double_play,event_field_out,event_fielders_choice,event_fielders_choice_out,event_force_out,event_grounded_into_double_play,event_home_run,event_sac_fly,event_single,event_strikeout,event_triple,event_walk,description_ball,description_called_strike,description_hit_into_play,description_swinging_strike
9408,0.863924,0.528546,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
33338,0.903481,0.673377,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
19474,0.787975,0.354267,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
8256,0.735759,0.575721,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
39633,0.743671,0.819411,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,0.843354,0.803185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
79,0.757911,0.778546,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
12119,0.876582,0.597356,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
14147,0.856013,0.612380,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


Building Linear Model

In [None]:
# Separate the regression target from the predictors.
# X_train is the same object as df_train, so popping the target column removes
# 'release_speed' from the frame under both names.
X_train = df_train
y_train = X_train.pop('release_speed')

In [None]:
# Fit an ordinary least squares model of release_speed on all remaining columns.

import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own, so add the constant column explicitly.
X_train_lm = sm.add_constant(X_train)

ols_model = sm.OLS(endog=y_train, exog=X_train_lm)
lr_1 = ols_model.fit()

# Coefficient table and fit diagnostics
lr_1.summary()

0,1,2,3
Dep. Variable:,release_speed,R-squared:,0.916
Model:,OLS,Adj. R-squared:,0.913
Method:,Least Squares,F-statistic:,378.8
Date:,"Fri, 11 Jun 2021",Prob (F-statistic):,0.0
Time:,17:30:51,Log-Likelihood:,60643.0
No. Observations:,27945,AIC:,-119700.0
Df Residuals:,27164,BIC:,-113300.0
Df Model:,780,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4483,0.004,124.252,0.000,0.441,0.455
release_spin_rate,0.0283,0.003,10.500,0.000,0.023,0.034
"player_name_Abreu, Albert",0.0504,0.011,4.411,0.000,0.028,0.073
"player_name_Abreu, Bryan",0.0376,0.008,4.836,0.000,0.022,0.053
"player_name_Adam, Jason",0.0367,0.005,7.931,0.000,0.028,0.046
"player_name_Adams, Austin",0.0566,0.011,5.346,0.000,0.036,0.077
"player_name_Adams, Chance",-0.0125,0.006,-2.174,0.030,-0.024,-0.001
"player_name_Adrianza, Ehire",-0.2194,0.020,-11.072,0.000,-0.258,-0.181
"player_name_Akin, Keegan",-0.0227,0.004,-6.311,0.000,-0.030,-0.016

0,1,2,3
Omnibus:,3344.024,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23629.012
Skew:,-0.354,Prob(JB):,0.0
Kurtosis:,7.449,Cond. No.,3.08e+18


Variance Inflation Factor

In [None]:
# Variance inflation factor of every predictor in X_train.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Extract the design matrix once and reuse it for every column.
feature_matrix = X_train.values
vif_scores = [
    variance_inflation_factor(feature_matrix, col_idx)
    for col_idx in range(feature_matrix.shape[1])
]

# One row per feature, rounded to two decimals, highest VIF first.
vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Dropping Variables and Updating Model