In [1]:
import numpy as np
import pandas as pd

## Data

In [2]:
pitcher_train = pd.read_csv("pitcher_train.csv", index_col=0)
pitcher_test = pd.read_csv("pitcher_test.csv", index_col=0)

In [3]:
pitcher_train = pitcher_train[["TB_SC","PA-AB","H1","H2","H3","HR","SB_SR","WP","BABIP","KK9","BB9","INN2","ER"]]
pitcher_test = pitcher_test[["TB_SC","PA-AB","H1","H2","H3","HR","SB_SR","WP","BABIP","KK9","BB9"]]

In [4]:
pitcher_train.head()

Unnamed: 0_level_0,TB_SC,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,INN2,ER
GDAY_DS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-05-08,T,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.277778,3.6,1.8,19,2
2020-05-15,B,1.0,7.0,2.0,0.0,0.0,0.0,0.0,0.333333,2.842105,0.0,16,8
2020-05-21,B,2.0,7.0,1.0,0.0,1.0,0.0,0.0,0.380952,3.375,3.375,15,5
2020-05-28,B,1.0,6.0,1.0,0.0,2.0,1.0,0.0,0.368421,3.6,1.8,21,0
2020-06-03,B,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.1,2.571429,3.857143,15,3


In [5]:
pitcher_test.head()

Unnamed: 0_level_0,TB_SC,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,T,2,4,1,0,0,0.0,0,0.294118,6.0,3.0
1,B,4,0,0,0,1,0.0,0,0.0,0.0,54.0
0,T,2,8,1,1,1,0.0,1,0.526316,11.571429,3.857143
1,B,5,4,0,0,0,0.0,0,0.2,5.4,6.75
0,T,0,0,0,0,0,0.0,0,0.0,0.0,0.0


## Modeling

In [6]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

In [7]:
print(pitcher_train.dtypes)
print(pitcher_test.dtypes)

TB_SC     object
PA-AB    float64
H1       float64
H2       float64
H3       float64
HR       float64
SB_SR    float64
WP       float64
BABIP    float64
KK9      float64
BB9      float64
INN2       int64
ER         int64
dtype: object
TB_SC     object
PA-AB      int64
H1         int64
H2         int64
H3         int64
HR         int64
SB_SR    float64
WP         int64
BABIP    float64
KK9      float64
BB9      float64
dtype: object


In [8]:
pitcher_train['TB_SC'] = pitcher_train['TB_SC'].astype('category')
pitcher_test['TB_SC'] = pitcher_test['TB_SC'].astype('category')

print(pitcher_train.dtypes)
print(pitcher_test.dtypes)

TB_SC    category
PA-AB     float64
H1        float64
H2        float64
H3        float64
HR        float64
SB_SR     float64
WP        float64
BABIP     float64
KK9       float64
BB9       float64
INN2        int64
ER          int64
dtype: object
TB_SC    category
PA-AB       int64
H1          int64
H2          int64
H3          int64
HR          int64
SB_SR     float64
WP          int64
BABIP     float64
KK9       float64
BB9       float64
dtype: object


### INN2

In [9]:
X = pitcher_train.drop(columns = ['INN2','ER'])
y = pitcher_train['INN2']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=2020)

print(X_train.shape)
print(X_val.shape)
print(X.shape)

(19108, 11)
(8190, 11)
(27298, 11)


In [10]:
X_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train.columns]
X_val.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_val.columns]

In [11]:
#더미코딩
pitcher_train = pd.get_dummies(pitcher_train)
pitcher_train.head()

Unnamed: 0_level_0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,INN2,ER,TB_SC_B,TB_SC_T
GDAY_DS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-05-08,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.277778,3.6,1.8,19,2,0,1
2020-05-15,1.0,7.0,2.0,0.0,0.0,0.0,0.0,0.333333,2.842105,0.0,16,8,1,0
2020-05-21,2.0,7.0,1.0,0.0,1.0,0.0,0.0,0.380952,3.375,3.375,15,5,1,0
2020-05-28,1.0,6.0,1.0,0.0,2.0,1.0,0.0,0.368421,3.6,1.8,21,0,1,0
2020-06-03,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.1,2.571429,3.857143,15,3,1,0


In [12]:
pitcher_test = pd.get_dummies(pitcher_test)
pitcher_test.head()

Unnamed: 0_level_0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,TB_SC_B,TB_SC_T
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2,4,1,0,0,0.0,0,0.294118,6.0,3.0,0,1
1,4,0,0,0,1,0.0,0,0.0,0.0,54.0,1,0
0,2,8,1,1,1,0.0,1,0.526316,11.571429,3.857143,0,1
1,5,4,0,0,0,0.0,0,0.2,5.4,6.75,1,0
0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,1


In [13]:
X = pitcher_train.drop(columns = ['INN2','ER'])
y = pitcher_train['INN2']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 2020)

print(X_train.shape)
print(X_val.shape)
print(X.shape)

(19108, 12)
(8190, 12)
(27298, 12)


In [14]:
lgb = LGBMRegressor(boosting_type='gbdt', num_boost_round=2000, learning_rate=0.01, lambda_l1=1.5, lambda_l2=0, 
                   min_data_in_leaf=300, num_leaves=50, reg_alpha=0.1)

lgb.fit(X_train,y_train)



LGBMRegressor(lambda_l1=1.5, lambda_l2=0, learning_rate=0.01,
              min_data_in_leaf=300, num_boost_round=2000, num_leaves=50,
              reg_alpha=0.1)

In [15]:
INN2_predict = lgb.predict(X_val)

In [16]:
mse_inn2_lgb = mean_squared_error(y_val, INN2_predict)
mse_inn2_lgb

12.50284231058004

### ER

In [17]:
X = pitcher_train.drop(columns = ['INN2','ER'])
y = pitcher_train['ER']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 2020)

print(X_train.shape)
print(X_val.shape)
print(X.shape)

(19108, 12)
(8190, 12)
(27298, 12)


In [18]:
lgb = LGBMRegressor(boosting_type='gbdt', num_boost_round=2000, learning_rate=0.01,
                     lambda_l1=1.5, lambda_l2=1, min_data_in_leaf=400, num_leaves=30, reg_alpha=0.1)

lgb.fit(X_train, y_train)



LGBMRegressor(lambda_l1=1.5, lambda_l2=1, learning_rate=0.01,
              min_data_in_leaf=400, num_boost_round=2000, num_leaves=30,
              reg_alpha=0.1)

In [20]:
ER_predict = lgb.predict(X_val)

In [21]:
mse_er_lgb = mean_squared_error(y_val, ER_predict)
mse_er_lgb

2.0221188524575013

In [22]:
(mse_inn2_lgb+mse_er_lgb)/2

7.26248058151877