In [1]:
import numpy as np
import pandas as pd

## Data

In [2]:
pitcher_train = pd.read_csv("pitcher_train.csv", index_col=0)
pitcher_test = pd.read_csv("pitcher_test.csv", index_col=0)

In [3]:
pitcher_train = pitcher_train[["TB_SC","PA-AB","H1","H2","H3","HR","SB_SR","WP","BABIP","KK9","BB9","INN2","ER"]]
pitcher_test = pitcher_test[["TB_SC","PA-AB","H1","H2","H3","HR","SB_SR","WP","BABIP","KK9","BB9"]]

In [4]:
pitcher_train.head()

Unnamed: 0,TB_SC,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,INN2,ER
0,T,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.277778,3.6,1.8,19,2
1,B,1.0,7.0,2.0,0.0,0.0,0.0,0.0,0.333333,2.842105,0.0,16,8
2,B,2.0,7.0,1.0,0.0,1.0,0.0,0.0,0.380952,3.375,3.375,15,5
3,B,1.0,6.0,1.0,0.0,2.0,1.0,0.0,0.368421,3.6,1.8,21,0
4,B,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.1,2.571429,3.857143,15,3


In [5]:
pitcher_test.head()

Unnamed: 0,TB_SC,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9
0,T,2,4,1,0,0,0.0,0,0.294118,6.0,3.0
1,B,4,0,0,0,1,0.0,0,0.0,0.0,54.0
2,T,2,8,1,1,1,0.0,1,0.526316,11.571429,3.857143
3,B,5,4,0,0,0,0.0,0,0.2,5.4,6.75
4,T,0,0,0,0,0,0.0,0,0.0,0.0,0.0


## Modeling

In [6]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

In [7]:
print(pitcher_train.dtypes)
print(pitcher_test.dtypes)

TB_SC     object
PA-AB    float64
H1       float64
H2       float64
H3       float64
HR       float64
SB_SR    float64
WP       float64
BABIP    float64
KK9      float64
BB9      float64
INN2       int64
ER         int64
dtype: object
TB_SC     object
PA-AB      int64
H1         int64
H2         int64
H3         int64
HR         int64
SB_SR    float64
WP         int64
BABIP    float64
KK9      float64
BB9      float64
dtype: object


In [8]:
pitcher_train['TB_SC'] = pitcher_train['TB_SC'].astype('category')
pitcher_test['TB_SC'] = pitcher_test['TB_SC'].astype('category')

print(pitcher_train.dtypes)
print(pitcher_test.dtypes)

TB_SC    category
PA-AB     float64
H1        float64
H2        float64
H3        float64
HR        float64
SB_SR     float64
WP        float64
BABIP     float64
KK9       float64
BB9       float64
INN2        int64
ER          int64
dtype: object
TB_SC    category
PA-AB       int64
H1          int64
H2          int64
H3          int64
HR          int64
SB_SR     float64
WP          int64
BABIP     float64
KK9       float64
BB9       float64
dtype: object


### INN2

In [9]:
X = pitcher_train.drop(columns = ['INN2','ER'])
y = pitcher_train['INN2']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=2020)

print(X_train.shape)
print(X_val.shape)
print(X.shape)

(19108, 11)
(8190, 11)
(27298, 11)


In [10]:
X_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train.columns]
X_val.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_val.columns]

In [11]:
#더미코딩
pitcher_train = pd.get_dummies(pitcher_train)
pitcher_train.head()

Unnamed: 0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,INN2,ER,TB_SC_B,TB_SC_T
0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.277778,3.6,1.8,19,2,0,1
1,1.0,7.0,2.0,0.0,0.0,0.0,0.0,0.333333,2.842105,0.0,16,8,1,0
2,2.0,7.0,1.0,0.0,1.0,0.0,0.0,0.380952,3.375,3.375,15,5,1,0
3,1.0,6.0,1.0,0.0,2.0,1.0,0.0,0.368421,3.6,1.8,21,0,1,0
4,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.1,2.571429,3.857143,15,3,1,0


In [12]:
pitcher_test = pd.get_dummies(pitcher_test)
pitcher_test.head()

Unnamed: 0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,TB_SC_B,TB_SC_T
0,2,4,1,0,0,0.0,0,0.294118,6.0,3.0,0,1
1,4,0,0,0,1,0.0,0,0.0,0.0,54.0,1,0
2,2,8,1,1,1,0.0,1,0.526316,11.571429,3.857143,0,1
3,5,4,0,0,0,0.0,0,0.2,5.4,6.75,1,0
4,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,1


In [13]:
X = pitcher_train.drop(columns = ['INN2','ER'])
y = pitcher_train['INN2']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 2020)

print(X_train.shape)
print(X_val.shape)
print(X.shape)

(19108, 12)
(8190, 12)
(27298, 12)


In [14]:
params_grid = {'num_leaves': [30, 50, 70],
               'reg_alpha': [0.1, 0.5],
               'min_data_in_leaf': [30, 50, 100, 300, 400],
               'lambda_l1': [0, 1, 1.5],
               'lambda_l2': [0, 1]}

In [15]:
lgb = LGBMRegressor(boosting_type='gbdt', num_boost_round=2000, learning_rate=0.01)
lgb_grid = GridSearchCV(estimator=lgb,
                        param_grid=params_grid,
                        n_jobs=10,
                        verbose=3)
lgb_grid.fit(X_train,y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:  1.7min
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed: 13.6min
[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed: 33.5min
[Parallel(n_jobs=10)]: Done 492 tasks      | elapsed: 58.9min
[Parallel(n_jobs=10)]: Done 540 out of 540 | elapsed: 62.9min finished




GridSearchCV(cv=None, error_score='raise',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.01, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_boost_round=2000, num_leaves=31,
       objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0),
       fit_params=None, iid=True, n_jobs=10,
       param_grid={'num_leaves': [30, 50, 70], 'reg_alpha': [0.1, 0.5], 'min_data_in_leaf': [30, 50, 100, 300, 400], 'lambda_l1': [0, 1, 1.5], 'lambda_l2': [0, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [16]:
lgb_grid.best_params_

{'lambda_l1': 1.5,
 'lambda_l2': 0,
 'min_data_in_leaf': 300,
 'num_leaves': 50,
 'reg_alpha': 0.1}

In [18]:
lgb1 = LGBMRegressor(boosting_type='gbdt', num_boost_round=2000, learning_rate=0.01,
                     lambda_l1=1.5, lambda_l2=0, min_data_in_leaf=300, num_leaves=30, reg_alpha=0.1)

lgb1.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', lambda_l1=1.5, lambda_l2=0,
       learning_rate=0.01, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=300, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_boost_round=2000, num_leaves=30,
       objective=None, random_state=None, reg_alpha=0.1, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0)

In [19]:
INN2_predict = lgb1.predict(X_val)

In [20]:
mse = mean_squared_error(y_val, INN2_predict)
mse

12.506462518476795

In [21]:
INN2 = lgb1.predict(pitcher_test)

### ER

In [22]:
X = pitcher_train.drop(columns = ['INN2','ER'])
y = pitcher_train['ER']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 2020)

print(X_train.shape)
print(X_val.shape)
print(X.shape)

(19108, 12)
(8190, 12)
(27298, 12)


In [23]:
params_grid = {'num_leaves': [30, 50, 70],
               'reg_alpha': [0.1, 0.5],
               'min_data_in_leaf': [30, 50, 100, 300, 400],
               'lambda_l1': [0, 1, 1.5],
               'lambda_l2': [0, 1]}

In [24]:
lgb = LGBMRegressor(boosting_type='gbdt', num_boost_round=2000, learning_rate=0.01)
lgb_grid = GridSearchCV(estimator=lgb,
                        param_grid=params_grid,
                        n_jobs=10,
                        verbose=3)
lgb_grid.fit(X_train,y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:  1.6min
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed: 11.3min
[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed: 27.5min
[Parallel(n_jobs=10)]: Done 492 tasks      | elapsed: 50.6min
[Parallel(n_jobs=10)]: Done 540 out of 540 | elapsed: 54.8min finished




GridSearchCV(cv=None, error_score='raise',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.01, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_boost_round=2000, num_leaves=31,
       objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0),
       fit_params=None, iid=True, n_jobs=10,
       param_grid={'num_leaves': [30, 50, 70], 'reg_alpha': [0.1, 0.5], 'min_data_in_leaf': [30, 50, 100, 300, 400], 'lambda_l1': [0, 1, 1.5], 'lambda_l2': [0, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [25]:
lgb_grid.best_params_

{'lambda_l1': 1.5,
 'lambda_l2': 1,
 'min_data_in_leaf': 400,
 'num_leaves': 30,
 'reg_alpha': 0.1}

In [26]:
lgb2 = LGBMRegressor(boosting_type='gbdt', num_boost_round=2000, learning_rate=0.01,
                     lambda_l1=1.5, lambda_l2=1, min_data_in_leaf=400, num_leaves=30, reg_alpha=0.1)

lgb2.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', lambda_l1=1.5, lambda_l2=1,
       learning_rate=0.01, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=400, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_boost_round=2000, num_leaves=30,
       objective=None, random_state=None, reg_alpha=0.1, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0)

In [27]:
ER_predict = lgb2.predict(X_val)

In [28]:
mse = mean_squared_error(y_val, ER_predict)
mse

2.0221188524575013

In [29]:
ER = lgb2.predict(pitcher_test)

## to CSV

In [30]:
#변수제거 안한 파일 다시 불러오기
pitcher_test = pd.read_csv("pitcher_test.csv", index_col=0)

In [31]:
pitcher_test['INN2'] = INN2
pitcher_test['ER'] = ER
pitcher_test

Unnamed: 0,B,T,GDAY_DS,T_ID,P_ID,TB_SC,BF,PA-AB,AB,HIT,...,HR,SB_SR,KK,WP,SLG,BABIP,KK9,BB9,INN2,ER
0,0,1,2020-07-17,KT,50030,T,91,2,21,5,...,0,0.000000,4,0,1,0.294118,6.000000,3.000000,13.151911,2.410265
1,1,0,2020-06-10,KT,50036,B,19,4,3,1,...,1,0.000000,0,0,1,0.000000,0.000000,54.000000,5.765889,1.244019
2,0,1,2020-07-19,KT,50040,T,123,2,26,11,...,1,0.000000,6,1,3,0.526316,11.571429,3.857143,18.849969,3.120809
3,1,0,2020-07-11,LG,50126,B,108,5,24,4,...,0,0.000000,4,0,0,0.200000,5.400000,6.750000,15.912857,3.158264
4,0,1,2020-07-07,LG,50157,T,1,0,1,0,...,0,0.000000,0,0,0,0.000000,0.000000,0.000000,3.114241,0.583839
5,1,0,2020-07-16,OB,50234,B,17,1,4,1,...,0,1.000000,2,0,0,0.500000,18.000000,9.000000,3.224664,0.658439
6,0,1,2020-06-25,OB,50296,T,29,3,3,0,...,0,0.000000,0,0,0,0.000000,0.000000,27.000000,3.247115,0.366306
7,1,0,2020-07-16,WO,50360,B,37,0,8,4,...,0,0.000000,3,1,1,0.800000,20.250000,0.000000,4.533778,1.049773
8,1,0,2020-07-19,SS,50404,B,112,2,26,6,...,1,0.000000,7,0,2,0.277778,7.875000,1.125000,17.858830,2.620910
9,1,0,2020-05-23,SS,50441,B,50,5,11,8,...,1,0.000000,3,0,3,0.875000,20.250000,27.000000,14.303717,2.500452


In [32]:
pitcher_test.to_csv('new_pitcher_report_lgbm.csv', index=False)

## ERA

In [33]:
er_by_team = pitcher_test['ER'].groupby(pitcher_test['T_ID']).sum()

er_by_team

T_ID
HH    33.256492
HT    28.784782
KT    28.343438
LG    27.350454
LT    26.694677
NC    27.710471
OB    27.172951
SK    26.758418
SS    31.468760
WO    26.970552
Name: ER, dtype: float64

In [34]:
inning_by_team = pitcher_test['INN2'].groupby(pitcher_test['T_ID']).sum()

inning_by_team

T_ID
HH    180.837010
HT    150.255172
KT    166.014934
LG    153.771406
LT    153.587282
NC    157.197653
OB    149.393921
SK    147.627932
SS    174.778385
WO    150.762456
Name: INN2, dtype: float64

In [35]:
era_by_team = er_by_team / inning_by_team * 27

era_by_team

T_ID
HH    4.965384
HT    5.172462
KT    4.609663
LG    4.802338
LT    4.692812
NC    4.759503
OB    4.910974
SK    4.893906
SS    4.861336
WO    4.830148
dtype: float64