In [1]:
import numpy as np
import pandas as pd

## Data

In [2]:
# Load the training and test tables; the first CSV column is used as the index.
pitcher_train, pitcher_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("pitcher_train.csv", "pitcher_test.csv")
)

In [3]:
# Feature columns used by both models; kept in one list so the train and test
# selections cannot drift apart.
FEATURE_COLS = ["TB_SC", "PA-AB", "H1", "H2", "H3", "HR", "SB_SR", "WP", "BABIP", "KK9", "BB9"]
# Regression targets; selected from the training data only.
TARGET_COLS = ["INN2", "ER"]

# .copy() makes each subset an independent frame. Later cells assign into these
# frames (the TB_SC category cast), which on a plain column subset triggers
# pandas' SettingWithCopyWarning.
pitcher_train = pitcher_train[FEATURE_COLS + TARGET_COLS].copy()
pitcher_test = pitcher_test[FEATURE_COLS].copy()

In [4]:
# Preview the first five rows of the reduced training frame.
pitcher_train.head(n=5)

Unnamed: 0,TB_SC,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,INN2,ER
0,T,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.277778,3.6,1.8,19,2
1,B,1.0,7.0,2.0,0.0,0.0,0.0,0.0,0.333333,2.842105,0.0,16,8
2,B,2.0,7.0,1.0,0.0,1.0,0.0,0.0,0.380952,3.375,3.375,15,5
3,B,1.0,6.0,1.0,0.0,2.0,1.0,0.0,0.368421,3.6,1.8,21,0
4,B,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.1,2.571429,3.857143,15,3


In [5]:
# Preview the first five rows of the reduced test frame.
pitcher_test.head(n=5)

Unnamed: 0,TB_SC,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9
0,T,2,4,1,0,0,0.0,0,0.294118,6.0,3.0
1,B,4,0,0,0,1,0.0,0,0.0,0.0,54.0
2,T,2,8,1,1,1,0.0,1,0.526316,11.571429,3.857143
3,B,5,4,0,0,0,0.0,0,0.2,5.4,6.75
4,T,0,0,0,0,0,0.0,0,0.0,0.0,0.0


## Modeling

In [6]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn/xgboost deprecation and
# data warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [7]:
# Compare column dtypes of the training and test frames (train first).
print(pitcher_train.dtypes, pitcher_test.dtypes, sep="\n")

TB_SC     object
PA-AB    float64
H1       float64
H2       float64
H3       float64
HR       float64
SB_SR    float64
WP       float64
BABIP    float64
KK9      float64
BB9      float64
INN2       int64
ER         int64
dtype: object
TB_SC     object
PA-AB      int64
H1         int64
H2         int64
H3         int64
HR         int64
SB_SR    float64
WP         int64
BABIP    float64
KK9      float64
BB9      float64
dtype: object


In [8]:
# Store TB_SC as a pandas categorical in both frames.
for frame in (pitcher_train, pitcher_test):
    frame['TB_SC'] = frame['TB_SC'].astype('category')

# Print the dtypes again to confirm the conversion (train first, then test).
print(pitcher_train.dtypes, pitcher_test.dtypes, sep="\n")

TB_SC    category
PA-AB     float64
H1        float64
H2        float64
H3        float64
HR        float64
SB_SR     float64
WP        float64
BABIP     float64
KK9       float64
BB9       float64
INN2        int64
ER          int64
dtype: object
TB_SC    category
PA-AB       int64
H1          int64
H2          int64
H3          int64
HR          int64
SB_SR     float64
WP          int64
BABIP     float64
KK9       float64
BB9       float64
dtype: object


### INN2

In [9]:
# Features are everything except the two targets; the target here is INN2.
# This split is made before dummy coding, so TB_SC is still one categorical
# column. X, y and the four split variables are reassigned after dummy coding
# further down.
X, y = pitcher_train.drop(columns=['INN2', 'ER']), pitcher_train['INN2']

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

# Shapes of the training split, validation split and full feature matrix.
print(X_train.shape, X_val.shape, X.shape, sep="\n")

(19108, 11)
(8190, 11)
(27298, 11)


In [10]:
# Replace every non-alphanumeric character in the column names with "_"
# (e.g. "PA-AB" -> "PA_AB") on both splits.
for split in (X_train, X_val):
    split.columns = [
        "".join(ch if ch.isalnum() else "_" for ch in str(col))
        for col in split.columns
    ]

In [11]:
# Dummy coding: one-hot encode the categorical column(s) of the training frame.
pitcher_train = pd.get_dummies(data=pitcher_train)
pitcher_train.head(n=5)

Unnamed: 0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,INN2,ER,TB_SC_B,TB_SC_T
0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.277778,3.6,1.8,19,2,0,1
1,1.0,7.0,2.0,0.0,0.0,0.0,0.0,0.333333,2.842105,0.0,16,8,1,0
2,2.0,7.0,1.0,0.0,1.0,0.0,0.0,0.380952,3.375,3.375,15,5,1,0
3,1.0,6.0,1.0,0.0,2.0,1.0,0.0,0.368421,3.6,1.8,21,0,1,0
4,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.1,2.571429,3.857143,15,3,1,0


In [12]:
# One-hot encode the test features, then force exactly the column set and
# order of the training features. Train and test are encoded independently, so
# a TB_SC level present in only one of them would otherwise give mismatched
# columns at predict time. Levels absent from the test data become all-zero
# columns; levels not in the training features are dropped.
feature_cols = pitcher_train.drop(columns=['INN2', 'ER']).columns
pitcher_test = pd.get_dummies(pitcher_test).reindex(columns=feature_cols, fill_value=0)
pitcher_test.head()

Unnamed: 0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,TB_SC_B,TB_SC_T
0,2,4,1,0,0,0.0,0,0.294118,6.0,3.0,0,1
1,4,0,0,0,1,0.0,0,0.0,0.0,54.0,1,0
2,2,8,1,1,1,0.0,1,0.526316,11.571429,3.857143,0,1
3,5,4,0,0,0,0.0,0,0.2,5.4,6.75,1,0
4,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,1


In [13]:
# Rebuild the INN2 design matrix from the dummy-coded training frame.
X, y = pitcher_train.drop(columns=['INN2', 'ER']), pitcher_train['INN2']

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

# Shapes of the training split, validation split and full feature matrix.
print(X_train.shape, X_val.shape, X.shape, sep="\n")

(19108, 12)
(8190, 12)
(27298, 12)


In [14]:
# Hyperparameter candidates for the INN2 grid search.
colsample_bytree = [0.5, 0.7, 1]
# XGBoost documents subsample as a row-sampling ratio in (0, 1]. The previous
# candidate 0 is outside that range and only spent a third of the search on
# invalid/degenerate fits, so it is dropped.
subsample = [0.5, 1]
gamma = [0, 2]
learning_rate = [0.01, 0.1]
n_estimators = [300, 400, 500]

# Keys are XGBRegressor constructor arguments, as required by GridSearchCV.
param_grid = {
    "learning_rate": learning_rate,
    "n_estimators": n_estimators,
    "colsample_bytree": colsample_bytree,
    "subsample": subsample,
    "gamma": gamma,
}

In [15]:
# Base estimator with library defaults; the grid supplies the tuned arguments.
xgb_model = XGBRegressor()

print("GRID SEARCH START")
# Exhaustive search over param_grid, scored by negative MSE (higher is better).
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
)
grid_result = grid_search.fit(X_train, y_train)

GRID SEARCH START










In [16]:
# best_score_ is on the negative-MSE scale requested by the search's scoring.
print(grid_result.best_score_)

print(grid_result.best_params_)

best_summary = (grid_result.best_score_, grid_result.best_params_)
print("\nBest: %f using %s" % best_summary)

# Per-candidate results; the sign of the mean score is flipped to give MSE.
cv_results = grid_result.cv_results_
means = -cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

-12.791603492563464
{'colsample_bytree': 0.7, 'gamma': 2, 'learning_rate': 0.1, 'n_estimators': 400, 'subsample': 0.5}

Best: -12.791603 using {'colsample_bytree': 0.7, 'gamma': 2, 'learning_rate': 0.1, 'n_estimators': 400, 'subsample': 0.5}


In [17]:
# Build the INN2 regressor from the hyperparameters the grid search selected,
# instead of hand-copying them from the printed output: the model then always
# matches the most recent search and cannot drift when the grid or data change.
xgb1 = XGBRegressor(**grid_result.best_params_)
xgb1.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=2,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=400,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.5, verbosity=1)

In [18]:
# Predict INN2 for the held-out validation split.
INN2_predict = xgb1.predict(X_val)

In [19]:
# Validation mean squared error of the INN2 model.
mse = mean_squared_error(y_true=y_val, y_pred=INN2_predict)
mse

12.528202888516006

In [20]:
# Predict INN2 for the dummy-coded test frame.
# NOTE(review): relies on pitcher_test having the same feature columns, in the
# same order, as X_train — the previews above agree, but nothing enforces it here.
INN2 = xgb1.predict(pitcher_test)

### ER

In [21]:
# Same features as for INN2, but the target is now ER.
X, y = pitcher_train.drop(columns=['INN2', 'ER']), pitcher_train['ER']

# Same 70/30 shuffled split and seed as the INN2 model.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

# Shapes of the training split, validation split and full feature matrix.
print(X_train.shape, X_val.shape, X.shape, sep="\n")

(19108, 12)
(8190, 12)
(27298, 12)


In [22]:
# Hyperparameter candidates for the ER grid search.
colsample_bytree = [0.5, 0.7, 1]
# XGBoost documents subsample as a row-sampling ratio in (0, 1]. The previous
# candidate 0 is outside that range and only spent a third of the search on
# invalid/degenerate fits, so it is dropped.
subsample = [0.5, 1]
gamma = [0, 2]
learning_rate = [0.01, 0.1]
n_estimators = [300, 400, 500]

# Keys are XGBRegressor constructor arguments, as required by GridSearchCV.
param_grid = {
    "learning_rate": learning_rate,
    "n_estimators": n_estimators,
    "colsample_bytree": colsample_bytree,
    "subsample": subsample,
    "gamma": gamma,
}

In [23]:
# Base estimator with library defaults; the grid supplies the tuned arguments.
xgb_model = XGBRegressor()

print("GRID SEARCH START")
# Exhaustive search over param_grid, scored by negative MSE (higher is better).
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
)
grid_result = grid_search.fit(X_train, y_train)

GRID SEARCH START










In [24]:
# best_score_ is on the negative-MSE scale requested by the search's scoring.
print(grid_result.best_score_)

print(grid_result.best_params_)

best_summary = (grid_result.best_score_, grid_result.best_params_)
print("\nBest: %f using %s" % best_summary)

# Per-candidate results; the sign of the mean score is flipped to give MSE.
cv_results = grid_result.cv_results_
means = -cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

-2.0439954294952223
{'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}

Best: -2.043995 using {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}


In [25]:
# Build the ER regressor from the hyperparameters the grid search selected,
# instead of hand-copying them from the printed output: the model then always
# matches the most recent search and cannot drift when the grid or data change.
xgb2 = XGBRegressor(**grid_result.best_params_)
xgb2.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0,
       importance_type='gain', learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.5, verbosity=1)

In [26]:
# Predict ER for the held-out validation split.
ER_predict = xgb2.predict(X_val)

In [27]:
# Validation mean squared error of the ER model.
mse = mean_squared_error(y_true=y_val, y_pred=ER_predict)
mse

2.0377408563623396

In [28]:
# Predict ER for the dummy-coded test frame.
# NOTE(review): relies on pitcher_test having the same feature columns, in the
# same order, as X_train — the previews above agree, but nothing enforces it here.
ER = xgb2.predict(pitcher_test)

## to CSV

In [30]:
# Reload the test file with all of its original columns (nothing dropped).
pitcher_test = pd.read_csv(filepath_or_buffer="pitcher_test.csv", index_col=0)

In [31]:
# Append the two prediction arrays as new columns, INN2 first and then ER.
# Assignment is positional, so the reloaded rows must be in the same order as
# the frame the predictions were made on.
for column, predictions in (('INN2', INN2), ('ER', ER)):
    pitcher_test[column] = predictions
pitcher_test

Unnamed: 0,B,T,GDAY_DS,T_ID,P_ID,TB_SC,BF,PA-AB,AB,HIT,...,HR,SB_SR,KK,WP,SLG,BABIP,KK9,BB9,INN2,ER
0,0,1,2020-07-17,KT,50030,T,91,2,21,5,...,0,0.000000,4,0,1,0.294118,6.000000,3.000000,14.755663,2.500365
1,1,0,2020-06-10,KT,50036,B,19,4,3,1,...,1,0.000000,0,0,1,0.000000,0.000000,54.000000,5.058250,1.052439
2,0,1,2020-07-19,KT,50040,T,123,2,26,11,...,1,0.000000,6,1,3,0.526316,11.571429,3.857143,18.770617,2.722901
3,1,0,2020-07-11,LG,50126,B,108,5,24,4,...,0,0.000000,4,0,0,0.200000,5.400000,6.750000,15.992447,2.971510
4,0,1,2020-07-07,LG,50157,T,1,0,1,0,...,0,0.000000,0,0,0,0.000000,0.000000,0.000000,3.128164,0.548309
5,1,0,2020-07-16,OB,50234,B,17,1,4,1,...,0,1.000000,2,0,0,0.500000,18.000000,9.000000,3.298314,0.617734
6,0,1,2020-06-25,OB,50296,T,29,3,3,0,...,0,0.000000,0,0,0,0.000000,0.000000,27.000000,3.924403,0.631606
7,1,0,2020-07-16,WO,50360,B,37,0,8,4,...,0,0.000000,3,1,1,0.800000,20.250000,0.000000,4.532493,0.906236
8,1,0,2020-07-19,SS,50404,B,112,2,26,6,...,1,0.000000,7,0,2,0.277778,7.875000,1.125000,18.406229,2.797925
9,1,0,2020-05-23,SS,50441,B,50,5,11,8,...,1,0.000000,3,0,3,0.875000,20.250000,27.000000,14.034451,2.233328


In [32]:
# Write the test frame with predictions to disk, without the index column.
pitcher_test.to_csv(path_or_buf='new_pitcher_report_xgb.csv', index=False)

## ERA

In [34]:
# Total predicted ER per team (T_ID).
er_by_team = pitcher_test.groupby('T_ID')['ER'].sum()

er_by_team

T_ID
HH    33.605850
HT    27.755098
KT    30.633045
LG    26.572374
LT    28.366077
NC    27.364338
OB    26.662821
SK    26.986055
SS    30.913033
WO    27.999994
Name: ER, dtype: float32

In [35]:
# Total predicted INN2 per team (T_ID).
inning_by_team = pitcher_test.groupby('T_ID')['INN2'].sum()

inning_by_team

T_ID
HH    181.539917
HT    148.624847
KT    165.622421
LG    151.001358
LT    154.213516
NC    154.303848
OB    152.321747
SK    144.968628
SS    172.237579
WO    153.730759
Name: INN2, dtype: float32

In [36]:
# Team ERA from the summed predictions: ER / INN2 * 27.
# NOTE(review): the factor 27 presumably is 9 innings x 3 outs, i.e. INN2 is
# counted in outs (thirds of an inning) — confirm against the data dictionary.
era_by_team = er_by_team.div(inning_by_team).mul(27)

era_by_team

T_ID
HH    4.998118
HT    5.042142
KT    4.993842
LG    4.751309
LT    4.966387
NC    4.788197
OB    4.726155
SK    5.026077
SS    4.845933
WO    4.917688
dtype: float32