In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Training sets: read with pandas' default index.
pitcher_train, batter_train = map(pd.read_csv, ("pitcher_train.csv", "batter_train.csv"))

# Test sets: the first CSV column becomes the index.
pitcher_test, batter_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("pitcher_test.csv", "batter_test.csv")
)

In [28]:
# Shapes, one per line: pitcher train, pitcher test, batter train, batter test.
print(
    *(frame.shape for frame in (pitcher_train, pitcher_test, batter_train, batter_test)),
    sep="\n",
)

(27298, 21)
(234, 20)
(80395, 13)
(299, 12)


## 변수제거

In [29]:
# Keep only the modelling columns. The train frames carry two extra columns
# each (AB/HIT for batters, INN2/ER for pitchers) that the test frames lack.
batter_train = batter_train.loc[:, [
    "TB_SC", "PA-AB", "RUN", "RBI", "SH+SF", "KK", "SB_trial", "BABIP",
    "AB", "HIT",
]]
batter_test = batter_test.loc[:, [
    "TB_SC", "PA-AB", "RUN", "RBI", "SH+SF", "KK", "SB_trial", "BABIP",
]]

pitcher_train = pitcher_train.loc[:, [
    "TB_SC", "PA-AB", "H1", "H2", "H3", "HR", "SB_SR", "WP", "BABIP", "KK9", "BB9",
    "INN2", "ER",
]]
pitcher_test = pitcher_test.loc[:, [
    "TB_SC", "PA-AB", "H1", "H2", "H3", "HR", "SB_SR", "WP", "BABIP", "KK9", "BB9",
]]

In [30]:
# Shapes after column selection, in the same order as before:
# pitcher train, pitcher test, batter train, batter test.
print("\n".join(
    str(frame.shape)
    for frame in (pitcher_train, pitcher_test, batter_train, batter_test)
))

(27298, 13)
(234, 11)
(80395, 10)
(299, 8)


# Model Train

In [31]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

### 투수 - INN2

In [32]:
# Inspect the column dtypes of the pitcher training frame before any encoding.
pitcher_train.dtypes

TB_SC     object
PA-AB    float64
H1       float64
H2       float64
H3       float64
HR       float64
SB_SR    float64
WP       float64
BABIP    float64
KK9      float64
BB9      float64
INN2       int64
ER         int64
dtype: object

In [33]:
# Columns to be stored with the pandas 'category' dtype.
cat_features = ['TB_SC']
pitcher_train = pitcher_train.astype(dict.fromkeys(cat_features, 'category'))

# Confirm the cast took effect.
pitcher_train.dtypes

TB_SC    category
PA-AB     float64
H1        float64
H2        float64
H3        float64
HR        float64
SB_SR     float64
WP        float64
BABIP     float64
KK9       float64
BB9       float64
INN2        int64
ER          int64
dtype: object

In [34]:
# Apply the same category cast to the test frame (cat_features comes from the
# cell that converted the training frame).
pitcher_test = pitcher_test.astype({column: 'category' for column in cat_features})

pitcher_test.dtypes

TB_SC    category
PA-AB       int64
H1          int64
H2          int64
H3          int64
HR          int64
SB_SR     float64
WP          int64
BABIP     float64
KK9       float64
BB9       float64
dtype: object

In [35]:
# Target is INN2; the features are every column except the two targets.
y = pitcher_train['INN2']
X = pitcher_train.drop(['INN2', 'ER'], axis=1)

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=2020,
    shuffle=True,
)

print(X_train.shape, X_val.shape, X.shape, sep="\n")

(19108, 11)
(8190, 11)
(27298, 11)


In [36]:
# Replace every non-alphanumeric character in the feature names with "_"
# (e.g. "PA-AB" becomes "PA_AB") on both splits.
X_train.columns = X_train.columns.map(
    lambda label: "".join(ch if ch.isalnum() else "_" for ch in str(label))
)
X_val.columns = X_val.columns.map(
    lambda label: "".join(ch if ch.isalnum() else "_" for ch in str(label))
)

In [37]:
# Repeats the earlier category cast on the training frame. It is redundant,
# but harmless because casting an already-categorical column changes nothing.
cat_features = ['TB_SC']
pitcher_train = pitcher_train.astype({column: 'category' for column in cat_features})

pitcher_train.dtypes

TB_SC    category
PA-AB     float64
H1        float64
H2        float64
H3        float64
HR        float64
SB_SR     float64
WP        float64
BABIP     float64
KK9       float64
BB9       float64
INN2        int64
ER          int64
dtype: object

In [38]:
# scikit-learn's random forest needs categorical columns turned into dummy
# (one-hot) variables first.
pitcher_train = pitcher_train.pipe(pd.get_dummies)
pitcher_train.head(n=5)

Unnamed: 0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,INN2,ER,TB_SC_B,TB_SC_T
0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.277778,3.6,1.8,19,2,0,1
1,1.0,7.0,2.0,0.0,0.0,0.0,0.0,0.333333,2.842105,0.0,16,8,1,0
2,2.0,7.0,1.0,0.0,1.0,0.0,0.0,0.380952,3.375,3.375,15,5,1,0
3,1.0,6.0,1.0,0.0,2.0,1.0,0.0,0.368421,3.6,1.8,21,0,1,0
4,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.1,2.571429,3.857143,15,3,1,0


In [39]:
# One-hot encode the test frame, then align it to the training feature columns
# (every column of the already-encoded pitcher_train except the INN2/ER
# targets). Encoding the two frames independently only yields matching columns
# when both contain the same TB_SC levels. The reindex adds any missing dummy
# column as 0, drops dummies absent from the training frame, and enforces the
# training column order.
pitcher_test = pd.get_dummies(pitcher_test).reindex(
    columns=pitcher_train.columns.drop(['INN2', 'ER']),
    fill_value=0,
)
pitcher_test.head()

Unnamed: 0_level_0,PA-AB,H1,H2,H3,HR,SB_SR,WP,BABIP,KK9,BB9,TB_SC_B,TB_SC_T
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2,4,1,0,0,0.0,0,0.294118,6.0,3.0,0,1
1,4,0,0,0,1,0.0,0,0.0,0.0,54.0,1,0
0,2,8,1,1,1,0.0,1,0.526316,11.571429,3.857143,0,1
1,5,4,0,0,0,0.0,0,0.2,5.4,6.75,1,0
0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,1


In [40]:
# Rebuild the INN2 split from the one-hot encoded frame.
y = pitcher_train['INN2']
X = pitcher_train.drop(['INN2', 'ER'], axis=1)

# Same 70/30 shuffled split and seed as before.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=2020,
    shuffle=True,
)

print(X_train.shape, X_val.shape, X.shape, sep="\n")

(19108, 12)
(8190, 12)
(27298, 12)


In [41]:
# Random forest for the INN2 target.
# max_features=1.0 considers every feature at each split. That is what the
# former 'auto' setting meant for regressors; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error.
# The float form is also accepted by older releases.
estimator = RandomForestRegressor(bootstrap=True, max_depth=10,
                                  max_features=1.0, min_samples_leaf=5,
                                  min_samples_split=2, n_estimators=1000, random_state=2020)

estimator.fit(X_train, y_train)

# R^2 on the training split, then on the validation split.
print(estimator.score(X_train, y_train))
print(estimator.score(X_val, y_val))

# Validation predictions, kept for the error metric computed afterwards.
reg_prediction=estimator.predict(X_val)

0.7253847211427864
0.6585759417650962


In [42]:
# Validation mean squared error of the INN2 forest.
mse_inn2_rf = mean_squared_error(y_true=y_val, y_pred=reg_prediction)
mse_inn2_rf

12.708676002629474

### 투수 - ER

In [43]:
# Same features as the INN2 model; the target is now ER.
y = pitcher_train['ER']
X = pitcher_train.drop(['INN2', 'ER'], axis=1)

# Identical split settings (70/30, shuffled, seed 2020).
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=2020,
    shuffle=True,
)

print(X_train.shape, X_val.shape, X.shape, sep="\n")

(19108, 12)
(8190, 12)
(27298, 12)


In [44]:
# Random forest for the ER target; fit() returns the fitted estimator itself.
estimator = RandomForestRegressor(
    n_estimators=500,
    max_depth=10,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=5,
    bootstrap=True,
    random_state=2020,
).fit(X_train, y_train)

# R^2 on the training split, then on the validation split.
print(
    estimator.score(X_train, y_train),
    estimator.score(X_val, y_val),
    sep="\n",
)

# Validation predictions, kept for the error metric computed afterwards.
reg_prediction = estimator.predict(X_val)

0.33422752565426506
0.27746580384328023


In [45]:
# Validation mean squared error of the ER forest.
mse_er_rf = mean_squared_error(y_true=y_val, y_pred=reg_prediction)
mse_er_rf

2.019370668538266

In [46]:
# Unweighted mean of the two validation MSEs (INN2 and ER).
0.5 * (mse_inn2_rf + mse_er_rf)

7.36402333558387