In [None]:
import numpy as np
import pandas as pd
import datetime

## 1. 데이터프레임 변형
- 해당 날짜의 x값은 그대로 사용하되, y값은 다음 경기의 값으로 변형
- test data는 각 선수 별 가장 최근 경기의 x값으로 구성

### 기존 데이터프레임

In [None]:
# Load the per-game batter records and order each player's games by date.
# The date column is parsed BEFORE sorting so the sort compares real
# timestamps instead of raw strings (string order only matches date order
# when every date is zero-padded).
batter = pd.read_csv('data/private_batter.csv')
batter['GDAY_DS'] = pd.to_datetime(batter['GDAY_DS'], format='%Y-%m-%d')
batter = batter.sort_values(by=['P_ID', 'GDAY_DS'])

In [None]:
# Load the per-game pitcher records and order each player's games by date.
# The date column is parsed BEFORE sorting so the sort compares real
# timestamps instead of raw strings (string order only matches date order
# when every date is zero-padded).
pitcher = pd.read_csv('data/private_pitcher.csv')
pitcher['GDAY_DS'] = pd.to_datetime(pitcher['GDAY_DS'], format='%Y-%m-%d')
pitcher = pitcher.sort_values(by=['P_ID', 'GDAY_DS'])

### 타자 데이터프레임 변형

- train 데이터

In [None]:
# Build the batter training frame: every row pairs a player's feature values
# from one game with the AB/HIT recorded in that player's following game.
train_parts = []
# sort=False keeps players in order of first appearance, like P_ID.unique().
for _, games in batter.groupby('P_ID', sort=False):
    games = games.drop(columns='AVG').reset_index(drop=True)
    targets = games[['AB', 'HIT']]
    # Shift the features down one row so row t holds the values of game t-1.
    features = games.drop(columns=['AB', 'HIT']).shift(periods=1)
    # Row 0 has no previous game (all-NaN features), so it is discarded.
    train_parts.append(pd.concat([features, targets], axis=1).iloc[1:])

# Concatenate once at the end; growing a DataFrame inside the loop re-copies
# everything accumulated so far on every iteration.
df = pd.concat(train_parts) if train_parts else pd.DataFrame()

In [None]:
# Replace the per-player row labels with one continuous 0..n-1 index.
# `drop` is a boolean flag; the string 'index' only worked because it is truthy.
df = df.reset_index(drop=True)

In [None]:
# Persist the batter training frame. With the default index=True the row index
# is written as an unnamed leading column of the CSV.
df.to_csv(path_or_buf='data/batter_train.csv')

- test x

In [None]:
# Test features: the last row of each batter (the most recent game, given that
# `batter` was sorted by player and date when it was loaded), without the
# target-related columns.
latest_rows = [
    games.drop(columns=['AVG', 'AB', 'HIT']).tail(1)
    # sort=False keeps players in order of first appearance, like P_ID.unique().
    for _, games in batter.groupby('P_ID', sort=False)
]
# Single concat instead of growing the frame once per player.
final_x = pd.concat(latest_rows) if latest_rows else pd.DataFrame()

In [None]:
# Keep only players whose latest game was played in 2020 or later.
played_since_2020 = final_x['GDAY_DS'].dt.year >= 2020
final_x = final_x[played_since_2020]

# Prepend one-hot columns for TB_SC to both frames; TB_SC itself is kept.
# NOTE(review): `df` was already written to data/batter_train.csv in an earlier
# cell, so the dummy columns added to it here are not part of that file.
train_dummies = pd.get_dummies(df['TB_SC'])
test_dummies = pd.get_dummies(final_x['TB_SC'])
df = pd.concat([train_dummies, df], axis=1)
final_x = pd.concat([test_dummies, final_x], axis=1)

In [None]:
# Renumber the rows 0..n-1 and persist the batter test features.
# `drop` is a boolean flag; the string 'index' only worked because it is truthy.
final_x = final_x.reset_index(drop=True)
final_x.to_csv('data/batter_test.csv')

### 투수 데이터프레임 변형

- train 데이터

In [None]:
# Build the pitcher training frame: every row pairs a player's feature values
# from one game with the INN2/ER recorded in that player's following game.
train_parts = []
# sort=False keeps players in order of first appearance, like P_ID.unique().
for _, games in pitcher.groupby('P_ID', sort=False):
    games = games.drop(columns='ERA').reset_index(drop=True)
    targets = games[['INN2', 'ER']]
    # Shift the features down one row so row t holds the values of game t-1.
    features = games.drop(columns=['INN2', 'ER']).shift(periods=1)
    # Row 0 has no previous game (all-NaN features), so it is discarded.
    train_parts.append(pd.concat([features, targets], axis=1).iloc[1:])

# Concatenate once at the end; growing a DataFrame inside the loop re-copies
# everything accumulated so far on every iteration.
df = pd.concat(train_parts) if train_parts else pd.DataFrame()

In [None]:
# Renumber the rows 0..n-1 and persist the pitcher training frame.
# `drop` is a boolean flag; the string 'index' only worked because it is truthy.
df = df.reset_index(drop=True)
df.to_csv('data/pitcher_train.csv')

- test x 

In [None]:
# Test features: the last row of each pitcher (the most recent game, given that
# `pitcher` was sorted by player and date when it was loaded), without the
# target-related columns.
latest_rows = [
    games.drop(columns=['INN2', 'ER', 'ERA']).tail(1)
    # sort=False keeps players in order of first appearance, like P_ID.unique().
    for _, games in pitcher.groupby('P_ID', sort=False)
]
# Single concat instead of growing the frame once per player.
final_x = pd.concat(latest_rows) if latest_rows else pd.DataFrame()

In [None]:
# Keep only players whose latest game was played in 2020 or later.
played_since_2020 = final_x['GDAY_DS'].dt.year >= 2020
final_x = final_x[played_since_2020]

# Prepend one-hot columns for TB_SC to both frames; TB_SC itself is kept.
# NOTE(review): `df` was already written to data/pitcher_train.csv in an earlier
# cell, so the dummy columns added to it here are not part of that file.
train_dummies = pd.get_dummies(df['TB_SC'])
test_dummies = pd.get_dummies(final_x['TB_SC'])
df = pd.concat([train_dummies, df], axis=1)
final_x = pd.concat([test_dummies, final_x], axis=1)

In [None]:
# Renumber the rows 0..n-1 and persist the pitcher test features.
# `drop` is a boolean flag; the string 'index' only worked because it is truthy.
final_x = final_x.reset_index(drop=True)
final_x.to_csv('data/pitcher_test.csv')

## 2. 투수: LGBM
- y값은 ER, INN2

In [None]:
# Reload the pitcher train/test tables that the preparation cells wrote to disk.
pitcher_train, pitcher_test = map(
    pd.read_csv,
    ("data/pitcher_train.csv", "data/pitcher_test.csv"),
)

In [None]:
# Restrict both frames to the modelling columns: the shared feature columns,
# plus the two targets (INN2, ER) on the training side.
pitcher_feature_cols = ["TB_SC","PA-AB","H1","H2","H3","HR","SB_SR","WP","BABIP","KK9","BB9"]
pitcher_target_cols = ["INN2","ER"]

# .copy() makes these independent frames, so the column assignments done in
# later cells do not raise SettingWithCopyWarning.
pitcher_train = pitcher_train[pitcher_feature_cols + pitcher_target_cols].copy()
pitcher_test = pitcher_test[pitcher_feature_cols].copy()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import lightgbm
from lightgbm import LGBMRegressor
from math import sqrt

In [None]:
# Cast the categorical column(s) to pandas' category dtype in both frames.
cat_features = ['TB_SC']
for pitcher_frame in (pitcher_train, pitcher_test):
    pitcher_frame[cat_features] = pitcher_frame[cat_features].astype('category')

- ER 예측

In [None]:
# ER model: the features are every column except the two targets and TB_SC.
er_excluded_cols = ['ER', 'INN2', 'TB_SC']
X = pitcher_train.drop(columns=er_excluded_cols)
y = pitcher_train['ER']

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

In [None]:
def _alnum_only(label):
    """Return str(label) with every non-alphanumeric character replaced by '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(label))


# Apply the same renaming to the training and validation feature names.
X_train.columns = [_alnum_only(col) for col in X_train.columns]
X_val.columns = [_alnum_only(col) for col in X_val.columns]

In [None]:
# Hyper-parameter grid for the pitcher LightGBM models (reused by the INN2 cell).
# `reg_alpha` was removed: it is an alias of `lambda_l1`, so both keys set the
# same L1 penalty and listing both only multiplied the number of fits without
# adding distinct configurations.
params_grid = {
    'num_leaves': [30, 50, 70],
    'min_data_in_leaf': [30, 50, 100, 300, 400],
    'lambda_l1': [0, 1, 1.5],
    'lambda_l2': [0, 1]
    }
# `n_estimators` is the scikit-learn-API name for the number of boosting rounds
# (previously passed through the `num_boost_round` alias).
lgb = LGBMRegressor(boosting_type='gbdt', n_estimators=2000, learning_rate=0.01)
# No `scoring` is given, so candidates are ranked by the estimator's own
# .score() rather than by the MSE reported in the next cell.
lgb_grid = GridSearchCV(estimator=lgb,
                        param_grid=params_grid,
                        n_jobs=10,
                        verbose=3)
lgb_grid.fit(X_train,y_train)

In [None]:
# Hold-out mean squared error of the best ER model found by the grid search.
y_pred = lgb_grid.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict ER for the test players using exactly the columns the model was fit on.
# pitcher_test also carries TB_SC (excluded from X), and X_train's column names
# were sanitised after the split, so select by X's original names and then
# relabel them to the names the model saw during fit.
er_test_features = pitcher_test[X.columns].copy()
er_test_features.columns = X_train.columns
ER = lgb_grid.predict(er_test_features)
pitcher_test['ER'] = np.round(ER,2)

- INN2 예측

In [None]:
# INN2 model: the features are every column except the two targets and TB_SC.
inn2_excluded_cols = ['INN2', 'ER', 'TB_SC']
X = pitcher_train.drop(columns=inn2_excluded_cols)
y = pitcher_train['INN2']

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

In [None]:
# Grid-search a LightGBM regressor for INN2 over the `params_grid` defined for
# the ER model.
# `n_estimators` is the scikit-learn-API name for the number of boosting rounds
# (previously passed through the `num_boost_round` alias).
lgb = LGBMRegressor(boosting_type='gbdt', n_estimators=2000, learning_rate=0.01)
lgb_grid = GridSearchCV(estimator=lgb,
                        param_grid=params_grid,
                        n_jobs=10,
                        verbose=3)
lgb_grid.fit(X_train,y_train)

In [None]:
# Hold-out mean squared error of the best INN2 model found by the grid search.
y_pred = lgb_grid.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict INN2 for the test players using only the columns the model was fit on.
# pitcher_test also holds TB_SC and the ER prediction added earlier, neither of
# which is a feature of this model.
INN2 = lgb_grid.predict(pitcher_test[X_train.columns])
pitcher_test['INN2'] = INN2

## 3. 타자: XGB
- y값은 HIT, AB

In [None]:
# Reload the batter train/test tables that the preparation cells wrote to disk;
# the identifier/date/category columns are removed from the test side right away.
batter_train = pd.read_csv("data/batter_train.csv")
batter_non_feature_cols = ['GDAY_DS','T_ID','P_ID','TB_SC']
batter_test = pd.read_csv("data/batter_test.csv").drop(columns=batter_non_feature_cols)

- AB

In [None]:
# AB model: drop identifiers, the date, the category column and both targets.
# NOTE(review): the row-index column written by to_csv (read back without
# index_col) is still part of X here — confirm that is intended.
ab_excluded_cols = ['GDAY_DS','T_ID','P_ID','HIT','AB','TB_SC']
X = batter_train.drop(columns=ab_excluded_cols)
y = batter_train['AB']

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

In [None]:
# Candidate values for the XGBoost grid search (shared by the AB and HIT models).
colsample_bytree = [0.5, 0.7, 1]
# subsample must lie in (0, 1]; the former candidate 0 is not a valid value and
# could only produce failed fits, so it was removed.
subsample = [0.5, 1]
gamma = [0, 2]
learning_rate = [0.01, 0.1]
n_estimators = [300, 400, 500]

param_grid = dict(learning_rate = learning_rate,
                  n_estimators=n_estimators, colsample_bytree = colsample_bytree, subsample = subsample, gamma = gamma)

In [None]:
# Exhaustive search over `param_grid` for the AB model, scored by negated MSE
# (so a larger score means a smaller error).
xgb_model = XGBRegressor()
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring="neg_mean_squared_error",
)
grid_result = grid_search.fit(X_train, y_train)

# Report the best cross-validated score, then the parameters that produced it.
for search_summary in (grid_result.best_score_, grid_result.best_params_):
    print(search_summary)

In [None]:
# Fit the AB model with a fixed parameter set.
# NOTE(review): these values are hard-coded rather than taken from
# grid_result.best_params_ — confirm they match the search result above.
ab_model_params = {
    'colsample_bytree': 1,
    'gamma': 2,
    'learning_rate': 0.01,
    'n_estimators': 500,
    'subsample': 0.5,
}
xgb1 = XGBRegressor(**ab_model_params)
xgb1.fit(X_train, y_train)

In [None]:
# Hold-out mean squared error of the AB model.
y_pred = xgb1.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict AB for the test players using only the columns the model was fit on.
# batter_test contains one-hot TB_SC columns that batter_train does not have
# (the dummies were added to the train frame only after its CSV was written),
# so passing the whole frame would not match the fitted feature set.
AB = xgb1.predict(batter_test[X_train.columns])
batter_test['AB'] = AB

- HIT

In [None]:
# HIT model: drop identifiers, the date, the category column and both targets.
# The features must come from batter_train; `df` was last rebuilt from the
# pitcher data and has no HIT/AB columns.
hit_excluded_cols = ['GDAY_DS','T_ID','P_ID','HIT','AB','TB_SC']
X = batter_train.drop(columns=hit_excluded_cols)
y = batter_train['HIT']

# 70/30 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 2020)

In [None]:
# Exhaustive search over `param_grid` for the HIT model, scored by negated MSE
# (so a larger score means a smaller error).
xgb_model = XGBRegressor()
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring="neg_mean_squared_error",
)
grid_result = grid_search.fit(X_train, y_train)

# Report the best cross-validated score, then the parameters that produced it.
for search_summary in (grid_result.best_score_, grid_result.best_params_):
    print(search_summary)

In [None]:
# Fit the HIT model with a fixed parameter set.
# NOTE(review): these values are hard-coded rather than taken from
# grid_result.best_params_ — confirm they match the search result above.
hit_model_params = {
    'colsample_bytree': 0.7,
    'gamma': 2,
    'learning_rate': 0.01,
    'n_estimators': 500,
    'subsample': 0.5,
}
xgb2 = XGBRegressor(**hit_model_params)
xgb2.fit(X_train, y_train)

In [None]:
# Hold-out mean squared error of the HIT model.
y_pred = xgb2.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict HIT for the test players using only the columns the model was fit on.
# By now batter_test also holds the predicted AB column (and one-hot TB_SC
# columns), none of which are features of this model.
HIT = xgb2.predict(batter_test[X_train.columns])
batter_test['HIT'] = HIT

## 4.최종적인 Y값 : AVG, ERA, 승률

In [None]:
# Team batting average = sum of predicted HIT / sum of predicted AB, per team.
# T_ID was dropped from batter_test when it was loaded, so the team ids are
# read back from the same CSV when missing. The rows line up because
# batter_test keeps that file's default 0..n-1 index and no rows were removed.
if 'T_ID' in batter_test.columns:
    batter_team_ids = batter_test['T_ID']
else:
    batter_team_ids = pd.read_csv('data/batter_test.csv', usecols=['T_ID'])['T_ID']

hit = batter_test['HIT'].groupby(batter_team_ids).sum()
ab = batter_test['AB'].groupby(batter_team_ids).sum()
AVG = hit/ab

In [None]:
# Team ERA from the predicted per-player INN2 and ER.
# pitcher_test was reduced to feature columns earlier and no longer has T_ID,
# so the team ids are read back from the same CSV when missing (assumes
# data/pitcher_test.csv carries a T_ID column — TODO confirm). The rows line up
# because pitcher_test keeps that file's default 0..n-1 index.
if 'T_ID' in pitcher_test.columns:
    pitcher_team_ids = pitcher_test['T_ID']
else:
    pitcher_team_ids = pd.read_csv('data/pitcher_test.csv', usecols=['T_ID'])['T_ID']

inn2 = pitcher_test['INN2'].groupby(pitcher_team_ids).sum()
# NOTE(review): the division by 3 is applied to the earned runs; if INN2 counts
# outs (innings * 3), it is inn2 that should be divided by 3 — confirm the
# unit of INN2 before relying on the absolute ERA values.
er = pitcher_test['ER'].groupby(pitcher_team_ids).sum()/3
ERA = er/inn2*9

In [None]:
# Win rate: load the team-level batter and pitcher tables.
batter_T, pitcher_T = map(
    pd.read_csv,
    ('data/batter_T.csv', 'data/pitcher_T.csv'),
)

In [None]:
# Per-team totals: `run` from the batter table, `R` from the pitcher table.
run = batter_T['RUN'].groupby(batter_T['T_ID']).sum()
# Fixed the misspelled frame name (`pitcer_T`), which raised NameError.
R = pitcher_T['R'].groupby(pitcher_T['T_ID']).sum()
# Win rate estimate: run^2 / (run^2 + R^2).
WR = (run**2)/((run**2)+(R**2))

In [None]:
# Final per-team summary table: batting average, ERA and win rate, aligned on
# the team-id index of the three series.
team_summary_columns = {
    '타율': AVG,
    '방어율': ERA,
    '승률': WR,
}
data = pd.DataFrame(team_summary_columns)
data