In [6]:
import numpy as np
import pandas as pd
import datetime

## 기존 데이터프레임

In [7]:
batter = pd.read_csv('private_batter.csv')
batter = batter.sort_values(by=['P_ID','GDAY_DS'])

In [8]:
batter['GDAY_DS'] = pd.to_datetime(batter['GDAY_DS'],format='%Y-%m-%d')

In [9]:
batter[batter['P_ID']==50054].head()

Unnamed: 0,GDAY_DS,T_ID,P_ID,TB_SC,PA-AB,AB,RUN,RBI,HIT,SH+SF,KK,AVG,SB_trial,BABIP
76138,2020-06-03,KT,50054,B,0,3,0,0,2,0,0,0.666667,0,0.666667
76262,2020-06-04,KT,50054,B,1,3,1,0,0,0,1,0.0,0,0.0
76522,2020-06-06,KT,50054,T,0,0,0,0,0,0,0,0.0,0,0.0
76647,2020-06-07,KT,50054,T,0,3,0,0,1,0,2,0.333333,0,1.0
76902,2020-06-10,KT,50054,B,0,2,0,0,1,0,0,0.5,0,0.5


## 데이터프레임 변형: X값 그대로 Y값은 다음 경기의 값

In [10]:
df=pd.DataFrame()
for i in batter.P_ID.unique():
    temp = batter[batter['P_ID']==i].drop('AVG',axis=1)
    temp = temp.reset_index(drop='index')
    future_y = temp[['AB','HIT']]
    x = temp.drop(['AB','HIT'],axis=1)
    x = x.shift(periods=1,axis=0)
    new = pd.concat([x,future_y],axis=1)
    new = new.drop(0)
    df = pd.concat([df,new])

In [11]:
df = df.reset_index(drop='index')

In [12]:
df[df['P_ID']==50054].head()

Unnamed: 0,GDAY_DS,T_ID,P_ID,TB_SC,PA-AB,RUN,RBI,SH+SF,KK,SB_trial,BABIP,AB,HIT
0,2020-06-03,KT,50054.0,B,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,3,0
1,2020-06-04,KT,50054.0,B,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0
2,2020-06-06,KT,50054.0,T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1
3,2020-06-07,KT,50054.0,T,0.0,0.0,0.0,0.0,2.0,0.0,1.0,2,1
4,2020-06-10,KT,50054.0,B,0.0,0.0,0.0,0.0,0.0,0.0,0.5,4,3


## TEST SET 구성: 각 선수별 마지막 경기 X값

In [13]:
final_x = pd.DataFrame()
for i in batter.P_ID.unique():
    temp = batter[batter['P_ID']==i].drop(['AVG','AB','HIT'],axis=1).tail(1)
    final_x = pd.concat([final_x,temp])

In [14]:
final_x = final_x[final_x['GDAY_DS'].dt.year>=2020]

In [15]:
df = pd.concat([pd.get_dummies(df['TB_SC']),df],axis=1)
final_x = pd.concat([pd.get_dummies(final_x['TB_SC']),final_x],axis=1)

In [16]:
final_x[final_x['P_ID']==50054]

Unnamed: 0,B,T,GDAY_DS,T_ID,P_ID,TB_SC,PA-AB,RUN,RBI,SH+SF,KK,SB_trial,BABIP
81003,0,1,2020-07-19,KT,50054,T,0,0,0,0,1,0,0.0


In [17]:
test_x = final_x.drop(['GDAY_DS','T_ID','P_ID','TB_SC'],axis=1)

## 모델링

In [18]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

## AB예측

In [None]:
X= df.drop(columns=['GDAY_DS','T_ID','P_ID','HIT','AB','TB_SC','HIT'])
y= df['AB']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 2020)

In [19]:
colsample_bytree = [0.5, 0.7, 1]
subsample = [0, 0.5, 1]
gamma = [0, 2]
learning_rate = [0.01, 0.1]
n_estimators = [300, 400, 500]

param_grid = dict(learning_rate = learning_rate,
                  n_estimators=n_estimators, colsample_bytree = colsample_bytree, subsample = subsample, gamma = gamma)

In [22]:
xgb_model = XGBRegressor()
                                
print("GRID SEARCH START")
grid_search = GridSearchCV(xgb_model, param_grid, scoring= "neg_mean_squared_error")
grid_result = grid_search.fit(X_train, y_train)

GRID SEARCH START


In [23]:
print(grid_result.best_score_)

print(grid_result.best_params_)

print("\nBest: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = -grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

-2.0515226760294203
{'colsample_bytree': 1, 'gamma': 2, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}

Best: -2.051523 using {'colsample_bytree': 1, 'gamma': 2, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}


'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 15, 'min_samples_split': 2, 'n_estimators': 1000}

## AB예측

In [24]:
xgb1 = XGBRegressor(colsample_bytree= 1, gamma= 2, learning_rate= 0.01, n_estimators= 500, subsample= 0.5)
xgb1.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=2, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.5,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
AB_predict = xgb1.predict(X_val)

In [26]:
mse = mean_squared_error(y_val, AB_predict)
mse

2.040177141679443

In [27]:
AB = xgb1.predict(test_x)

## HIT 예측

In [40]:
X= df.drop(columns=['GDAY_DS','T_ID','P_ID','HIT','AB','TB_SC','HIT'])
y= df['HIT']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 2020)

In [41]:
colsample_bytree = [0.5, 0.7, 1]
subsample = [0, 0.5, 1]
gamma = [0, 2]
learning_rate = [0.01, 0.1]
n_estimators = [300, 400, 500]

param_grid = dict(learning_rate = learning_rate,
                  n_estimators=n_estimators, colsample_bytree = colsample_bytree, subsample = subsample, gamma = gamma)

In [42]:
xgb_model = XGBRegressor()
                                
print("GRID SEARCH START")
grid_search = GridSearchCV(xgb_model, param_grid, scoring= "neg_mean_squared_error")
grid_result = grid_search.fit(X_train, y_train)

GRID SEARCH START


In [43]:
print(grid_result.best_score_)

print(grid_result.best_params_)

print("\nBest: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = -grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

-0.7804069499112856
{'colsample_bytree': 0.7, 'gamma': 2, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}

Best: -0.780407 using {'colsample_bytree': 0.7, 'gamma': 2, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.5}


In [44]:
xgb2 = XGBRegressor(colsample_bytree= 1, gamma= 2, learning_rate= 0.01, n_estimators= 500, subsample= 0.5)
xgb2.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=2, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.5,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
HIT_predict = xgb2.predict(X_val)

In [46]:
mse = mean_squared_error(y_val, HIT_predict)
mse

0.79183382994687

In [47]:
HIT = xgb2.predict(test_x)

In [63]:
final_x['AB']=AB

In [64]:
final_x['HIT']=HIT

In [65]:
final_x.head()

Unnamed: 0,B,T,GDAY_DS,T_ID,P_ID,TB_SC,PA-AB,RUN,RBI,SH+SF,KK,SB_trial,BABIP,AB,HIT
81003,0,1,2020-07-19,KT,50054,T,0,0,0,0,1,0,0.0,2.314097,0.623417
81004,0,1,2020-07-19,KT,50066,T,0,0,0,0,0,0,0.0,1.821584,0.475197
80726,1,0,2020-07-17,LG,50150,B,0,0,0,0,2,0,0.0,2.708803,0.751718
80858,1,0,2020-07-18,LG,50165,B,0,0,0,0,2,0,0.0,2.708803,0.751718
81051,0,1,2020-07-19,OB,50202,T,0,0,0,0,0,0,0.0,1.821584,0.475197


## AVG 예측

In [66]:
hit_by_team = final_x['HIT'].groupby(final_x['T_ID']).sum()
hit_by_team

T_ID
HH    23.238651
HT    18.133801
KT    16.009571
LG    21.311625
LT    17.984184
NC    20.233149
OB    17.625916
SK    18.428888
SS    19.416904
WO    20.902374
Name: HIT, dtype: float32

In [67]:
AB_by_team = final_x['AB'].groupby(final_x['T_ID']).sum()
AB_by_team

T_ID
HH    84.899323
HT    64.913399
KT    59.125050
LG    76.692596
LT    64.951546
NC    71.702560
OB    64.859940
SK    67.529091
SS    70.143272
WO    76.090034
Name: AB, dtype: float32

In [68]:
AVG_by_team = hit_by_team / AB_by_team
AVG_by_team

T_ID
HH    0.273720
HT    0.279354
KT    0.270775
LG    0.277884
LT    0.276886
NC    0.282182
OB    0.271753
SK    0.272903
SS    0.276818
WO    0.274706
dtype: float32

## 참고: 실제 데이터로 구한 AVG

In [53]:
batter = batter[batter['GDAY_DS'].dt.year>=2020]

In [54]:
hit_2020 = batter['HIT'].groupby(batter['T_ID']).sum()
ab_2020 = batter['AB'].groupby(batter['T_ID']).sum()

In [55]:
avg_2020 = hit_2020/ab_2020
avg_2020

T_ID
HH    0.239796
HT    0.275632
KT    0.290933
LG    0.279059
LT    0.274238
NC    0.290121
OB    0.302407
SK    0.242661
SS    0.274465
WO    0.270916
dtype: float64

In [70]:
final_x.to_csv("report_data/new_batter_report_xgb.csv")