In [None]:
import numpy as np
import pandas as pd
import datetime

## 1. 데이터프레임 변형
- 해당 날짜의 x값은 그대로 사용하되, y값은 다음 경기의 값으로 변형
- test data는 각 선수 별 가장 최근 경기의 x값으로 구성

### 기존 데이터프레임

In [None]:
# Load the raw batter game log, order it by player and game date, and parse
# the date column into timestamps.
batter = (
    pd.read_csv('data/private_batter.csv')
    .sort_values(by=['P_ID', 'GDAY_DS'])
    .assign(GDAY_DS=lambda games: pd.to_datetime(games['GDAY_DS'], format='%Y-%m-%d'))
)

In [None]:
# Load the raw pitcher game log, order it by player and game date, and parse
# the date column into timestamps.
pitcher = (
    pd.read_csv('data/private_pitcher.csv')
    .sort_values(by=['P_ID', 'GDAY_DS'])
    .assign(GDAY_DS=lambda games: pd.to_datetime(games['GDAY_DS'], format='%Y-%m-%d'))
)

### 타자 데이터프레임 변형

- train 데이터

In [None]:
# Build the batter training frame: every row pairs one game's features with
# the AB/HIT the same player recorded in his following game.
# The per-player pieces are collected in a list and concatenated once, instead
# of growing `df` with pd.concat on every iteration (quadratic copying).
player_frames = []
for _, games in batter.drop(columns='AVG').groupby('P_ID', sort=False):
    games = games.reset_index(drop=True)
    next_game_y = games[['AB', 'HIT']]
    # Shifting the features down by one row lines up game k-1's features
    # with game k's targets.
    prev_game_x = games.drop(columns=['AB', 'HIT']).shift(periods=1, axis=0)
    # Row 0 has no previous game, so its shifted features are all missing.
    player_frames.append(pd.concat([prev_game_x, next_game_y], axis=1).iloc[1:])

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(player_frames) if player_frames else pd.DataFrame()

In [None]:
# Replace the per-player row labels with one continuous 0..n-1 index.
df = df.reset_index(drop=True)

In [None]:
# Persist the batter training frame without writing the index column.
df.to_csv(path_or_buf='data/batter_train.csv', index=False)

- test x

In [None]:
# Test features: the last row of every batter (rows are ordered by P_ID and
# GDAY_DS when `batter` is loaded), without the AVG/AB/HIT columns.
# groupby(...).tail(1) replaces the per-player pd.concat loop, which copied
# the accumulated frame on every iteration.
final_x = (
    batter
    .drop(columns=['AVG', 'AB', 'HIT'])
    .groupby('P_ID', sort=False)
    .tail(1)
)

In [None]:
# Keep only rows whose game date falls in 2020 or later, renumber the rows,
# and write the batter test features to disk.
played_since_2020 = final_x['GDAY_DS'].dt.year >= 2020
final_x = final_x.loc[played_since_2020].reset_index(drop=True)
final_x.to_csv(path_or_buf='data/batter_test.csv', index=False)

### 투수 데이터프레임 변형

- train 데이터

In [None]:
# Build the pitcher training frame: every row pairs one game's features with
# the INN2/ER the same player recorded in his following game.
# The per-player pieces are collected in a list and concatenated once, instead
# of growing `df` with pd.concat on every iteration (quadratic copying).
player_frames = []
for _, games in pitcher.drop(columns='ERA').groupby('P_ID', sort=False):
    games = games.reset_index(drop=True)
    next_game_y = games[['INN2', 'ER']]
    # Shifting the features down by one row lines up game k-1's features
    # with game k's targets.
    prev_game_x = games.drop(columns=['INN2', 'ER']).shift(periods=1, axis=0)
    # Row 0 has no previous game, so its shifted features are all missing.
    player_frames.append(pd.concat([prev_game_x, next_game_y], axis=1).iloc[1:])

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(player_frames) if player_frames else pd.DataFrame()

In [None]:
# Renumber the rows and persist the pitcher training frame without the index.
df = df.reset_index(drop=True)
df.to_csv(path_or_buf='data/pitcher_train.csv', index=False)

- test x 

In [None]:
# Test features: the last row of every pitcher (rows are ordered by P_ID and
# GDAY_DS when `pitcher` is loaded), without the INN2/ER/ERA columns.
# groupby(...).tail(1) replaces the per-player pd.concat loop, which copied
# the accumulated frame on every iteration.
final_x = (
    pitcher
    .drop(columns=['INN2', 'ER', 'ERA'])
    .groupby('P_ID', sort=False)
    .tail(1)
)

In [None]:
# Keep only rows whose game date falls in 2020 or later.
played_since_2020 = final_x['GDAY_DS'].dt.year >= 2020
final_x = final_x.loc[played_since_2020]

In [None]:
# Renumber the rows and write the pitcher test features to disk.
final_x = final_x.reset_index(drop=True)
final_x.to_csv(path_or_buf='data/pitcher_test.csv', index=False)

## 2.투수: LGBM
- y값은 ER, INN2

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Reload the pitcher train/test frames that were written to disk earlier.
pitcher_train, pitcher_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/pitcher_train.csv", "data/pitcher_test.csv")
)

In [None]:
# Keep the team id of every test row; T_ID is dropped from the features
# below and re-attached after prediction.
pitcher_team = pitcher_test['T_ID']

In [None]:
# Remove the date/identifier columns and the stats that are not used as
# model inputs; the same set is removed from both frames.
unused_columns = ['GDAY_DS', 'T_ID', 'P_ID', 'BF', 'AB', 'HIT', 'KK', 'SLG']
pitcher_train = pitcher_train.drop(columns=unused_columns)
pitcher_test = pitcher_test.drop(columns=unused_columns)

In [None]:
# Cast the categorical feature columns to the pandas 'category' dtype in both frames.
cat_features = ['TB_SC']
for frame in (pitcher_train, pitcher_test):
    frame[cat_features] = frame[cat_features].astype('category')

In [None]:
# One-hot encode the categorical columns.
pitcher_train = pd.get_dummies(pitcher_train)

# Encoding train and test independently can yield different dummy columns
# (a category present in only one frame), which would break or misalign
# predict(). Force the test frame onto the training feature layout: missing
# dummy columns are added as 0, unseen ones are dropped, and the column order
# follows the training frame.
feature_columns = [col for col in pitcher_train.columns if col not in ('ER', 'INN2')]
pitcher_test = pd.get_dummies(pitcher_test).reindex(columns=feature_columns, fill_value=0)

- ER 예측

In [None]:
# Features are every column except the two targets; this model predicts ER.
target_columns = ['ER', 'INN2']
X = pitcher_train.drop(columns=target_columns)
y = pitcher_train['ER']

# Random 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

In [None]:
# LightGBM regressor used for the ER target.
# Only the scikit-learn-style parameter names are used here. The earlier call
# mixed them with LightGBM's native aliases (num_boost_round, lambda_l1,
# lambda_l2, min_data_in_leaf) and also passed reg_alpha=0.1, which is an
# alias of lambda_l1 and therefore conflicted with lambda_l1=1.5.
# NOTE(review): the values kept are the ones that were given under the native
# names, since LightGBM appears to prefer the primary name over an alias
# (it logs "... will be ignored" for the alias) -- confirm this matches the
# configuration that was tuned.
lgb1 = LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=2000,
    learning_rate=0.01,
    reg_alpha=1.5,
    reg_lambda=1,
    min_child_samples=400,
    num_leaves=30,
)

In [None]:
# Fit on the training split and report the mean squared error on the
# validation split as the cell output.
lgb1.fit(X_train, y_train)
y_pred = lgb1.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict ER for every row of the pitcher test frame with the fitted lgb1 model.
ER = lgb1.predict(pitcher_test)

- INN2 예측

In [None]:
# Same feature matrix as for ER; this model predicts INN2.
target_columns = ['INN2', 'ER']
X = pitcher_train.drop(columns=target_columns)
y = pitcher_train['INN2']

# Random 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

In [None]:
# LightGBM regressor used for the INN2 target.
# Only the scikit-learn-style parameter names are used here. The earlier call
# mixed them with LightGBM's native aliases (num_boost_round, lambda_l1,
# lambda_l2, min_data_in_leaf) and also passed reg_alpha=0.1, which is an
# alias of lambda_l1 and therefore conflicted with lambda_l1=1.5.
# NOTE(review): the values kept are the ones that were given under the native
# names, since LightGBM appears to prefer the primary name over an alias
# (it logs "... will be ignored" for the alias) -- confirm this matches the
# configuration that was tuned.
lgb2 = LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=2000,
    learning_rate=0.01,
    reg_alpha=1.5,
    reg_lambda=0,
    min_child_samples=300,
    num_leaves=50,
)

In [None]:
# Fit on the training split and report the mean squared error on the
# validation split as the cell output.
lgb2.fit(X_train, y_train)
y_pred = lgb2.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict INN2 for every row of the pitcher test frame with the fitted lgb2 model.
INN2 = lgb2.predict(pitcher_test)

In [None]:
# Attach both predictions (ER rounded to two decimals) and the saved team id
# to the pitcher test frame.
pitcher_test = pitcher_test.assign(
    INN2=INN2,
    ER=np.round(ER, 2),
    T_ID=pitcher_team,
)

## 3.타자: XGB
- y값은 HIT, AB

In [None]:
# Reload the batter train/test frames that were written to disk earlier.
batter_train, batter_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/batter_train.csv", "data/batter_test.csv")
)

In [None]:
# Keep the team id of every test row; T_ID is dropped from the features
# below and re-attached after prediction.
batter_team = batter_test['T_ID']

In [None]:
# Remove the date and identifier columns from both frames.
id_columns = ['GDAY_DS', 'T_ID', 'P_ID']
batter_train = batter_train.drop(columns=id_columns)
batter_test = batter_test.drop(columns=id_columns)

In [None]:
# Cast the categorical feature columns to the pandas 'category' dtype in both frames.
cat_features = ['TB_SC']
for frame in (batter_train, batter_test):
    frame[cat_features] = frame[cat_features].astype('category')

In [None]:
# One-hot encode the categorical columns.
batter_train = pd.get_dummies(batter_train)

# Encoding train and test independently can yield different dummy columns
# (a category present in only one frame), which would break or misalign
# predict(). Force the test frame onto the training feature layout: missing
# dummy columns are added as 0, unseen ones are dropped, and the column order
# follows the training frame.
feature_columns = [col for col in batter_train.columns if col not in ('HIT', 'AB')]
batter_test = pd.get_dummies(batter_test).reindex(columns=feature_columns, fill_value=0)

- AB

In [None]:
# Features are every column except the two targets; this model predicts AB.
target_columns = ['HIT', 'AB']
X = batter_train.drop(columns=target_columns)
y = batter_train['AB']

# Random 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

In [None]:
# XGBoost regressor used for the AB target.
xgb1 = XGBRegressor(
    colsample_bytree=1,
    gamma=2,
    learning_rate=0.01,
    n_estimators=500,
    subsample=0.5,
)

In [None]:
# Fit on the training split and report the mean squared error on the
# validation split as the cell output.
xgb1.fit(X_train, y_train)
y_pred = xgb1.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict AB for every row of the batter test frame with the fitted xgb1 model.
AB = xgb1.predict(batter_test)

- HIT

In [None]:
# Same feature matrix as for AB; this model predicts HIT.
target_columns = ['HIT', 'AB']
X = batter_train.drop(columns=target_columns)
y = batter_train['HIT']

# Random 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2020,
)

In [None]:
# XGBoost regressor used for the HIT target.
xgb2 = XGBRegressor(
    colsample_bytree=0.7,
    gamma=2,
    learning_rate=0.01,
    n_estimators=500,
    subsample=0.5,
)

In [None]:
# Fit on the training split and report the mean squared error on the
# validation split as the cell output.
xgb2.fit(X_train, y_train)
y_pred = xgb2.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mse

In [None]:
# Predict HIT for every row of the batter test frame with the fitted xgb2 model.
HIT =  xgb2.predict(batter_test)

In [None]:
# Attach both predictions and the saved team id to the batter test frame.
batter_test = batter_test.assign(
    AB=AB,
    HIT=HIT,
    T_ID=batter_team,
)

## 4.최종적인 Y값 : AVG, ERA, 승률

In [None]:
# avg: per-team sum of predicted HIT divided by per-team sum of predicted AB
batter_by_team = batter_test.groupby('T_ID')
hit = batter_by_team['HIT'].sum()
ab = batter_by_team['AB'].sum()
AVG = hit / ab

In [None]:
# era: computed from the per-team sums of predicted INN2 and ER
# NOTE(review): the division by 3 is applied to the ER sum. If INN2 counts
# outs (innings * 3), the conventional ERA = 9 * ER / innings would need the
# /3 on inn2 instead -- confirm the intended formula.
pitcher_by_team = pitcher_test.groupby('T_ID')
inn2 = pitcher_by_team['INN2'].sum()
er = pitcher_by_team['ER'].sum() / 3
ERA = er * 9 / inn2

In [None]:
# win rate: load the two *_T tables used for the calculation
batter_T, pitcher_T = (
    pd.read_csv(csv_path)
    for csv_path in ('data/batter_T.csv', 'data/pitcher_T.csv')
)

In [None]:
# Per-team totals of RUN (batter table) and R (pitcher table), combined as
# RUN^2 / (RUN^2 + R^2).
run = batter_T.groupby('T_ID')['RUN'].sum()
R = pitcher_T.groupby('T_ID')['R'].sum()
run_sq, r_sq = run ** 2, R ** 2
WR = run_sq / (run_sq + r_sq)

In [None]:
# Collect the three per-team series into one table, one column per metric.
team_metrics = {
    '타율': AVG,
    '방어율': ERA,
    '승률': WR,
}
df = pd.DataFrame(team_metrics)

In [None]:
# Display the final table as the cell output.
df