In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook
import time

  import pandas.util.testing as tm


In [2]:
pitcher = pd.read_csv('private_pitcher.csv',encoding='CP949')
batter = pd.read_csv('private_batter.csv', encoding='CP949')
batter['GDAY_DS'] = pd.to_datetime(batter['GDAY_DS'],format='%Y-%m-%d')
pitcher['GDAY_DS'] = pd.to_datetime(pitcher['GDAY_DS'],format='%Y-%m-%d')

# 1. 최적의 파라미터 찾기

1. 각각의 선수에 대한 mse를 구하고 평균 mse를 구해 그 값이 가장 적은 것을 최종 모델로 사용한다

2. 2020년 데이터를 제외하고 0.3 0.7의 비율로 train-test data를 나누어 grid-search 진행

3. train-data가 0인 경우가 발생; train-data가 10 이하인 경우(경기 출전 경험이 0 이하인 경우) 평균값을 예측값으로 사용

## 1) Simple Exponential smoothing

In [3]:
def x_variable(col_name):
    best_score = 100000000   
    best_level = 0
    for level in tqdm_notebook([0.1,0.2,0.4,0.6,0.8]):
        total_mse=0 
        avg_mse = 0
        for i in temp.P_ID.unique():
        
            df = temp[(temp['P_ID']==i)&(temp['GDAY_DS'].dt.year<2020)]
            df = df[['GDAY_DS',col_name]]
    
            train_num = int(round(df.count()[1]*0.7))
            train = df[0:train_num]
            test = df[train_num:]
    
            fit1 = SimpleExpSmoothing(np.array(train[col_name])).fit(smoothing_level=level)
            pred = fit1.forecast(len(test))
            pred = pd.DataFrame(pred)
    
            pred.index = test['GDAY_DS']
            pred= pred.rename(columns={0:'prediction'})
            result = pd.merge(test,pred,on='GDAY_DS')
            mse = mean_squared_error(result[col_name], result.prediction)
            total_mse = total_mse + mse

        avg_mse = total_mse/temp.P_ID.nunique()
        if avg_mse < best_score:
            best_score = avg_mse
            best_level = level
    print(col_name, '(First-method) level: ',best_level,' MSE: ',best_score)

## 2) holt's Exponential Smoothing

In [4]:
def x_variable2(col_name):
    best_score = 100000000    
    best_level = 0
    best_slope = 0
    for level in tqdm_notebook([0.1,0.2,0.4,0.6,0.8]):
        for slope in [0.1,0.2,0.4,0.6,0.8]:
            total_mse=0 
            avg_mse = 0
            for i in temp.P_ID.unique():
                time.sleep(0.01)
                
                df = temp[(temp['P_ID']==i)&(temp['GDAY_DS'].dt.year<2020)]
                df = df[['GDAY_DS',col_name]]
    
                train_num = int(round(df.count()[1]*0.7))
                train = df[0:train_num]
                test = df[train_num:]
                
                fit2 = Holt(np.array(train[col_name])).fit(smoothing_level=level, smoothing_slope=slope)
                pred = fit2.forecast(len(test))
                pred = pd.DataFrame(pred)
                
                pred.index = test['GDAY_DS']
                pred= pred.rename(columns={0:'prediction2'})
                result = pd.merge(test,pred,on='GDAY_DS')
                mse = mean_squared_error(result[col_name], result.prediction2)
                total_mse = total_mse + mse
                
            avg_mse = total_mse/temp.P_ID.nunique()
            if avg_mse < best_score:
                best_score = avg_mse
                best_level = level
                best_slope = slope
    print(col_name,'(Second-method) level: ',best_level,' slope: ',best_slope,' MSE: ',best_score)

# 2. 각 X변수 별 최적의 모델 찾기
## 1) 투수: ER ERA 제외하고 진행

In [5]:
# train data가 10이하인 case 제거하고 temp 데이터프레임 생성
temp = pitcher.head(1)
for i in pitcher.P_ID.unique():
    df = pitcher[(pitcher['P_ID']==i)&(pitcher['GDAY_DS'].dt.year<2020)]
    train_num = int(round(df.count()[1]*0.7))
    if train_num>=10:
         temp = temp.append(pitcher[pitcher['P_ID']==i])
col = np.array(temp.drop(['GDAY_DS','T_ID','P_ID','TB_SC','ER','ERA'],axis=1).columns) #우선 명목형 변수 제거함..

In [6]:
for col_name in col:
    x_variable(col_name)
    x_variable2(col_name)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


INN2 (First-method) level:  0.1  MSE:  17.0109963203083


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


INN2 (Second-method) level:  0.1  slope:  0.1  MSE:  28.80128514210149


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BF (First-method) level:  0.1  MSE:  454.0554706610214


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BF (Second-method) level:  0.1  slope:  0.1  MSE:  775.377867742487


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


PA-AB (First-method) level:  0.1  MSE:  1.5604772894001215


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


PA-AB (Second-method) level:  0.1  slope:  0.1  MSE:  2.1576556956510644


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


AB (First-method) level:  0.1  MSE:  26.205382658677387


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


AB (Second-method) level:  0.1  slope:  0.1  MSE:  46.26377853528826


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


HIT (First-method) level:  0.1  MSE:  4.183633929197449


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


HIT (Second-method) level:  0.1  slope:  0.1  MSE:  6.796043927888198


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


H1 (First-method) level:  0.1  MSE:  2.670825552144577


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


H1 (Second-method) level:  0.1  slope:  0.1  MSE:  4.146108955690702


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


H2 (First-method) level:  0.1  MSE:  0.5462202434223364


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


H2 (Second-method) level:  0.1  slope:  0.1  MSE:  0.727142093911874


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


H3 (First-method) level:  0.1  MSE:  0.039572833897152986


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


H3 (Second-method) level:  0.1  slope:  0.1  MSE:  0.05119622844358834


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


HR (First-method) level:  0.1  MSE:  0.306069454245509


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


HR (Second-method) level:  0.1  slope:  0.1  MSE:  0.43073847348895256


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SB_SR (First-method) level:  0.1  MSE:  0.10557364904711611


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SB_SR (Second-method) level:  0.1  slope:  0.1  MSE:  0.14505826687576334


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


KK (First-method) level:  0.1  MSE:  2.3862851103868477


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


KK (Second-method) level:  0.1  slope:  0.1  MSE:  3.2535508475345956


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


WP (First-method) level:  0.1  MSE:  0.1415505565574646


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


WP (Second-method) level:  0.1  slope:  0.1  MSE:  0.18184014496130543


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SLG (First-method) level:  0.1  MSE:  0.9524389705648603


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SLG (Second-method) level:  0.1  slope:  0.1  MSE:  1.3347627539001086


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BABIP (First-method) level:  0.1  MSE:  0.07565887958451792


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BABIP (Second-method) level:  0.1  slope:  0.1  MSE:  0.10274578844693008


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


KK9 (First-method) level:  0.1  MSE:  48.15377500767744


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


KK9 (Second-method) level:  0.1  slope:  0.1  MSE:  65.41497915687893


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BB9 (First-method) level:  0.1  MSE:  86.96962899041817


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BB9 (Second-method) level:  0.1  slope:  0.1  MSE:  113.84911690468482


## 2) 타자: HIT, AVG제외

In [7]:
# train data가 10이하인 case 제거하고 temp 데이터프레임 생성
temp = batter.head(1)
for i in batter.P_ID.unique():
    df = batter[(batter['P_ID']==i)&(batter['GDAY_DS'].dt.year<2020)]
    train_num = int(round(df.count()[1]*0.7))
    if train_num>=10:
         temp = temp.append(batter[batter['P_ID']==i])

col = np.array(temp.drop(['GDAY_DS','T_ID','P_ID','TB_SC','HIT','AVG'],axis=1).columns) #우선 명목형 변수 제거함..

In [8]:
for col_name in col:
    x_variable(col_name)
    x_variable2(col_name)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


PA-AB (First-method) level:  0.1  MSE:  0.29036450498145083


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


PA-AB (Second-method) level:  0.1  slope:  0.1  MSE:  0.8313766779204574


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


AB (First-method) level:  0.1  MSE:  1.8450612565534208


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


AB (Second-method) level:  0.1  slope:  0.1  MSE:  5.158886004027678


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


RUN (First-method) level:  0.1  MSE:  0.3042191989514666


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


RUN (Second-method) level:  0.1  slope:  0.1  MSE:  1.2165987633374318


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


RBI (First-method) level:  0.1  MSE:  0.4095104150085593


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


RBI (Second-method) level:  0.1  slope:  0.1  MSE:  1.336215977277075


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SH+SF (First-method) level:  0.1  MSE:  0.043829788213288146


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SH+SF (Second-method) level:  0.1  slope:  0.1  MSE:  0.11679067566896836


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


KK (First-method) level:  0.1  MSE:  0.48843345318145764


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


KK (Second-method) level:  0.1  slope:  0.1  MSE:  1.2267120146812007


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SB_trial (First-method) level:  0.1  MSE:  0.059876141672401434


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


SB_trial (Second-method) level:  0.1  slope:  0.1  MSE:  0.16763889465169585


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BABIP (First-method) level:  0.1  MSE:  0.09537769951153233


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


BABIP (Second-method) level:  0.1  slope:  0.1  MSE:  0.2332503091823532


### 모든 x변수에 대해 Simple Exponential smoothing(smoothing level=0.1) 모델을 사용할 때 예측력이 가장 높게 나왔다