In [1]:
import random
import pickle

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# models
import sklearn
import sktime

# model selection and metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

# our functions
from data_preparation import *
from models import *

# constants
PROCESSED_DATA_PATH = '../data/processed-data/'
MODEL_PATH = '../models/trained-models/'
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']
TEST_SPLIT_DATE = '2021-04-30'

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

# Loading and preparing data

## Targets df

In [2]:
%%time
df = pd.read_pickle(PROCESSED_DATA_PATH + 'shifted_targets.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2695788 entries, 0 to 2695787
Data columns (total 43 columns):
 #   Column            Dtype         
---  ------            -----         
 0   Dt                datetime64[ns]
 1   IdPlayer          int64         
 2   target1           float32       
 3   target2           float32       
 4   target3           float32       
 5   target4           float32       
 6   IdDtPlayer        object        
 7   target1_shift_1   float32       
 8   target2_shift_1   float32       
 9   target3_shift_1   float32       
 10  target4_shift_1   float32       
 11  target1_shift_2   float32       
 12  target2_shift_2   float32       
 13  target3_shift_2   float32       
 14  target4_shift_2   float32       
 15  target1_shift_3   float32       
 16  target2_shift_3   float32       
 17  target3_shift_3   float32       
 18  target4_shift_3   float32       
 19  target1_shift_4   float32       
 20  target2_shift_4   float32       
 21  target3_

In [3]:
# dropping the columns with a high shift
for i in range(4):
    for shift in [6, 7, 14, 30]:
        df.drop(f'target{i+1}_shift_{shift}', axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2695788 entries, 0 to 2695787
Data columns (total 27 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Dt               datetime64[ns]
 1   IdPlayer         int64         
 2   target1          float32       
 3   target2          float32       
 4   target3          float32       
 5   target4          float32       
 6   IdDtPlayer       object        
 7   target1_shift_1  float32       
 8   target2_shift_1  float32       
 9   target3_shift_1  float32       
 10  target4_shift_1  float32       
 11  target1_shift_2  float32       
 12  target2_shift_2  float32       
 13  target3_shift_2  float32       
 14  target4_shift_2  float32       
 15  target1_shift_3  float32       
 16  target2_shift_3  float32       
 17  target3_shift_3  float32       
 18  target4_shift_3  float32       
 19  target1_shift_4  float32       
 20  target2_shift_4  float32       
 21  target3_shift_4  float32       

In [4]:
df.shape

(2695788, 27)

In [5]:
df = df.drop_duplicates(subset=['IdDtPlayer'])
df.shape

(2695788, 27)

In [6]:
len(df.IdDtPlayer.unique())

2695788

In [7]:
%%time
def add_hour_to_id(df):
    df['IdDtPlayer'] = df['IdDtPlayer'].apply(lambda x: x[:10] + ' 00:00:00' + x[10:])

df_playerBoxScores = pd.read_pickle(PROCESSED_DATA_PATH + 'playerBoxScores.pkl')
add_hour_to_id(df_playerBoxScores)
df_playerBoxScores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219727 entries, 0 to 451
Data columns (total 54 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   IdGame                    219727 non-null  int64  
 1   DtGame                    219727 non-null  object 
 2   DtGameUTC                 219727 non-null  object 
 3   IdPlayer                  219727 non-null  int64  
 4   IdTeam                    219727 non-null  int64  
 5   NuJersey                  219690 non-null  object 
 6   CdPosition                219727 non-null  int64  
 7   NuStrikeOutsPitching      65466 non-null   float64
 8   NuBattingOrder            183390 non-null  float64
 9   NuGamesPlayedBatting      183395 non-null  float64
 10  NuFlyOuts                 183395 non-null  float64
 11  NuGroundOuts              183395 non-null  float64
 12  NuRunsScored              183395 non-null  float64
 13  NuDoubles                 183395 non-null  floa

In [8]:
df_join = pd.merge(df, df_playerBoxScores, on=['IdDtPlayer'], how='left')
df_join.head()

Unnamed: 0,Dt,IdPlayer_x,target1,target2,target3,target4,IdDtPlayer,target1_shift_1,target2_shift_1,target3_shift_1,...,NuWildPitches,NuInheritedRunners,NuInheritedRunnersScored,NuSaves,NuHolds,NuBlownSaves,NuAssists,NuPutOuts,NuErrors,NuChances
0,2018-01-01,112526,0.055277,5.496109,0.025839,16.17647,2018-01-01 00:00:00_112526,,,,...,,,,,,,,,,
1,2018-01-02,112526,0.060625,3.252914,0.030486,8.541353,2018-01-02 00:00:00_112526,0.055277,5.496109,0.025839,...,,,,,,,,,,
2,2018-01-03,112526,0.029341,1.648352,0.032613,10.490111,2018-01-03 00:00:00_112526,0.060625,3.252914,0.030486,...,,,,,,,,,,
3,2018-01-04,112526,0.014799,2.665894,0.087422,19.091467,2018-01-04 00:00:00_112526,0.029341,1.648352,0.032613,...,,,,,,,,,,
4,2018-01-05,112526,0.083916,1.161002,0.024759,6.643879,2018-01-05 00:00:00_112526,0.014799,2.665894,0.087422,...,,,,,,,,,,


In [9]:
%%time
df_join.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698457 entries, 0 to 2698456
Data columns (total 80 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   Dt                        2698457 non-null  datetime64[ns]
 1   IdPlayer_x                2698457 non-null  int64         
 2   target1                   2698457 non-null  float32       
 3   target2                   2698457 non-null  float32       
 4   target3                   2698457 non-null  float32       
 5   target4                   2698457 non-null  float32       
 6   IdDtPlayer                2698457 non-null  object        
 7   target1_shift_1           2696396 non-null  float32       
 8   target2_shift_1           2696396 non-null  float32       
 9   target3_shift_1           2696396 non-null  float32       
 10  target4_shift_1           2696396 non-null  float32       
 11  target1_shift_2           2694335 non-null  float3

In [10]:
del df
df = df_join.copy()
del df_join
del df_playerBoxScores

In [11]:
df.rename(columns={'IdPlayer_x': 'IdPlayer'}, inplace=True)

## Train Test Split

In [12]:
%%time
train, test = train_test_split(df)
x_train, y_train = x_y_split(train)
x_test, y_test = x_y_split(test)
print(f'Train shape: {x_train.shape}, Test shape: {x_test.shape}')
del df

Train shape: (2508359, 76), Test shape: (190098, 76)
CPU times: user 2.32 s, sys: 7.5 s, total: 9.82 s
Wall time: 9.85 s


# Models

In [16]:
y_pred_naive = naive(test)
mean = MeanModel()
mean.fit(train)
y_pred_mean = mean.predict(test)
del mean

In [17]:
evals_df = pd.DataFrame(columns=['model', 'target1', 'target2', 'target3', 'target4'])
evals_df = evals_df.append({'model': 'naive', **evaluate_mae(y_test, y_pred_naive)}, ignore_index=True)
evals_df = evals_df.append({'model': 'mean', **evaluate_mae(y_test, y_pred_mean)}, ignore_index=True)
evals_df

Unnamed: 0,model,target1,target2,target3,target4
0,naive,0.805882,1.354502,0.63182,0.81206
1,mean,0.951045,2.255654,0.960717,1.028092
