In [1]:
import random
import pickle

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# models
import sklearn
import sktime

# model selection and metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

# our functions
from data_preparation import *
from models import *

# constants
PROCESSED_DATA_PATH = '../data/processed-data/'
MODEL_PATH = '../models/trained-models/'
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']
TEST_SPLIT_DATE = '2021-04-30'

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

# Loading and preparing data

## Targets df

In [2]:
%%time
df = pd.read_pickle(PROCESSED_DATA_PATH + 'shifted_targets.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2695788 entries, 0 to 2695787
Data columns (total 43 columns):
 #   Column            Dtype         
---  ------            -----         
 0   Dt                datetime64[ns]
 1   IdPlayer          int64         
 2   target1           float32       
 3   target2           float32       
 4   target3           float32       
 5   target4           float32       
 6   IdDtPlayer        object        
 7   target1_shift_1   float32       
 8   target2_shift_1   float32       
 9   target3_shift_1   float32       
 10  target4_shift_1   float32       
 11  target1_shift_2   float32       
 12  target2_shift_2   float32       
 13  target3_shift_2   float32       
 14  target4_shift_2   float32       
 15  target1_shift_3   float32       
 16  target2_shift_3   float32       
 17  target3_shift_3   float32       
 18  target4_shift_3   float32       
 19  target1_shift_4   float32       
 20  target2_shift_4   float32       
 21  target3_

In [3]:
# dropping the columns with a high shift
for i in range(4):
    for shift in [6, 7, 14, 30]:
        df.drop(f'target{i+1}_shift_{shift}', axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2695788 entries, 0 to 2695787
Data columns (total 27 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Dt               datetime64[ns]
 1   IdPlayer         int64         
 2   target1          float32       
 3   target2          float32       
 4   target3          float32       
 5   target4          float32       
 6   IdDtPlayer       object        
 7   target1_shift_1  float32       
 8   target2_shift_1  float32       
 9   target3_shift_1  float32       
 10  target4_shift_1  float32       
 11  target1_shift_2  float32       
 12  target2_shift_2  float32       
 13  target3_shift_2  float32       
 14  target4_shift_2  float32       
 15  target1_shift_3  float32       
 16  target2_shift_3  float32       
 17  target3_shift_3  float32       
 18  target4_shift_3  float32       
 19  target1_shift_4  float32       
 20  target2_shift_4  float32       
 21  target3_shift_4  float32       

In [4]:
df.shape

(2695788, 27)

In [5]:
df = df.drop_duplicates(subset=['IdDtPlayer'])
df.shape

(2695788, 27)

In [6]:
len(df.IdDtPlayer.unique())

2695788

In [7]:
%%time
def add_hour_to_id(df):
    df['IdDtPlayer'] = df['IdDtPlayer'].apply(lambda x: x[:10] + ' 00:00:00' + x[10:])

df_playerBoxScores = pd.read_pickle(PROCESSED_DATA_PATH + 'playerBoxScores.pkl')
add_hour_to_id(df_playerBoxScores)
df_playerBoxScores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219727 entries, 0 to 451
Data columns (total 54 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   IdGame                    219727 non-null  int64  
 1   DtGame                    219727 non-null  object 
 2   DtGameUTC                 219727 non-null  object 
 3   IdPlayer                  219727 non-null  int64  
 4   IdTeam                    219727 non-null  int64  
 5   NuJersey                  219690 non-null  object 
 6   CdPosition                219727 non-null  int64  
 7   NuStrikeOutsPitching      65466 non-null   float64
 8   NuBattingOrder            183390 non-null  float64
 9   NuGamesPlayedBatting      183395 non-null  float64
 10  NuFlyOuts                 183395 non-null  float64
 11  NuGroundOuts              183395 non-null  float64
 12  NuRunsScored              183395 non-null  float64
 13  NuDoubles                 183395 non-null  floa

In [8]:
df_join = pd.merge(df, df_playerBoxScores, on=['IdDtPlayer'], how='inner')
df_join.head()

Unnamed: 0,Dt,IdPlayer_x,target1,target2,target3,target4,IdDtPlayer,target1_shift_1,target2_shift_1,target3_shift_1,...,NuWildPitches,NuInheritedRunners,NuInheritedRunnersScored,NuSaves,NuHolds,NuBlownSaves,NuAssists,NuPutOuts,NuErrors,NuChances
0,2018-04-02,112526,5.957846,22.427931,33.900803,38.857937,2018-04-02 00:00:00_112526,0.7711,63.601677,7.566316,...,0.0,0.0,0.0,,,,1.0,0.0,0.0,1.0
1,2018-04-08,112526,1.11831,4.124211,8.492779,5.995018,2018-04-08 00:00:00_112526,0.231458,4.020406,0.346467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,2018-04-10,112526,24.790358,15.784496,37.622959,47.137791,2018-04-10 00:00:00_112526,4.722527,11.644717,15.383535,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0
3,2018-04-15,112526,61.485676,69.971931,100.0,100.0,2018-04-15 00:00:00_112526,1.967419,100.0,32.216934,...,0.0,0.0,0.0,,,,1.0,0.0,0.0,1.0
4,2018-04-21,112526,7.213037,17.952398,71.142159,22.698462,2018-04-21 00:00:00_112526,0.108503,26.812336,1.090182,...,0.0,0.0,0.0,,,,0.0,2.0,0.0,2.0


In [9]:
%%time
df_join.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218800 entries, 0 to 218799
Data columns (total 80 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   Dt                        218800 non-null  datetime64[ns]
 1   IdPlayer_x                218800 non-null  int64         
 2   target1                   218800 non-null  float32       
 3   target2                   218800 non-null  float32       
 4   target3                   218800 non-null  float32       
 5   target4                   218800 non-null  float32       
 6   IdDtPlayer                218800 non-null  object        
 7   target1_shift_1           218800 non-null  float32       
 8   target2_shift_1           218800 non-null  float32       
 9   target3_shift_1           218800 non-null  float32       
 10  target4_shift_1           218800 non-null  float32       
 11  target1_shift_2           218800 non-null  float32       
 12  ta

In [10]:
df_join = df_join.fillna(0)

In [11]:
del df
df = df_join.copy()
del df_join
del df_playerBoxScores

In [12]:
df

Unnamed: 0,Dt,IdPlayer_x,target1,target2,target3,target4,IdDtPlayer,target1_shift_1,target2_shift_1,target3_shift_1,...,NuWildPitches,NuInheritedRunners,NuInheritedRunnersScored,NuSaves,NuHolds,NuBlownSaves,NuAssists,NuPutOuts,NuErrors,NuChances
0,2018-04-02,112526,5.957846,22.427931,33.900803,38.857937,2018-04-02 00:00:00_112526,0.771100,63.601677,7.566316,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,2018-04-08,112526,1.118310,4.124211,8.492779,5.995018,2018-04-08 00:00:00_112526,0.231458,4.020406,0.346467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,2018-04-10,112526,24.790358,15.784496,37.622959,47.137791,2018-04-10 00:00:00_112526,4.722527,11.644717,15.383535,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0
3,2018-04-15,112526,61.485676,69.971931,100.000000,100.000000,2018-04-15 00:00:00_112526,1.967419,100.000000,32.216934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,2018-04-21,112526,7.213037,17.952398,71.142159,22.698462,2018-04-21 00:00:00_112526,0.108503,26.812336,1.090182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218795,2021-04-14,685503,3.056950,4.553284,0.023007,1.441628,2021-04-14 00:00:00_685503,0.549280,17.437157,0.085512,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
218796,2021-04-19,685503,15.077294,23.880243,7.940695,8.382304,2021-04-19 00:00:00_685503,5.914783,89.426735,0.052608,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
218797,2021-04-25,685503,0.075090,1.664232,0.015038,1.055253,2021-04-25 00:00:00_685503,0.068494,15.961515,0.043568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
218798,2021-04-30,685503,0.014727,6.607196,1.851336,2.014339,2021-04-30 00:00:00_685503,0.018770,31.946022,0.305491,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df.rename(columns={'IdPlayer_x': 'IdPlayer'}, inplace=True)

In [14]:
sort_df(df)

## Train Test Split

In [15]:
%%time
train, test = train_test_split(df)
x_train, y_train = x_y_split(train)
x_test, y_test = x_y_split(test)
print(f'Train shape: {x_train.shape}, Test shape: {x_test.shape}')
del df

Train shape: (184759, 76), Test shape: (34041, 76)
CPU times: user 297 ms, sys: 42.5 ms, total: 340 ms
Wall time: 337 ms


# Models

In [16]:
evals_df = pd.DataFrame()

## Baseline

In [17]:
# Naive
y_pred_naive = naive(test)
evals_df = evals_df.append({'model': 'naive', **evaluate_mae(y_test, y_pred_naive)}, ignore_index=True)
del y_pred_naive
# Mean
# mean = MeanModel()
# mean.fit(train)
# y_pred_mean = mean.predict(test)
# evals_df = evals_df.append({'model': 'mean', **evaluate_mae(y_test, y_pred_mean)}, ignore_index=True)
# del mean, y_pred_mean


## LASSO

In [18]:
from sklearn.linear_model import Lasso

In [19]:
%%time
# adjust the datetime to int
x_train['Dt_int'] = train['Dt'].apply(lambda x: int(x.strftime("%Y%m%d%H%M%S")))
x_test['Dt_int'] = test['Dt'].apply(lambda x: int(x.strftime("%Y%m%d%H%M%S")))
x_train.drop(['Dt', 'IdDtPlayer', 'DtGame', 'DtGameUTC', 'NuJersey'], axis=1, inplace=True)
x_test.drop(['Dt', 'IdDtPlayer', 'DtGame', 'DtGameUTC', 'NuJersey'], axis=1, inplace=True)
del train, test

CPU times: user 2.17 s, sys: 39.9 ms, total: 2.2 s
Wall time: 2.2 s


In [21]:
lasso = Lasso(alpha=0.1)
rfe_lasso = RFE(lasso, n_features_to_select=15)

In [22]:
%%time
y_pred_lasso, rfe_lasso_models = fit_predict_targets(rfe_lasso, x_train, y_train, x_test, return_models=True)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

CPU times: user 2h 25min 17s, sys: 1h 27min 49s, total: 3h 53min 7s
Wall time: 34min 16s


In [32]:
y_pred_lasso = pd.DataFrame()
for i in range(4):
    y_pred_lasso[TARGET_COLS[i]] = rfe_lasso_models[i].predict(x_test)
y_pred_lasso.head()

Unnamed: 0,target1,target2,target3,target4
0,21.522008,21.522008,21.522008,21.522008
1,23.258569,23.258569,23.258569,23.258569
2,28.403469,28.403469,28.403469,28.403469
3,24.84603,24.84603,24.84603,24.84603
4,67.793019,67.793019,67.793019,67.793019


In [34]:
evals_df = evals_df.append({'model': 'lasso', **evaluate_mae(y_test, y_pred_lasso)}, ignore_index=True)
del y_pred_lasso
evals_df

Unnamed: 0,model,target1,target2,target3,target4
0,naive,3.337228,3.975692,1.933233,1.973346
1,lasso,3.017863,2.520866,2.382316,1.665532
2,lasso,3.532575,2.723825,3.14154,1.665532


TypeError: 'int' object is not iterable

In [None]:
y_pred_lasso = pd.DataFrame(rfe_lasso.predict(x_test), columns=TARGET_COLS)
evals_df = evals_df.append({'model': 'lasso', **evaluate_mae(y_test, y_pred_lasso)}, ignore_index=True)