In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm
import gc
from functools import reduce
from sklearn.model_selection import StratifiedKFold

In [2]:
def make_df(df, col, bool_in=False):
    """Explode a column of stringified record lists into one flat DataFrame.

    Every non-null cell of ``df[col]`` is expected to hold the text of a list
    of dicts. Each cell is parsed and every record of every cell becomes one
    row of the returned frame.

    Side effect: ``col`` is dropped from ``df`` in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column; it is mutated (``col`` is removed).
    col : str
        Name of the column to parse.
    bool_in : bool, default False
        When True, the bare tokens ``true`` / ``false`` are rewritten to the
        strings ``"True"`` / ``"False"`` before parsing.

    Returns
    -------
    pd.DataFrame
        One row per parsed record; an empty frame when the column has no
        non-null cell.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal after the token rewrites.
    """
    import ast
    from itertools import chain

    cells = df.loc[df[col].notnull(), col].copy()
    df.drop(col, axis=1, inplace=True)
    if cells.empty:
        # Nothing to parse: return an empty frame instead of failing while
        # building a DataFrame from the scalar that an empty sum produces.
        gc.collect()
        return pd.DataFrame()

    # Plain substring rewrites (no regex). NOTE(review): these also touch the
    # tokens when they occur inside string values - kept as in the original.
    cells = cells.str.replace("null", '""', regex=False)
    if bool_in:
        cells = cells.str.replace("false", '"False"', regex=False)
        cells = cells.str.replace("true", '"True"', regex=False)

    # SECURITY: the cells come from a data file, so they are parsed with
    # ast.literal_eval (literals only) rather than eval(), which would execute
    # arbitrary expressions embedded in the data.
    parsed = [ast.literal_eval(text) for text in cells]

    # Linear-time flattening; summing the lists re-copies the accumulated
    # list for every cell.
    records = list(chain.from_iterable(parsed))
    gc.collect()
    return pd.DataFrame(records)
#===============

In [3]:
# Directory of the competition dataset, relative to the notebook.
ROOT_DIR = "../input/mlb-player-digital-engagement-forecasting"

In [4]:
# Seed every random number generator used by the notebook (NumPy, the
# standard-library `random` module and TensorFlow) with the same value.
my_seed = 2021
np.random.seed(my_seed)
import random 
random.seed(my_seed)
import tensorflow as tf
tf.random.set_seed(my_seed)

## UTILITY FUNCTIONS

In [5]:
#=======================#
def flatten(df, col):
    """Pivot one value column from long to wide form, one row per player.

    The distinct values of ``EvalDate`` become the columns, each renamed to
    ``f"{col}_{value}"``; ``playerId`` is returned as a regular column.
    """
    wide = df.pivot(index="playerId", columns="EvalDate", values=col)
    wide = wide.add_prefix(f"{col}_")
    # Remove the leftover "EvalDate" label from the column axis.
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()
#============================#
def reducer(left, right):
    """Join two per-player frames on ``playerId`` (default inner join)."""
    return pd.merge(left, right, on="playerId")
#========================

In [6]:
# Names of the four target columns.
TGTCOLS = ["target1", "target2", "target3", "target4"]


def train_lag(df, lag=1):
    """Append the targets observed ``lag`` days earlier as new columns.

    A copy of the key and target columns has its ``EvalDate`` moved forward
    by ``lag`` days and is left-joined back on (playerId, EvalDate). The new
    columns are named ``<target>_<lag>``; rows with no observation ``lag``
    days earlier get NaN.
    """
    keys = ["playerId", "EvalDate"]
    shifted = df[keys + TGTCOLS].copy()
    shifted["EvalDate"] = shifted["EvalDate"] + timedelta(days=lag)
    return df.merge(shifted, how="left", on=keys, suffixes=["", f"_{lag}"])
#=================================
def test_lag(sub):
    """Build the lag features of every player for the single date in ``sub``.

    Reads the module-level history frame ``LAST`` and the ``LAGS`` list.

    Side effect: a ``playerId`` column (the integer after the underscore in
    ``date_playerId``) is added to ``sub`` in place.

    Returns
    -------
    (features, eval_dt)
        ``features`` has one row per player found in the lag window, with a
        ``<target>_<lag>`` column for each lag present in ``LAST``;
        ``eval_dt`` is the parsed date of ``sub``.
    """
    sub["playerId"] = sub["date_playerId"].apply(lambda key: int(key.split("_")[1]))
    assert sub["date"].nunique() == 1
    eval_dt = pd.to_datetime(sub["date"].unique()[0], format="%Y%m%d")

    # Calendar date -> lag number, plus the two ends of the lag window.
    lag_of_date = {eval_dt + timedelta(days=-k): k for k in LAGS}
    newest = eval_dt + timedelta(days=-LAGS[0])
    oldest = eval_dt + timedelta(days=-LAGS[-1])

    in_window = LAST.EvalDate.between(oldest, newest)
    history = LAST.loc[in_window, ["EvalDate", "playerId"] + TGTCOLS].copy()
    # Replace each date by its lag so that flatten() names columns by lag.
    history["EvalDate"] = history["EvalDate"].map(lag_of_date)

    per_target = [flatten(history, col) for col in TGTCOLS]
    return reduce(reducer, per_target), eval_dt
#===============

In [7]:
%%time
# Alternative source kept for reference (the raw competition file):
#tr = pd.read_csv(f"{ROOT_DIR}/train.csv")
# Load the target table from a separate dataset
# (presumably a pre-flattened export of the targets - TODO confirm).
tr = pd.read_csv("../input/my-mlb-data/target.csv")
print(tr.shape)
gc.collect()

(2506176, 6)
CPU times: user 2.56 s, sys: 270 ms, total: 2.83 s
Wall time: 4.74 s


22

In [8]:
# Parse EvalDate and move it back by one day, then derive the calendar year.
# NOTE: re-running this cell shifts the dates by one more day each time.
tr["EvalDate"] = pd.to_datetime(tr["EvalDate"]) - timedelta(days=1)
tr["EvalYear"] = tr["EvalDate"].dt.year

In [9]:
# Median of each target per (player, year), stored under the tgt*_med names.
# NOTE(review): the medians are computed over all rows of the year, including
# the rows later used as training/validation targets - confirm this is intended.
MEDCOLS = ["tgt1_med","tgt2_med", "tgt3_med", "tgt4_med"]
MED_DF = (
    tr.groupby(["playerId", "EvalYear"])[TGTCOLS]
    .median()
    .reset_index()
    .rename(columns=dict(zip(TGTCOLS, MEDCOLS)))
)

In [10]:
# Preview the first rows of the per-player yearly medians.
MED_DF.head()

Unnamed: 0,playerId,EvalYear,tgt1_med,tgt2_med,tgt3_med,tgt4_med
0,112526,2018,0.151508,4.901809,0.528752,13.437293
1,112526,2019,0.033293,1.117953,0.129707,10.568848
2,112526,2020,0.021525,1.468556,0.041698,8.448668
3,112526,2021,0.007505,0.477795,0.043267,9.549732
4,134181,2018,0.706118,5.399749,0.733436,6.923528


In [11]:
# Lags (in days) used as features: 1 through 20.
LAGS = list(range(1, 21))
# Feature column names, from the largest lag down to lag 1, with the four
# targets listed for each lag.
FECOLS = [f"{col}_{lag}" for lag in LAGS[::-1] for col in TGTCOLS]

In [12]:
# Display the list of lags.
LAGS

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [13]:
%%time
# Add the lagged targets one lag at a time; each pass appends the four
# target{i}_{lag} columns to `tr`.
for lag in tqdm(LAGS):
    tr = train_lag(tr, lag=lag)
    gc.collect()
#===========
tr = tr.sort_values(by=["playerId", "EvalDate"])
print(tr.shape)
# Drop every row that contains a NaN in any column, which includes the rows
# for which at least one lagged value is unavailable.
tr = tr.dropna()
print(tr.shape)
# Attach the per-player yearly medians (default inner join on player and year).
tr = tr.merge(MED_DF, on=["playerId","EvalYear"])
gc.collect()

100%|██████████| 20/20 [00:55<00:00,  2.79s/it]


(2506176, 87)
(2464956, 87)
CPU times: user 45.6 s, sys: 19.3 s, total: 1min 4s
Wall time: 1min 4s


39

In [14]:
# Show a single row to check the resulting column layout.
tr.head(1)

Unnamed: 0,playerId,target1,target2,target3,target4,EvalDate,EvalYear,target1_1,target2_1,target3_1,...,target3_19,target4_19,target1_20,target2_20,target3_20,target4_20,tgt1_med,tgt2_med,tgt3_med,tgt4_med
0,112526,0.311895,5.163646,0.277666,15.897436,2018-01-21,2018,0.070373,31.732933,0.444872,...,0.030486,8.541353,0.055277,5.496109,0.025839,16.176471,0.151508,4.901809,0.528752,13.437293


In [15]:
# Model inputs (lag features followed by the yearly medians), the four
# targets, and the player id of every row (used to stratify the folds).
X = tr[FECOLS + MEDCOLS].to_numpy()
y = tr[TGTCOLS].to_numpy()
cl = tr["playerId"].to_numpy()

In [16]:
# Cross-validation split stratified on the player id (`cl`), without
# shuffling. The splits are materialised in a list so that they can be
# indexed by fold number later on.
NFOLDS = 5
skf = StratifiedKFold(n_splits=NFOLDS)
folds = list(skf.split(X, cl))

In [17]:
# Number of training rows and of input features.
X.shape

(2464956, 84)

## Neural Net Training

In [18]:
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [19]:
def make_model(n_in):
    """Build and compile the feed-forward regressor.

    Architecture: ``n_in`` inputs -> two 50-unit ReLU dense layers ("d1",
    "d2") -> a 4-unit linear output layer ("preds"). The model is compiled
    with the mean-absolute-error loss and the Adam optimizer.
    """
    inp = L.Input(shape=(n_in,), name="inputs")

    hidden = inp
    for layer_name in ("d1", "d2"):
        hidden = L.Dense(50, activation="relu", name=layer_name)(hidden)
    preds = L.Dense(4, activation="linear", name="preds")(hidden)

    model = M.Model(inputs=inp, outputs=preds, name="ANN")
    model.compile(optimizer="adam", loss="mean_absolute_error")
    return model

In [20]:
# Build one model only to inspect its architecture.
net = make_model(X.shape[1])
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line after the table.
net.summary()

Model: "ANN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 84)]              0         
_________________________________________________________________
d1 (Dense)                   (None, 50)                4250      
_________________________________________________________________
d2 (Dense)                   (None, 50)                2550      
_________________________________________________________________
preds (Dense)                (None, 4)                 204       
Total params: 7,004
Trainable params: 7,004
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Train one network per fold. For each fold the weights of the epoch with the
# lowest validation loss are restored and used for the out-of-fold predictions.
oof = np.zeros(y.shape)  # out-of-fold predictions, filled fold by fold
nets = []                # one fitted model per fold
for idx in range(NFOLDS):
    print("FOLD:", idx)
    tr_idx, val_idx = folds[idx]
    X_val, y_val = X[val_idx], y[val_idx]
    weights_path = f"w{idx}.h5"

    ckpt = ModelCheckpoint(weights_path, monitor='val_loss', mode='min',
                           save_best_only=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,
                                  min_lr=0.0001)
    es = EarlyStopping(monitor='val_loss', patience=5)

    reg = make_model(X.shape[1])
    reg.fit(
        X[tr_idx], y[tr_idx],
        validation_data=(X_val, y_val),
        epochs=10,
        batch_size=30_000,
        callbacks=[ckpt, reduce_lr, es],
        verbose=1,
    )
    # Go back to the best checkpoint written by ModelCheckpoint.
    reg.load_weights(weights_path)
    oof[val_idx] = reg.predict(X_val, batch_size=50_000, verbose=1)
    nets.append(reg)
    gc.collect()

FOLD: 0
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.03344, saving model to w0.h5
Epoch 2/10

Epoch 00002: val_loss improved from 1.03344 to 0.95912, saving model to w0.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.95912 to 0.92367, saving model to w0.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.92367 to 0.90214, saving model to w0.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.90214 to 0.88901, saving model to w0.h5
Epoch 6/10

Epoch 00006: val_loss improved from 0.88901 to 0.87889, saving model to w0.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.87889 to 0.87281, saving model to w0.h5
Epoch 8/10

Epoch 00008: val_loss improved from 0.87281 to 0.86807, saving model to w0.h5
Epoch 9/10

Epoch 00009: val_loss improved from 0.86807 to 0.86428, saving model to w0.h5
Epoch 10/10

Epoch 00010: val_loss improved from 0.86428 to 0.86285, saving model to w0.h5
FOLD: 1
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.84754, saving model to w1.h5
E

In [22]:
# Out-of-fold error over all four targets.
mae = mean_absolute_error(y, oof)
# With squared=False the function returns the ROOT mean squared error, so the
# value is named and reported as RMSE rather than MSE.
# NOTE(review): recent scikit-learn releases deprecate the `squared` argument
# in favour of root_mean_squared_error - check the installed version.
rmse = mean_squared_error(y, oof, squared=False)
print("mae:", mae)
print("rmse:", rmse)

mae: 0.7727517316297969
mse: 3.9314386784209567


In [23]:
# History available at prediction time: every training row dated strictly
# after the bound date.
bound_dt = pd.to_datetime("2021-01-01")
LAST = tr[tr["EvalDate"] > bound_dt].copy()

In [24]:
# Medians of year 2021 only, keyed by player (the year column is dropped).
LAST_MED_DF = MED_DF.loc[MED_DF["EvalYear"] == 2021].drop(columns="EvalYear")
# The full training frame is not needed any more.
del tr

In [25]:
# Sizes of the history frame and of the two median tables.
LAST.shape, LAST_MED_DF.shape, MED_DF.shape

((245259, 91), (2061, 5), (8244, 6))

In [26]:
import mlb
FE = []; SUB = [];
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

# Recursive forecasting: the predictions made for one date are appended to
# LAST and serve as lagged "targets" for the following dates.
for (test_df, sub) in iter_test:
    # Features computation at Evaluation Date
    sub = sub.reset_index()
    sub_fe, eval_dt = test_lag(sub)  # also adds the playerId column to `sub`
    sub_fe = sub_fe.merge(LAST_MED_DF, on="playerId", how="left")
    sub_fe = sub_fe.fillna(0.)

    # Force the exact training column set and order. A lag with no history in
    # LAST has no column after pivoting; it is filled with 0 (same policy as
    # the fillna above) instead of raising a KeyError.
    features = sub_fe.reindex(columns=FECOLS + MEDCOLS, fill_value=0.)

    # Average over the models actually held in `nets`.
    _preds = 0.
    for reg in nets:
        _preds += reg.predict(features) / len(nets)
    sub_fe[TGTCOLS] = np.clip(_preds, 0, 100)

    # Replace the placeholder targets of the submission by the predictions;
    # players without any history keep 0.
    sub.drop(["date"]+TGTCOLS, axis=1, inplace=True)
    sub = sub.merge(sub_fe[["playerId"]+TGTCOLS], on="playerId", how="left")
    sub.drop("playerId", axis=1, inplace=True)
    sub = sub.fillna(0.)
    # Submit
    env.predict(sub)
    # Update Available information
    sub_fe["EvalDate"] = eval_dt
    #sub_fe.drop(MEDCOLS, axis=1, inplace=True)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    LAST = pd.concat([LAST, sub_fe])
    LAST = LAST.drop_duplicates(subset=["EvalDate","playerId"], keep="last")

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
