# Multitask Models

Testa modelos de multitasking

## Configuração de Ambiente

In [81]:
# Variáveis de Configuração

# Caminhos de arquivo
raw_data_path = '../data/raw-data/'                 # dados não processados
raw_dataset_path = raw_data_path + 'train_updated.csv'
processed_data_path = '../data/processed-data/'     # dados processados
processed_dataset_path = processed_data_path + 'train.pkl' 
model_path = '../models/trained-models/'

In [82]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

from matplotlib import pyplot as plt

from models import *

from sklearn.metrics import mean_absolute_error

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
warnings.filterwarnings("ignore")

In [83]:
# Variáveis/Constantes Globais
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

RANDOM_SEED = 42

TEST_SPLIT_DATE = '2021-04-30'

In [84]:
# Carregando o dataframe
df_train = pd.read_pickle(processed_dataset_path)
df_train.head()

Unnamed: 0,CdDayNight_day,CdDayNight_night,CdDoubleHeader_N,CdDoubleHeader_S,CdDoubleHeader_Y,CdGameState_Completed Early,CdGameState_F,CdGameState_Final,CdGameType_A,CdGameType_D,...,target4_shift_2,target4_shift_3,target4_shift_4,target4_shift_5,target4_shift_6,target4_shift_7,target1,target2,target3,target4
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.148438,6.644531,19.09375,10.492188,8.539062,16.171875,0.032364,4.396423,0.039198,17.619047
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16.171875,12.148438,6.644531,19.09375,10.492188,8.539062,0.016104,1.791045,0.055419,4.627697
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17.625,16.171875,12.148438,6.644531,19.09375,10.492188,0.075636,1.866901,0.023244,8.506344
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.628906,17.625,16.171875,12.148438,6.644531,19.09375,0.034998,2.284644,0.020102,7.942532
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.507812,4.628906,17.625,16.171875,12.148438,6.644531,0.018201,0.623816,0.007931,4.924134


## Preparação dos Dados

Manter no notebook apenas a preparação pontual dos dados para a aplicação em modelos, toda preparação permanente deve ser adicionada em `/src/data-engineering/data-engineering.ipynb` e então salvo o dataset processado em `/src/data/processed-data/`.

### Funções Auxiliares

Apenas manter aqui funções que serão utilizadas pontualmente no notebook, em caso de funções que serão usadas em outros notebooks, deve-se transferir as mesmas para um arquivo `.py` separado e importar as funções para o notebook.

In [85]:
# Funções Auxiliares

# Imports
from data_preparation import *

# Funções Auxiliares


### Preparação

In [86]:
with open('features.pkl', 'rb') as f:
    selected_features = pkl.load(f)
df_train = df_train[np.append(np.append(TARGET_COLS, selected_features), 'IdPlayer')]
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2693260 entries, 0 to 2693259
Data columns (total 47 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   target1                  float32       
 1   target2                  float32       
 2   target3                  float32       
 3   target4                  float32       
 4   target1_shift_1          float16       
 5   target2_shift_1          float16       
 6   target3_shift_1          float16       
 7   target4_shift_1          float16       
 8   target1_shift_2          float16       
 9   target4_shift_2          float16       
 10  target1_shift_3          float16       
 11  target1_shift_4          float16       
 12  target1_shift_5          float16       
 13  NuHomeRuns_Player        float16       
 14  NuTotalBases_Player      float16       
 15  NuRbi_Player             float16       
 16  CdDayNight_night         float16       
 17  target2_shift_1          fl

### Train Test Split

In [87]:
train, test, val = train_test_split(df_train)

In [88]:
train['Dt'] = pd.to_numeric(pd.to_datetime(train['Dt']))
test['Dt']= pd.to_numeric(pd.to_datetime(test['Dt']))
val['Dt'] = pd.to_numeric(pd.to_datetime(val['Dt']))

## Treinando Modelos

In [89]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.multioutput import MultiOutputRegressor

df_results = pd.DataFrame(columns = ['model', 'target1 | AMAE', 'target2 | AMAE', 'target3 | AMAE', 'target4 | AMAE', 'average | AMAE', 'average | MAE'])

In [90]:
train['Dt']

0          1515369600000000000
1          1515456000000000000
2          1515542400000000000
3          1515628800000000000
4          1515715200000000000
                  ...         
2693074    1611705600000000000
2693075    1611792000000000000
2693076    1611878400000000000
2693077    1611964800000000000
2693078    1612051200000000000
Name: Dt, Length: 2317282, dtype: int64

### Baselines

In [91]:
# naive
naive = train[['target1_shift_1', 'target2_shift_1', 'target3_shift_1', 'target4_shift_1', 'IdPlayer']].set_index('IdPlayer')

# media
media = train[TARGET_COLS].mean()
media_por_jogador = train.groupby('IdPlayer')[TARGET_COLS].mean()

# mediana
mediana = train[TARGET_COLS].median()
mediana_por_jogador = train.groupby('IdPlayer')[TARGET_COLS].median()

In [76]:
naive

Unnamed: 0_level_0,target1_shift_1,target2_shift_1,target2_shift_1,target2_shift_1,target3_shift_1,target3_shift_1,target4_shift_1,target4_shift_1,target4_shift_1,target4_shift_1
IdPlayer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
112526,0.014557,4.714844,4.714844,4.714844,0.026337,0.026337,16.171875,16.171875,16.171875,16.171875
112526,0.032349,4.394531,4.394531,4.394531,0.039185,0.039185,17.625000,17.625000,17.625000,17.625000
112526,0.016098,1.791016,1.791016,1.791016,0.055420,0.055420,4.628906,4.628906,4.628906,4.628906
112526,0.075623,1.867188,1.867188,1.867188,0.023239,0.023239,8.507812,8.507812,8.507812,8.507812
112526,0.035004,2.285156,2.285156,2.285156,0.020096,0.020096,7.941406,7.941406,7.941406,7.941406
...,...,...,...,...,...,...,...,...,...,...
685503,0.000000,0.493652,0.493652,0.493652,0.000000,0.000000,0.607910,0.607910,0.607910,0.607910
685503,0.000000,0.520996,0.520996,0.520996,0.000000,0.000000,0.313232,0.313232,0.313232,0.313232
685503,0.000000,0.426514,0.426514,0.426514,0.000000,0.000000,0.185181,0.185181,0.185181,0.185181
685503,0.000000,0.604980,0.604980,0.604980,0.000000,0.000000,0.182617,0.182617,0.182617,0.182617


In [80]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2317282 entries, 0 to 2693078
Data columns (total 47 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   target1                  float32
 1   target2                  float32
 2   target3                  float32
 3   target4                  float32
 4   target1_shift_1          float16
 5   target2_shift_1          float16
 6   target3_shift_1          float16
 7   target4_shift_1          float16
 8   target1_shift_2          float16
 9   target4_shift_2          float16
 10  target1_shift_3          float16
 11  target1_shift_4          float16
 12  target1_shift_5          float16
 13  NuHomeRuns_Player        float16
 14  NuTotalBases_Player      float16
 15  NuRbi_Player             float16
 16  CdDayNight_night         float16
 17  target2_shift_1          float16
 18  target4_shift_1          float16
 19  target2_shift_3          float16
 20  target2_shift_4          float16
 21  target2_

In [18]:
%%time
summary = pd.DataFrame()
temp = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
   
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
  
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    temp.loc['Média',target]  = mean_absolute_error(y_true,media_pred)
    temp.loc['Média por Jogador',target]  = mean_absolute_error(y_true,mediapj_pred)
    temp.loc['Mediana',target]  = mean_absolute_error(y_true,mediana_pred)
    temp.loc['Mediana por Jogador',target]  = mean_absolute_error(y_true,medianapj_pred)
    temp.loc['Naive',target]  = mean_absolute_error(y_true,naive_pred)
    summary.loc['Média',target + " | AMAE"]  =  AMAE(y_true,media_pred,show = False)
    summary.loc['Média por Jogador',target + " | AMAE"]  =  AMAE(y_true,mediapj_pred,show = False)
    summary.loc['Mediana',target + " | AMAE"]  =  AMAE(y_true,mediana_pred,show = False)
    summary.loc['Mediana por Jogador',target + " | AMAE"]  =  AMAE(y_true,medianapj_pred,show = False)
    summary.loc['Naive',target + " | AMAE"]  =  AMAE(y_true,naive_pred,show = False)
    
summary['average | MAE'] = temp.mean(axis=1)
summary['average | AMAE'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})

df_results = df_results.append(summary, ignore_index = True)
df_results

ValueError: Input contains NaN.

### Modelos Lineares

In [13]:
df_train = df_train.drop(columns = ['IdPlayer'])

#### LASSO

In [14]:
%%time
from sklearn.linear_model import Lasso

# defines the model and create the multioutput regressor
model = Lasso(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Lasso | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

CPU times: total: 5 s
Wall time: 15min 28s


Unnamed: 0,model,target1 | AMAE,target2 | AMAE,target3 | AMAE,target4 | AMAE,average | AMAE,average | MAE
0,Média,6843.978819,6398.583724,7121.952787,6823.816072,5437.987665,1.606922
1,Média por Jogador,6414.161377,5524.151229,6761.46272,6119.308259,4964.079988,1.316354
2,Mediana,6900.77556,6587.051921,7190.445719,6912.163273,5518.29097,1.018378
3,Mediana por Jogador,6833.931392,6005.371015,7149.41096,6419.212379,5281.77421,0.945304
4,Lasso | MultiOutput,5067.568427,3949.329318,5768.912968,3629.895034,4603.926437,0.909959


#### Ridge

In [15]:
%%time
from sklearn.linear_model import Ridge

# defines the model and create the multioutput regressor
model = Ridge(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Ridge | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

CPU times: total: 4.64 s
Wall time: 12 s


Unnamed: 0,model,target1 | AMAE,target2 | AMAE,target3 | AMAE,target4 | AMAE,average | AMAE,average | MAE
0,Média,6843.978819,6398.583724,7121.952787,6823.816072,5437.987665,1.606922
1,Média por Jogador,6414.161377,5524.151229,6761.46272,6119.308259,4964.079988,1.316354
2,Mediana,6900.77556,6587.051921,7190.445719,6912.163273,5518.29097,1.018378
3,Mediana por Jogador,6833.931392,6005.371015,7149.41096,6419.212379,5281.77421,0.945304
4,Lasso | MultiOutput,5067.568427,3949.329318,5768.912968,3629.895034,4603.926437,0.909959
5,Ridge | MultiOutput,4895.380424,3919.109459,5695.993922,3599.34686,4527.457666,0.925028


In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2693260 entries, 0 to 2693259
Data columns (total 46 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   target1                  float64       
 1   target2                  float64       
 2   target3                  float64       
 3   target4                  float64       
 4   target1_shift_1          float16       
 5   target2_shift_1          float16       
 6   target3_shift_1          float16       
 7   target4_shift_1          float16       
 8   target1_shift_2          float16       
 9   target4_shift_2          float16       
 10  target1_shift_3          float16       
 11  target1_shift_4          float16       
 12  target1_shift_5          float16       
 13  NuHomeRuns_Player        float16       
 14  NuTotalBases_Player      float16       
 15  NuRbi_Player             float16       
 16  CdDayNight_night         float16       
 17  target2_shift_1          fl

#### ElasticNet

In [17]:
%%time
from sklearn.linear_model import ElasticNet

# defines the model and create the multioutput regressor
model = ElasticNet(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'ElasticNet | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)


# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

CPU times: total: 4.89 s
Wall time: 21min 11s


Unnamed: 0,model,target1 | AMAE,target2 | AMAE,target3 | AMAE,target4 | AMAE,average | AMAE,average | MAE
0,Média,6843.978819,6398.583724,7121.952787,6823.816072,5437.987665,1.606922
1,Média por Jogador,6414.161377,5524.151229,6761.46272,6119.308259,4964.079988,1.316354
2,Mediana,6900.77556,6587.051921,7190.445719,6912.163273,5518.29097,1.018378
3,Mediana por Jogador,6833.931392,6005.371015,7149.41096,6419.212379,5281.77421,0.945304
4,Lasso | MultiOutput,5067.568427,3949.329318,5768.912968,3629.895034,4603.926437,0.909959
5,Ridge | MultiOutput,4895.380424,3919.109459,5695.993922,3599.34686,4527.457666,0.925028
6,ElasticNet | MultiOutput,5039.299119,3940.109588,5749.736272,3616.46379,4586.402192,0.914777


### Tree Models

#### Gradient Boosting

In [18]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

# defines the model and create the multioutput regressor
model = GradientBoostingRegressor(random_state=RANDOM_SEED, loss = 'absolute_error')
model_name = 'GradientBoostingRegressor | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)


# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

CPU times: total: 4.88 s
Wall time: 1h 3min 36s


Unnamed: 0,model,target1 | AMAE,target2 | AMAE,target3 | AMAE,target4 | AMAE,average | AMAE,average | MAE
0,Média,6843.978819,6398.583724,7121.952787,6823.816072,5437.987665,1.606922
1,Média por Jogador,6414.161377,5524.151229,6761.46272,6119.308259,4964.079988,1.316354
2,Mediana,6900.77556,6587.051921,7190.445719,6912.163273,5518.29097,1.018378
3,Mediana por Jogador,6833.931392,6005.371015,7149.41096,6419.212379,5281.77421,0.945304
4,Lasso | MultiOutput,5067.568427,3949.329318,5768.912968,3629.895034,4603.926437,0.909959
5,Ridge | MultiOutput,4895.380424,3919.109459,5695.993922,3599.34686,4527.457666,0.925028
6,ElasticNet | MultiOutput,5039.299119,3940.109588,5749.736272,3616.46379,4586.402192,0.914777
7,GradientBoostingRegressor | MultiOutput,6115.08443,5409.870611,6609.29977,5162.907387,5824.290549,0.708919


## Resultados

In [19]:
df_results

Unnamed: 0,model,target1 | AMAE,target2 | AMAE,target3 | AMAE,target4 | AMAE,average | AMAE,average | MAE
0,Média,6843.978819,6398.583724,7121.952787,6823.816072,5437.987665,1.606922
1,Média por Jogador,6414.161377,5524.151229,6761.46272,6119.308259,4964.079988,1.316354
2,Mediana,6900.77556,6587.051921,7190.445719,6912.163273,5518.29097,1.018378
3,Mediana por Jogador,6833.931392,6005.371015,7149.41096,6419.212379,5281.77421,0.945304
4,Lasso | MultiOutput,5067.568427,3949.329318,5768.912968,3629.895034,4603.926437,0.909959
5,Ridge | MultiOutput,4895.380424,3919.109459,5695.993922,3599.34686,4527.457666,0.925028
6,ElasticNet | MultiOutput,5039.299119,3940.109588,5749.736272,3616.46379,4586.402192,0.914777
7,GradientBoostingRegressor | MultiOutput,6115.08443,5409.870611,6609.29977,5162.907387,5824.290549,0.708919


In [20]:
# for i in range(len(TARGET_COLS)):
#     plt.bar(range(len(regressor.estimators_[i].feature_importances_)), regressor.estimators_[i].feature_importances_)
#     plt.show()
# for i in range(len(TARGET_COLS)):
#     plt.bar(range(len(regressor.estimators_[i].feature_importances_)), regressor.estimators_[i].feature_importances_)


In [21]:
# selected_features = pd.array([])
# for i in range(len(TARGET_COLS)):
#     features = np.where(regressor.estimators_[i].feature_importances_ >= 0.005)
#     selected_features = np.append(selected_features, train.drop(columns=TARGET_COLS).columns[features])
#     # selected_features.extend(list(np.where(regressor.estimators_[i].feature_importances_ > 0.0001)))
# # np.where(regressor.estimators_[0].feature_importances_ > 0.0)
# len(np.unique(selected_features))

# with open('features.pkl', 'wb') as file:
#     pickle.dump(selected_features, file)
# pickle.load(open('features.pkl', 'rb'))

In [22]:
df_train

Unnamed: 0,target1,target2,target3,target4,target1_shift_1,target2_shift_1,target3_shift_1,target4_shift_1,target1_shift_2,target4_shift_2,...,DtDayOfYear,NuHomeRuns_Player,NuInningsPitched_Player,CdDayNight_night,target4_shift_1.1,target4_shift_3,target4_shift_4,target4_shift_5,target4_shift_6,target4_shift_7
0,0.032349,4.394531,0.039185,17.625000,0.014557,4.714844,0.026337,16.171875,0.020096,12.148438,...,8,0.0,0.0,0.0,16.171875,6.644531,19.093750,10.492188,8.539062,16.171875
1,0.016098,1.791016,0.055420,4.628906,0.032349,4.394531,0.039185,17.625000,0.014557,16.171875,...,9,0.0,0.0,0.0,17.625000,12.148438,6.644531,19.093750,10.492188,8.539062
2,0.075623,1.867188,0.023239,8.507812,0.016098,1.791016,0.055420,4.628906,0.032349,17.625000,...,10,0.0,0.0,0.0,4.628906,16.171875,12.148438,6.644531,19.093750,10.492188
3,0.035004,2.285156,0.020096,7.941406,0.075623,1.867188,0.023239,8.507812,0.016098,4.628906,...,11,0.0,0.0,0.0,8.507812,17.625000,16.171875,12.148438,6.644531,19.093750
4,0.018204,0.624023,0.007935,4.925781,0.035004,2.285156,0.020096,7.941406,0.075623,8.507812,...,12,0.0,0.0,0.0,7.941406,4.628906,17.625000,16.171875,12.148438,6.644531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2693255,0.000642,0.182739,0.000482,0.574707,0.000661,0.109497,0.000882,0.545410,0.000955,0.556641,...,208,0.0,0.0,0.0,0.545410,0.377441,0.177612,0.054626,0.205322,0.212036
2693256,0.000515,0.145020,0.000681,0.068176,0.000642,0.182739,0.000482,0.574707,0.000661,0.545410,...,209,0.0,0.0,0.0,0.574707,0.556641,0.377441,0.177612,0.054626,0.205322
2693257,0.000535,0.139160,0.000389,0.079956,0.000515,0.145020,0.000681,0.068176,0.000642,0.574707,...,210,0.0,0.0,0.0,0.068176,0.545410,0.556641,0.377441,0.177612,0.054626
2693258,0.000452,0.193604,0.002073,0.217041,0.000535,0.139160,0.000389,0.079956,0.000515,0.068176,...,211,0.0,0.0,0.0,0.079956,0.574707,0.545410,0.556641,0.377441,0.177612
