# Multitask Models

Testa modelos de multitasking

## Configuração de Ambiente

In [1]:
# Variáveis de Configuração

# Caminhos de arquivo
raw_data_path = '../data/raw-data/'                 # dados não processados
raw_dataset_path = raw_data_path + 'train_updated.csv'
processed_data_path = '../data/processed-data/'     # dados processados
processed_dataset_path = processed_data_path + 'train.pkl' 
model_path = '../models/trained-models/'

In [2]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

from matplotlib import pyplot as plt

from models import *

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
warnings.filterwarnings("ignore")

In [3]:
# Variáveis/Constantes Globais
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

RANDOM_SEED = 42

TEST_SPLIT_DATE = '2021-04-30'

In [4]:
# Carregando o dataframe
df_train = pd.read_pickle(processed_dataset_path)
df_train.head()

Unnamed: 0,Dt,IdPlayer,target1,target2,target3,target4,IdDtPlayer,target1_shift_1,target2_shift_1,target3_shift_1,...,CdDoubleHeader_N,CdDoubleHeader_S,CdDoubleHeader_Y,CdDayNight_day,CdDayNight_night,FlgWinnerHomeTeam_False,FlgWinnerHomeTeam_True,FlgWinnerAwayTeam_False,FlgWinnerAwayTeam_True,FlgTie_0.0
0,2018-01-08,112526,0.032349,4.394531,0.039185,17.625,2018-01-08 00:00:00_112526,0.014557,4.714844,0.026337,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-01-09,112526,0.016098,1.791016,0.05542,4.628906,2018-01-09 00:00:00_112526,0.032349,4.394531,0.039185,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-01-10,112526,0.075623,1.867188,0.023239,8.507812,2018-01-10 00:00:00_112526,0.016098,1.791016,0.05542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-01-11,112526,0.035004,2.285156,0.020096,7.941406,2018-01-11 00:00:00_112526,0.075623,1.867188,0.023239,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-01-12,112526,0.018204,0.624023,0.007935,4.925781,2018-01-12 00:00:00_112526,0.035004,2.285156,0.020096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preparação dos Dados

Manter no notebook apenas a preparação pontual dos dados para a aplicação em modelos, toda preparação permanente deve ser adicionada em `/src/data-engineering/data-engineering.ipynb` e então salvo o dataset processado em `/src/data/processed-data/`.

### Funções Auxiliares

Apenas manter aqui funções que serão utilizadas pontualmente no notebook, em caso de funções que serão usadas em outros notebooks, deve-se transferir as mesmas para um arquivo `.py` separado e importar as funções para o notebook.

In [5]:
# Funções Auxiliares

# Imports
from data_preparation import *

# Funções Auxiliares


### Preparação

In [6]:
#Arruma float16 para conseguir rodar baselines
df_train[TARGET_COLS] = df_train[TARGET_COLS].astype('float', copy = True)

In [7]:
with open('features.pkl', 'rb') as f:
    selected_features = pkl.load(f)
df_train = df_train[np.append(np.append(TARGET_COLS, selected_features), 'IdPlayer')]
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2693260 entries, 0 to 2693259
Data columns (total 47 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   target1                  float64       
 1   target2                  float64       
 2   target3                  float64       
 3   target4                  float64       
 4   target1_shift_1          float16       
 5   target2_shift_1          float16       
 6   target3_shift_1          float16       
 7   target4_shift_1          float16       
 8   target1_shift_2          float16       
 9   target4_shift_2          float16       
 10  target1_shift_3          float16       
 11  target1_shift_4          float16       
 12  target1_shift_5          float16       
 13  NuHomeRuns_Player        float16       
 14  NuTotalBases_Player      float16       
 15  NuRbi_Player             float16       
 16  CdDayNight_night         float16       
 17  target2_shift_1          fl

### Train Test Split

In [8]:
train, test, val = train_test_split(df_train)

In [9]:
train['Dt'] = pd.to_numeric(pd.to_datetime(train['Dt']))
test['Dt']= pd.to_numeric(pd.to_datetime(test['Dt']))
val['Dt'] = pd.to_numeric(pd.to_datetime(val['Dt']))

## Treinando Modelos

In [10]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.multioutput import MultiOutputRegressor

df_results = pd.DataFrame(columns = ['model', 'target1 | AMAE', 'target2 | AMAE', 'target3 | AMAE', 'target4 | AMAE', 'average | AMAE', 'average | MAE'])

### Baselines

In [11]:
# naive
naive = train[train['Dt']=='2021-04-30'].set_index('IdPlayer')[TARGET_COLS]

# media
media = train[TARGET_COLS].mean()
media_por_jogador = train.groupby('IdPlayer')[TARGET_COLS].mean()

# mediana
mediana = train[TARGET_COLS].median()
mediana_por_jogador = train.groupby('IdPlayer')[TARGET_COLS].median()

In [12]:
%%time
summary = pd.DataFrame()
temp = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
   
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
  
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    temp.loc['Média',target]  = mean_absolute_error(y_true,media_pred)
    temp.loc['Média por Jogador',target]  = mean_absolute_error(y_true,mediapj_pred)
    temp.loc['Mediana',target]  = mean_absolute_error(y_true,mediana_pred)
    temp.loc['Mediana por Jogador',target]  = mean_absolute_error(y_true,medianapj_pred)
    #temp.loc['Naive',target]  = mean_absolute_error(y_true,naive_pred)
    summary.loc['Média',target + " | AMAE"]  =  AMAE(y_true,media_pred,show = False)
    summary.loc['Média por Jogador',target + " | AMAE"]  =  AMAE(y_true,mediapj_pred,show = False)
    summary.loc['Mediana',target + " | AMAE"]  =  AMAE(y_true,mediana_pred,show = False)
    summary.loc['Mediana por Jogador',target + " | AMAE"]  =  AMAE(y_true,medianapj_pred,show = False)
    #summary.loc['Naive',target + " | AMAE"]  =  AMAE(y_true,naive_pred,show = False)
    
summary['average | MAE'] = temp.mean(axis=1)
summary['average | AMAE'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})

df_results = df_results.append(summary, ignore_index = True)
df_results

### Modelos Lineares

In [None]:
df_train = df_train.drop(columns = ['IdPlayer'])

#### LASSO

In [None]:
%%time
from sklearn.linear_model import Lasso

# defines the model and create the multioutput regressor
model = Lasso(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Lasso | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

#### Ridge

In [None]:
%%time
from sklearn.linear_model import Ridge

# defines the model and create the multioutput regressor
model = Ridge(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Ridge | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

In [None]:
df_train.info()

#### ElasticNet

In [None]:
%%time
from sklearn.linear_model import ElasticNet

# defines the model and create the multioutput regressor
model = ElasticNet(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'ElasticNet | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)


# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

### Tree Models

#### Gradient Boosting

In [None]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

# defines the model and create the multioutput regressor
model = GradientBoostingRegressor(random_state=RANDOM_SEED, loss = 'absolute_error')
model_name = 'GradientBoostingRegressor | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)


# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

## Resultados

In [None]:
df_results

In [None]:
# for i in range(len(TARGET_COLS)):
#     plt.bar(range(len(regressor.estimators_[i].feature_importances_)), regressor.estimators_[i].feature_importances_)
#     plt.show()
# for i in range(len(TARGET_COLS)):
#     plt.bar(range(len(regressor.estimators_[i].feature_importances_)), regressor.estimators_[i].feature_importances_)


In [None]:
# selected_features = pd.array([])
# for i in range(len(TARGET_COLS)):
#     features = np.where(regressor.estimators_[i].feature_importances_ >= 0.005)
#     selected_features = np.append(selected_features, train.drop(columns=TARGET_COLS).columns[features])
#     # selected_features.extend(list(np.where(regressor.estimators_[i].feature_importances_ > 0.0001)))
# # np.where(regressor.estimators_[0].feature_importances_ > 0.0)
# len(np.unique(selected_features))

# with open('features.pkl', 'wb') as file:
#     pickle.dump(selected_features, file)
# pickle.load(open('features.pkl', 'rb'))