# Multitask Models

Testa modelos multitarefa (multi-output), que preveem simultaneamente os quatro alvos (`target1` a `target4`).

## Configuração de Ambiente

In [1]:
# Configuration: file-system locations used by this notebook (all relative paths).

# Directories -- each ends with a trailing slash so a file name can be appended directly
raw_data_path = '../data/raw-data/'                 # raw (unprocessed) data
processed_data_path = '../data/processed-data/'     # processed data
model_path = '../models/trained-models/'

# Files inside the directories above
raw_dataset_path = raw_data_path + 'train_updated.csv'
processed_dataset_path = processed_data_path + 'train.pkl'

In [2]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

from models import *

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Global variables / constants

# Names of the target columns; used to split features from labels in every model cell
TARGET_COLS = [
    'target1',
    'target2',
    'target3',
    'target4',
]

# Seed handed to each estimator's `random_state` argument
RANDOM_SEED = 42

# NOTE(review): presumably the train/test cut-off date -- confirm where it is consumed
TEST_SPLIT_DATE = '2021-04-30'

In [4]:
# Carregando o dataframe
df_train = pd.read_pickle(processed_dataset_path)
df_train.head()

Unnamed: 0,Dt,IdPlayer,target1,target2,target3,target4,target1_shift_1,target2_shift_1,target3_shift_1,target4_shift_1,...,NuWildPitches,NuInheritedRunners,NuInheritedRunnersScored,NuSaves,NuHolds,NuBlownSaves,NuAssists,NuPutOuts,NuErrors,NuChances
7,2018-01-08,112526,0.032349,4.394531,0.039185,17.625,0.014557,4.714844,0.026337,16.171875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2018-01-09,112526,0.016098,1.791016,0.05542,4.628906,0.032349,4.394531,0.039185,17.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2018-01-10,112526,0.075623,1.867188,0.023239,8.507812,0.016098,1.791016,0.05542,4.628906,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,2018-01-11,112526,0.035004,2.285156,0.020096,7.941406,0.075623,1.867188,0.023239,8.507812,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2018-01-12,112526,0.018204,0.624023,0.007935,4.925781,0.035004,2.285156,0.020096,7.941406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preparação dos Dados

Manter no notebook apenas a preparação pontual dos dados para a aplicação nos modelos. Toda preparação permanente deve ser adicionada em `/src/data-engineering/data-engineering.ipynb`, e o dataset processado deve então ser salvo em `/src/data/processed-data/`.

### Funções Auxiliares

Manter aqui apenas funções que serão utilizadas pontualmente neste notebook. Funções que serão usadas em outros notebooks devem ser transferidas para um arquivo `.py` separado e importadas no notebook.

In [5]:
# Funções Auxiliares

# Imports
from data_preparation import *

# Funções Auxiliares


### Preparação

### Train Test Split

In [6]:
# Split the loaded dataframe into three subsets using the project helper `train_test_split`.
# NOTE(review): the result is unpacked as (train, test, val) -- confirm this matches the
# helper's actual return order, since (train, val, test) is the more common convention.
train, test, val = train_test_split(df_train)

In [7]:
# Replace the `Dt` column of every split with its numeric representation
# (parsed as datetime first, then converted with `pd.to_numeric`).
for split_frame in (train, test, val):
    parsed_dates = pd.to_datetime(split_frame['Dt'])
    split_frame['Dt'] = pd.to_numeric(parsed_dates)

## Treinando Modelos

In [8]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.multioutput import MultiOutputRegressor

# Empty results table: one row per model, one score column per target plus their average.
# The target columns are derived from TARGET_COLS so the table cannot drift from the constant.
df_results = pd.DataFrame(columns=['model', *TARGET_COLS, 'average'])

### Baselines

In [9]:
# Baselines computed from the training split only.

# `Dt` was converted to a numeric column in an earlier cell, so comparing it with a
# date *string* never matches and silently yields an empty frame. Convert the split
# date with the exact same to_datetime -> to_numeric pipeline before comparing.
split_dt_numeric = pd.to_numeric(pd.to_datetime(pd.Series([TEST_SPLIT_DATE]))).iloc[0]

# naive: each player's targets on the split date
# NOTE(review): assumes `train` contains rows dated TEST_SPLIT_DATE -- confirm against
# the split helper; otherwise `naive` is legitimately empty.
naive = train[train['Dt'] == split_dt_numeric].set_index('IdPlayer')[TARGET_COLS]

# NOTE(review): the saved output of this cell shows a NumPy reduction warning
# (`umr_sum`), presumably an overflow caused by low-precision floats. Upcasting to
# float64 before reducing avoids that; confirm the dtype of the target columns.
train_targets = train[TARGET_COLS].astype('float64')
player_ids = train['IdPlayer']

# mean: global and per player
media = train_targets.mean()
media_por_jogador = train_targets.groupby(player_ids).mean()

# median: global and per player
mediana = train_targets.median()
mediana_por_jogador = train_targets.groupby(player_ids).median()

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [10]:
# NOTE: the baseline evaluation below is disabled (every line is commented out), so no
# baseline rows are added to `df_results`.
# NOTE(review): before re-enabling, confirm that
#   * `mean_absolute_error` is in scope -- the visible cells import it only under the
#     alias `mae`, unless one of the star imports provides it;
#   * `DataFrame.append` exists in the installed pandas (it was removed in pandas 2.0);
#   * `naive` is non-empty, since it is mapped onto the test players below.

# summary = pd.DataFrame()

# for target in TARGET_COLS:
    
#     y_true = test[target]
    
#     mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
#     medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
#     naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
#     mediana_pred = [mediana[target] for i in test.index]
#     media_pred = [media[target] for i in test.index]
    

#     summary.loc['Média',target]  = mean_absolute_error(y_true,media_pred)
#     summary.loc['Média por Jogador',target]  = mean_absolute_error(y_true,mediapj_pred)
#     summary.loc['Mediana',target]  = mean_absolute_error(y_true,mediana_pred)
#     summary.loc['Mediana por Jogador',target]  = mean_absolute_error(y_true,medianapj_pred)
#     summary.loc['Naive',target]  = mean_absolute_error(y_true,naive_pred)
    
# summary['average'] = summary.mean(axis=1)

# summary = summary.reset_index()
# summary = summary.rename(columns = {"index": "model"})
# df_results = df_results.append(summary, ignore_index = True)
# df_results

### Modelos Lineares

#### LASSO

In [11]:
%%time
from sklearn.linear_model import Lasso

# defines the model and create the multioutput regressor
model = Lasso(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Lasso | MultiOutput'
regressor = MultiOutputRegressor(model)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)

# save the results
df_results = df_results.append({'model': model_name, **mae}, ignore_index=True)
df_results[df_results['model'] == model_name]

CPU times: total: 11min 8s
Wall time: 4min 31s


Unnamed: 0,model,target1,target2,target3,target4,average
0,Lasso | MultiOutput,0.780746,1.344433,0.703082,0.74622,0.89362


#### Ridge

In [12]:
from sklearn.linear_model import Ridge

# Define the base estimator and wrap it so that one regressor is fitted per target column
# NOTE(review): the saved output of this cell shows warnings raised from
# `linalg.solve`, presumably because the system is ill-conditioned (features are
# not scaled) -- consider standardizing the inputs before fitting.
model = Ridge(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Ridge | MultiOutput'
regressor = MultiOutputRegressor(model)

# Fit on every non-target column of the training split
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# Predict on the test split. The test index is reused so the predictions stay
# row-aligned with `test[TARGET_COLS]` in case `evaluate_mae` aligns by label.
y_pred = pd.DataFrame(
    regressor.predict(test.drop(columns=TARGET_COLS)),
    columns=TARGET_COLS,
    index=test.index,
)

# Evaluate. Stored as `mae_scores` rather than `mae` so the `mae` function alias
# imported from sklearn.metrics is not overwritten by a dict of scores.
mae_scores = evaluate_mae(test[TARGET_COLS], y_pred)

# Save the results. `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`.
# Any previous row for this model is dropped first, which keeps the cell idempotent
# when it is re-run.
new_row = pd.DataFrame([{'model': model_name, **mae_scores}])
df_results = pd.concat(
    [df_results[df_results['model'] != model_name], new_row],
    ignore_index=True,
)
df_results[df_results['model'] == model_name]

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Unnamed: 0,model,target1,target2,target3,target4,average
1,Ridge | MultiOutput,0.784643,1.331393,0.729669,0.756929,0.900659


#### ElasticNet

In [13]:
%%time
from sklearn.linear_model import ElasticNet

# defines the model and create the multioutput regressor
model = ElasticNet(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'ElasticNet | MultiOutput'
regressor = MultiOutputRegressor(model)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)

# save the results
df_results = df_results.append({'model': model_name, **mae}, ignore_index=True)
df_results[df_results['model'] == model_name]

CPU times: total: 7min 31s
Wall time: 3min 38s


Unnamed: 0,model,target1,target2,target3,target4,average
2,ElasticNet | MultiOutput,0.782295,1.34296,0.710303,0.747843,0.89585


### Tree Models

In [14]:
%%time
from sklearn.tree import DecisionTreeRegressor

# defines the model and create the multioutput regressor
model = DecisionTreeRegressor(random_state=RANDOM_SEED)
model_name = 'DecisionTreeRegressor | MultiOutput'
regressor = MultiOutputRegressor(model)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)

# save the results
df_results = df_results.append({'model': model_name, **mae}, ignore_index=True)
df_results[df_results['model'] == model_name]

CPU times: total: 12min 10s
Wall time: 16min 22s


Unnamed: 0,model,target1,target2,target3,target4,average
3,DecisionTreeRegressor | MultiOutput,2.361119,2.124396,1.964909,1.574683,2.006277


#### Gradient Boosting

In [15]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

# defines the model and create the multioutput regressor
model = GradientBoostingRegressor(random_state=RANDOM_SEED)
model_name = 'GradientBoostingRegressor | MultiOutput'
regressor = MultiOutputRegressor(model)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)

# save the results
df_results = df_results.append({'model': model_name, **mae}, ignore_index=True)
df_results[df_results['model'] == model_name]

CPU times: total: 1h 46min 58s
Wall time: 2h 6min 14s


Unnamed: 0,model,target1,target2,target3,target4,average
4,GradientBoostingRegressor | MultiOutput,0.73111,1.066651,0.666981,0.726966,0.797927


## Resultados

In [16]:
# Display the accumulated comparison table: one row per model with the per-target
# scores returned by `evaluate_mae` and their average.
df_results

Unnamed: 0,model,target1,target2,target3,target4,average
0,Lasso | MultiOutput,0.780746,1.344433,0.703082,0.74622,0.89362
1,Ridge | MultiOutput,0.784643,1.331393,0.729669,0.756929,0.900659
2,ElasticNet | MultiOutput,0.782295,1.34296,0.710303,0.747843,0.89585
3,DecisionTreeRegressor | MultiOutput,2.361119,2.124396,1.964909,1.574683,2.006277
4,GradientBoostingRegressor | MultiOutput,0.73111,1.066651,0.666981,0.726966,0.797927
