# Métrica

Testando métricas nas baselines

## Configuração de Ambiente

In [2]:
# Variáveis de Configuração

# Caminhos de arquivo
raw_data_path = '../data/raw-data/'                 # dados não processados
raw_dataset_path = raw_data_path + 'train_updated.csv'
processed_data_path = '../data/processed-data/'     # dados processados
processed_dataset_path = processed_data_path + 'train.pkl' 
model_path = '../models/trained-models/'

In [3]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

# Metrics
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_squared_log_error, max_error

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# Variáveis/Constantes Globais
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

RANDOM_SEED = 42

TEST_SPLIT_DATE = '2021-04-30'

## Preparação dos Dados

Manter no notebook apenas a preparação pontual dos dados para a aplicação em modelos, toda preparação permanente deve ser adicionada em `/src/data-engineering/data-engineering.ipynb` e então salvo o dataset processado em `/src/data/processed-data/`.

### Funções Auxiliares

Apenas manter aqui funções que serão utilizadas pontualmente no notebook, em caso de funções que serão usadas em outros notebooks, deve-se transferir as mesmas para um arquivo `.py` separado e importar as funções para o notebook.

In [6]:
# Imports

from data_preparation import *

In [7]:
# Funções Auxiliares
pass

### Carregando os dados

In [6]:
%%time
df = pd.read_pickle('../data/processed-data/targets.pkl')

Wall time: 1.87 s


### Preparação

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2695788 entries, 0 to 2695787
Data columns (total 7 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Dt          datetime64[ns]
 1   IdPlayer    int32         
 2   target1     float16       
 3   target2     float16       
 4   target3     float16       
 5   target4     float16       
 6   IdDtPlayer  object        
dtypes: datetime64[ns](1), float16(4), int32(1), object(1)
memory usage: 72.0+ MB


In [8]:
df[TARGET_COLS] = df[TARGET_COLS].astype('float', copy = True)

In [9]:
train, test, val = train_test_split(df)
print(f"Train shape: {train.shape}, Test shape: {test.shape}, Val shape: {val.shape}")

del df

Train shape: (2322747, 7), Test shape: (189612, 7), Val shape: (183429, 7)


## Treinando Modelos

Para o treinamento dos modelos, seguir o template abaixo:
```py
from sklearn.lib_name import ModelName

model = ModelName(**params)

model = # FALTA ALTERAÇÃO
```

In [17]:
df_results_mae = pd.DataFrame(columns = ['model', 'target1', 'target2', 'target3', 'target4', 'average'])
df_results_r2 = pd.DataFrame(columns = ['model', 'target1', 'target2', 'target3', 'target4', 'average'])
df_results_mse = pd.DataFrame(columns = ['model', 'target1', 'target2', 'target3', 'target4', 'average'])
df_results_msle = pd.DataFrame(columns = ['model', 'target1', 'target2', 'target3', 'target4', 'average'])
df_results_max = pd.DataFrame(columns = ['model', 'target1', 'target2', 'target3', 'target4', 'average'])
df_results_amae = pd.DataFrame(columns = ['model', 'target1', 'target2', 'target3', 'target4', 'average'])

### Mean

In [11]:
%%time
train_val = pd.concat([train, val], axis=0)

media = train_val[TARGET_COLS].mean()
media_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].mean()

Wall time: 462 ms


### Median

In [12]:
mediana = train_val[TARGET_COLS].median()
mediana_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].median()

### Naive

In [13]:
naive = train_val[train_val['Dt']=='2021-04-30'].set_index('IdPlayer')[TARGET_COLS]

## Resultados MAE

In [14]:
summary = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    summary.loc['Média',target]  = mean_absolute_error(y_true,media_pred)
    summary.loc['Média por Jogador',target]  = mean_absolute_error(y_true,mediapj_pred)
    summary.loc['Mediana',target]  = mean_absolute_error(y_true,mediana_pred)
    summary.loc['Mediana por Jogador',target]  = mean_absolute_error(y_true,medianapj_pred)
    summary.loc['Naive',target]  = mean_absolute_error(y_true,naive_pred)
    
summary['average'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})
df_results_mae = df_results_mae.append(summary, ignore_index = True)

In [15]:
df_results_mae

Unnamed: 0,model,target1,target2,target3,target4,average
0,Média,1.126845,2.739032,1.068969,1.477763,1.603152
1,Média por Jogador,0.94,2.251021,0.954301,1.025007,1.292582
2,Mediana,0.712802,1.651937,0.498075,1.139847,1.000665
3,Mediana por Jogador,0.702607,1.56062,0.493126,0.925952,0.920576
4,Naive,1.168903,1.808058,0.761273,1.520533,1.314692


## Resultados R2

In [52]:
summary = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    summary.loc['Média',target]  = r2_score(y_true,media_pred)
    summary.loc['Média por Jogador',target]  = r2_score(y_true,mediapj_pred)
    summary.loc['Mediana',target]  = r2_score(y_true,mediana_pred)
    summary.loc['Mediana por Jogador',target]  = r2_score(y_true,medianapj_pred)
    summary.loc['Naive',target]  = r2_score(y_true,naive_pred)
    
summary['average'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})
df_results_r2 = df_results_r2.append(summary, ignore_index = True)

In [53]:
df_results_r2

Unnamed: 0,model,target1,target2,target3,target4,average
0,Média,-0.001037,-0.024916,-0.001893,-0.000544,-0.007097
1,Média por Jogador,0.112572,0.12675,0.045558,0.233887,0.129692
2,Mediana,-0.025188,-0.040817,-0.013006,-0.055599,-0.033652
3,Mediana por Jogador,0.002668,0.141201,0.002468,0.183572,0.082477
4,Naive,-0.505697,-0.2586,-0.445983,-0.320686,-0.382741


## Resultados MSE

In [54]:
summary = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    summary.loc['Média',target]  = mean_squared_error(y_true,media_pred)
    summary.loc['Média por Jogador',target]  = mean_squared_error(y_true,mediapj_pred)
    summary.loc['Mediana',target]  = mean_squared_error(y_true,mediana_pred)
    summary.loc['Mediana por Jogador',target]  = mean_squared_error(y_true,medianapj_pred)
    summary.loc['Naive',target]  = mean_squared_error(y_true,naive_pred)
    
summary['average'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})
df_results_mse = df_results_mse.append(summary, ignore_index = True)

In [55]:
df_results_mse

Unnamed: 0,model,target1,target2,target3,target4,average
0,Média,20.14108,28.567128,19.013621,18.502043,21.555968
1,Média por Jogador,17.855252,24.339806,18.113108,14.166946,18.618778
2,Mediana,20.627006,29.010328,19.224518,19.520122,22.095494
3,Mediana por Jogador,20.066532,23.937027,18.930852,15.097365,19.507944
4,Naive,30.294957,35.080522,27.441424,24.422094,29.309749


## Resultados MSLE

In [56]:
summary = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    summary.loc['Média',target]  =  mean_squared_log_error(y_true,media_pred)
    summary.loc['Média por Jogador',target]  =  mean_squared_log_error(y_true,mediapj_pred)
    summary.loc['Mediana',target]  =  mean_squared_log_error(y_true,mediana_pred)
    summary.loc['Mediana por Jogador',target]  =  mean_squared_log_error(y_true,medianapj_pred)
    summary.loc['Naive',target]  =  mean_squared_log_error(y_true,naive_pred)
    
summary['average'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})
df_results_msle = df_results_msle.append(summary, ignore_index = True)

In [57]:
df_results_msle

Unnamed: 0,model,target1,target2,target3,target4,average
0,Média,0.368684,1.07059,0.361266,0.455539,0.56402
1,Média por Jogador,0.292511,0.677494,0.328938,0.211702,0.377661
2,Mediana,0.320133,0.539415,0.199391,0.448422,0.37684
3,Mediana por Jogador,0.272306,0.425363,0.17576,0.219949,0.273344
4,Naive,0.403208,0.450524,0.26442,0.346134,0.366071


## Resultados Max

In [63]:
summary = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    summary.loc['Média',target]  =  max_error(y_true,media_pred)
    summary.loc['Média por Jogador',target]  =  max_error(y_true,mediapj_pred)
    summary.loc['Mediana',target]  =  max_error(y_true,mediana_pred)
    summary.loc['Mediana por Jogador',target]  =  max_error(y_true,medianapj_pred)
    summary.loc['Naive',target]  =  max_error(y_true,naive_pred)
    
summary['average'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})
df_results_max = df_results_max.append(summary, ignore_index = True)

In [64]:
df_results_max

Unnamed: 0,model,target1,target2,target3,target4,average
0,Média,0.368684,1.07059,0.361266,0.455539,0.56402
1,Média por Jogador,0.292511,0.677494,0.328938,0.211702,0.377661
2,Mediana,0.320133,0.539415,0.199391,0.448422,0.37684
3,Mediana por Jogador,0.272306,0.425363,0.17576,0.219949,0.273344
4,Naive,0.403208,0.450524,0.26442,0.346134,0.366071
5,Média,99.431479,97.543886,99.311949,98.862955,98.787567
6,Média por Jogador,99.547125,99.917896,99.953355,99.603287,99.755416
7,Mediana,99.998933,99.443848,99.998308,99.776611,99.804425
8,Mediana por Jogador,99.995352,100.0,100.0,100.0,99.998838
9,Naive,99.992767,99.986687,100.0,99.795166,99.943655


## Resultados Sugestão Grupo 1

A sugestão da equipe foi que, para cada limite $p_i$ em uma coleção finita ${p_1, p_2, ..., p_n}$, seja computado o erro absoluto médio $a_i$ do modelo apenas para amostras com engajamento maior ou igual a $p_i$ para alguma das variáveis alvo. Depois, devemos calcular a área sob a curva no plano do conjunto de pares $(p_i, a_i)$ obtida por interpolação linear.

In [67]:
for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    
    
    summary.loc['Média',target]  =  AMAE(y_true,media_pred,show = False)
    summary.loc['Média por Jogador',target]  =  AMAE(y_true,mediapj_pred,show = False)
    summary.loc['Mediana',target]  =  AMAE(y_true,mediana_pred,show = False)
    summary.loc['Mediana por Jogador',target]  =  AMAE(y_true,medianapj_pred,show = False)
    summary.loc['Naive',target]  =  AMAE(y_true,naive_pred,show = False)
    
summary['average'] = summary.mean(axis=1)
summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})
df_results_amae = df_results_amae.append(summary, ignore_index = True)

In [68]:
df_results_amae

Unnamed: 0,model,target1,target2,target3,target4,average
0,Média,6873.436067,6399.761968,7113.131961,6821.20951,6801.884876
1,Média por Jogador,6427.357098,5498.021449,6736.991975,6056.429634,6179.700039
2,Mediana,6930.09927,6587.889281,7181.618802,6912.173818,6902.945293
3,Mediana por Jogador,6861.330991,5986.027696,7138.705172,6390.660114,6594.180993
4,Naive,6164.969618,4952.154843,6764.20187,5031.239301,5728.141408
