## Libraries

In [1]:
import random
import pickle

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# models
import sklearn
import sktime
import keras
import statsmodels.api as sm
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from tensorflow import keras
from tensorflow.keras import layers

# model selection and metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

# constants
# Directory prefix for the processed datasets (meant to be string-concatenated
# with a file name, hence the trailing slash).
PROCESSED_DATA_PATH = '../data/processed-data/'
# Directory prefix for trained models -- presumably used further down the
# notebook; not referenced in the cells visible here.
MODEL_PATH = '../models/trained-models/'
# Columns treated as prediction targets by the plotting helpers and baselines.
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']
# NOTE(review): same value as the upper bound of the validation range in the
# Data Prep cell -- presumably the last day before the test period; confirm.
TEST_SPLIT_DATE = '2021-04-30'

# disable warnings
import warnings
# Suppress DeprecationWarning and FutureWarning messages for the whole kernel.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None  # default='warn'

## Auxiliary Functions

In [3]:
def plot_time_series(df, groupby = 'median', cols = TARGET_COLS):
    """Draw one line chart per target showing its per-date aggregate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Dt' column and every column listed in ``cols``.
    groupby : str or callable, default 'median'
        Aggregation applied to each 'Dt' group; anything accepted by
        ``GroupBy.agg`` (e.g. 'median', 'mean', 'max').
    cols : list of str, default TARGET_COLS
        Columns to aggregate and plot, one facet each.

    Returns
    -------
    None
        The figure is drawn as a side effect; ``df`` is not modified.
    """
    # Select the columns *before* aggregating so that unrelated (possibly
    # non-numeric) columns never reach the aggregation function.
    per_date = df.groupby('Dt')[list(cols)].agg(groupby)

    # Long format: one row per (date, target) pair, with 'Dt' back as a column.
    df_melted = per_date\
        .melt(var_name='target', value_name='value', ignore_index=False)\
        .reset_index()

    sns.set(rc={'figure.figsize':(10,20)})
    grid = sns.FacetGrid(df_melted, col='target', col_wrap=2, height=9, aspect=2, sharey=False)
    grid.map(sns.lineplot, 'Dt', 'value')
    
def plot_train_test_pred(train, test, pred = pd.DataFrame(), groupby = 'median', cols = TARGET_COLS):
    """Plot the per-date aggregate of each target, coloured by data split.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain a 'Dt' column and every column listed in ``cols``.
    pred : pandas.DataFrame, optional
        Predictions with the same columns; skipped when empty or None.
    groupby : str or callable, default 'median'
        Aggregation applied to each (date, split) group; anything accepted
        by ``GroupBy.agg``.
    cols : list of str, default TARGET_COLS
        Columns to aggregate and plot, one facet each.

    Returns
    -------
    None
        The figure is drawn as a side effect. None of the input frames, nor
        ``cols``, is modified.
    """
    # Work on a copy: appending to `cols` would mutate the shared default
    # list (TARGET_COLS) and break every later use of it.
    value_cols = list(cols)

    # `assign` returns new frames, so the caller's data (and the shared
    # default `pred` object) are left untouched.
    parts = [train.assign(type='train'), test.assign(type='test')]
    if pred is not None and not pred.empty:
        parts.append(pred.assign(type='pred'))
    df = pd.concat(parts, axis=0)

    # Group by split as well as date: 'pred' and 'test' share dates and must
    # not be merged, and the string 'type' column cannot be aggregated.
    df_melted = df.groupby(['Dt', 'type'])[value_cols].agg(groupby)\
        .reset_index()\
        .melt(id_vars=['Dt', 'type'], var_name='target', value_name='value')

    sns.set(rc={'figure.figsize':(10,20)})
    # Hue is declared on the grid; FacetGrid.map only forwards positional
    # column data, so a `hue='type'` keyword could not be resolved there.
    grid = sns.FacetGrid(df_melted, col='target', hue='type', col_wrap=2, height=9, aspect=2, sharey=False)
    grid.map(sns.lineplot, 'Dt', 'value')
    grid.add_legend()

## Data Prep

In [4]:
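# One-off load of the full processed dataset; kept commented out for reference
# because the pre-split CSV files are read directly in the next cell.
#df = pd.read_pickle(PROCESSED_DATA_PATH + 'shifted_targets.pkl')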

In [5]:
# Provenance (kept commented out): how the three CSV files read below were
# produced -- `df` was split on 'Dt' into consecutive date ranges
# (train: 2018-01-01..2021-01-31, validation: 2021-02-01..2021-04-30,
# test: 2021-05-01..2021-07-31) and each part was written out once.
#train = df[(df.Dt <= "2021-01-31") & (df.Dt >= "2018-01-01")] 
#val = df[(df.Dt <= "2021-04-30") & (df.Dt >= "2021-02-01")] 
#test = df[(df.Dt <= "2021-07-31") & (df.Dt >= "2021-05-01")]

#train.to_csv('train.csv', index=None)
#val.to_csv('validation.csv', index=None) 
#test.to_csv('test.csv', index=None)

# Load the pre-split datasets; the paths are relative to the working directory.
train = pd.read_csv('train.csv')
val = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')

## Baselines

In [7]:
# train + validation dataset, stacked row-wise
train_val = pd.concat((train, val))

# Empty results grid: one row per baseline strategy, one column per target.
summary = pd.DataFrame(
    index=['Média', 'Média por Jogador', 'Mediana', 'Mediana por Jogador', 'Naive'],
    columns=TARGET_COLS,
)

In [13]:
# Sanity check: stacking train and validation must not drop or duplicate rows.
len(train) + len(val) == len(train_val)

True

In [14]:
# Global baselines: one constant per target, computed over train + validation.
media = train_val[TARGET_COLS].mean()
# Per-player baselines: one value per (IdPlayer, target).
media_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].mean()
mediana = train_val[TARGET_COLS].median()
mediana_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].median()
# Naive baseline: each player's target values on the split date. Uses the
# TEST_SPLIT_DATE constant rather than repeating the date literal, so the
# baseline cannot drift from the configured split.
# NOTE(review): players without a row on that date get no naive value.
naive = train_val[train_val['Dt'] == TEST_SPLIT_DATE].set_index('IdPlayer')[TARGET_COLS]

In [15]:
# Score every baseline on the test set with mean absolute error (MAE).
n_test = len(test)  # loop-invariant: length of the constant prediction vectors

for target in TARGET_COLS:

    y_true = test[target]

    # One prediction vector per baseline, keyed by its row label in `summary`.
    # Per-player baselines look up each test row's IdPlayer; a player absent
    # from the lookup yields NaN, which mean_absolute_error rejects.
    baseline_preds = {
        'Média': np.full(n_test, media[target]),
        'Média por Jogador': test['IdPlayer'].map(media_por_jogador[target].to_dict()),
        'Mediana': np.full(n_test, mediana[target]),
        'Mediana por Jogador': test['IdPlayer'].map(mediana_por_jogador[target].to_dict()),
        'Naive': test['IdPlayer'].map(naive[target].to_dict()),
    }

    for baseline, y_pred in baseline_preds.items():
        summary.loc[baseline, target] = mean_absolute_error(y_true, y_pred)

# Average only the target columns, so that re-running this cell never folds a
# previously computed 'Average' column into the new one; cast to float because
# `summary` was created empty and therefore holds object-dtype columns.
summary['Average'] = summary[TARGET_COLS].astype(float).mean(axis=1)

In [16]:
# Display the MAE of each baseline (rows) per target (columns) and its row average.
summary

Unnamed: 0,target1,target2,target3,target4,Average
Média,1.126843,2.739029,1.068968,1.477766,1.603152
Média por Jogador,0.939999,2.251019,0.9543,1.025011,1.292582
Mediana,0.712801,1.651943,0.498075,1.139852,1.000668
Mediana por Jogador,0.702606,1.56062,0.493126,0.925954,0.920577
Naive,1.168903,1.808041,0.761283,1.520494,1.31468
