# Models
(arranjar um nome melhor para o notebook)

Notebook para o treinamento de modelos

In [1]:
import random

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# models
import sklearn
import sktime
import keras
import statsmodels.api as sm
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from tensorflow import keras
from tensorflow.keras import layers

# model selection and metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import MeanAbsoluteError

# variables
processed_data_path = '../data/processed-data/'

2023-03-25 18:44:27.113971: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-25 18:44:27.219147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-25 18:44:27.219164: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-25 18:44:27.788334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

## Auxiliary Functions

In [2]:
def plot_time_series(df, groupby = 'median', cols = ['target1', 'target2', 'target3', 'target4']):
    df_melted = df.groupby('Dt').median()[cols]\
    .melt(var_name='target', value_name='value', ignore_index=False)
    df_melted.reset_index(inplace=True)

    sns.set(rc={'figure.figsize':(10,20)})
    sns.FacetGrid(df_melted, col='target', col_wrap=2, height=9,aspect=2, sharey=False)\
    .map(sns.lineplot, 'Dt', 'value');

In [3]:
def plot_train_test_pred(train, test, pred = pd.DataFrame(), groupby = 'median', cols = ['target1', 'target2', 'target3', 'target4']):
    train['type'] = 'train'
    test['type'] = 'test'
    pred['type'] = 'pred'
    
    cols.append('type')

    df = pd.concat([train, test, pred], axis=0)
    df_melted = df.groupby('Dt').median()[cols]\
        .melt(var_name='target', value_name='value', ignore_index=False)
    df_melted.reset_index(inplace=True)
    
    sns.set(rc={'figure.figsize':(10,20)})
    sns.FacetGrid(df_melted, col='target', col_wrap=2, height=9,aspect=2, sharey=False)\
        .map(sns.lineplot, 'Dt', 'value', hue='type');

## Data Preparation

Apenas realizar modificações necessárias para os modelos como *splitting* dos datasets e normalização. Toda a limpeza de dados deve ser feita em **/src/data-engineering/data-preparation**

### Loading the data

In [4]:
df_targets = pd.read_pickle(processed_data_path + 'targets.pkl')

### Train Test Split

In [5]:
target_cols = ['target1', 'target2', 'target3', 'target4']
train = df_targets[df_targets['Dt'] <= '2021-04-30']
y_train = train[target_cols]
x_train = train.drop(target_cols, axis=1)
test = df_targets[df_targets['Dt'] > '2021-04-30']
y_test = test[target_cols]
x_test = test.drop(target_cols, axis=1)
del df_targets

In [6]:
train.shape

(2506176, 7)

In [7]:
test.shape

(189612, 7)

### Data Normalization

## Training

In [8]:
def train_model(model, train, test, y_cols = ['target1', 'target2', 'target3', 'target4']):
    results = {}
    for col in y_cols:
        y_train = train[col].set_axis(train['Dt'])
        y_test = test[col].set_axis(test['Dt'])
        model.fit(y_train)
        pred = model.predict(y_test)
        results[col] = pred
    return results

In [9]:
def train_baseline_model(model, train, fh_size, y_cols = ['target1', 'target2', 'target3', 'target4']):
    results = {}
    fh = np.arange(len(fh_size)) + 1
    for col in y_cols:
        y_train = train[col].set_axis(train['Dt'])
        model.fit(y_train)
        pred = model.predict(fh)
        results[col] = pred
    return results

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2506176 entries, 0 to 2506175
Data columns (total 7 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Dt          datetime64[ns]
 1   IdPlayer    int64         
 2   target1     float32       
 3   target2     float32       
 4   target3     float32       
 5   target4     float32       
 6   IdDtPlayer  object        
dtypes: datetime64[ns](1), float32(4), int64(1), object(1)
memory usage: 114.7+ MB


In [11]:
# # Naive
# naive = NaiveForecaster(strategy='last').fit(train)
# y_pred_naive = naive.predict(test)

# # Mean
# mean = NaiveForecaster(strategy="mean").fit(train)
# y_pred_mean = mean.predict(test)

# # Drift
# drift = NaiveForecaster(strategy="drift").fit(train)
# y_pred_drift = drift.predict(test)

### Baseline Models

In [30]:
def get_last_value(train, dt, id_player, target):
    last_date = train['Dt'].max()
    return train[train['IdDtPlayer'] == str(last_date) + '_' + str(id_player)]
    # return train[train['Dt'] == last_date][id_player].values[0]

def NaiveForecaster(train, x_test):
    y_pred = pd.DataFrame()
    for idx, row in x_test.iterrows():
        pred_targets = {}
        id_player = row['IdPlayer']
        dt = row['Dt']
        for target in target_cols:
            pred_targets[target] = get_last_value(train, dt, id_player, target)
        y_pred = y_pred.concat(pred_targets, ignore_index=True)

    return y_pred

### testing the functions

In [21]:
print("\ntrain[train['IdPlayer'] == 663399].head(1)")
print(train[train['IdPlayer'] == 663399].head(1))
print("\ntest[test['IdPlayer'] == 663399].head(1)")
print(test[test['IdPlayer'] == 663399].head(1))
print("\ntrain[train['IdDtPlayer'] == str(train['Dt'].max()) + '_663399']")
print(train[train['IdDtPlayer'] == str(train['Dt'].max()) + '_663399'])
print("\nget_last_value(train, '2021-05-02', 663399, 'target1')")
print(get_last_value(train, '2021-05-02', 663399, 'target1'))


train[train['IdPlayer'] == 663399].head(1)
             Dt  IdPlayer  target1  target2  target3   target4  \
2059 2018-01-01    663399      0.0      0.0      0.0  0.098039   

                      IdDtPlayer  
2059  2018-01-01 00:00:00_663399  

test[test['IdPlayer'] == 663399].head(1)
                Dt  IdPlayer   target1   target2  target3   target4  \
2508214 2021-05-01    663399  0.000409  0.107825      0.0  0.158343   

                         IdDtPlayer  
2508214  2021-05-01 00:00:00_663399  

train[train['IdDtPlayer'] == str(train['Dt'].max()) + '_663399']
                Dt  IdPlayer   target1   target2  target3   target4  \
2505410 2021-04-30    663399  0.000263  0.089871      0.0  0.136565   

                         IdDtPlayer  
2505410  2021-04-30 00:00:00_663399  

get_last_value(train, '2021-05-02', 663399, 'target1')
                Dt  IdPlayer   target1   target2  target3   target4  \
2505410 2021-04-30    663399  0.000263  0.089871      0.0  0.136565   

        

In [31]:
y_pred_naive = NaiveForecaster(train, x_test)
y_pred_naive

AttributeError: 'DataFrame' object has no attribute 'concat'

### .

## Model Evaluation