> Fundação Getúlio Vargas - RJ <br>
> Escola de Matemática Aplicada (EMAp) <br>
> Graduação em Ciência de Dados e Inteligência Artificial <br>
> Alunos: Gianlucca Devigili e Maisa de O. Fraiz <br>
# Projetos em Ciência de Dados - A2

## Introdução

## Instruções de Execução

In [1]:
# Variáveis de configuração
# Variável que define se o dataset será carregado de um .csv ou de um .pkl
load_from_csv = False

# Caminho para o dataset
raw_data_path = '../data/raw-data/'
dataset_path = raw_data_path + 'train_updated.csv'

# Caminho onde serão salvos os dados processados
processed_data_path = '../data/processed-data/'
processed_dataset_path = processed_data_path + 'train.pkl' 

# Prepare data
# Variável que define se o dataset será processado ou se será carregado de um .pkl
prepare_data = True

# Varíavel que define se vai realizar o feature selection ou vai carregar de um .pkl
feature_selection = False

## Setup Inicial

In [2]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

from pandas.api.types import is_datetime64_any_dtype as is_datetime

from functions import *

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error

from sklearn.multioutput import MultiOutputRegressor


# disable warnings
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'


In [3]:
# variáveis globais

PROCESSED_DATA_PATH = '../data/processed-data/'
MODEL_PATH = '../models/trained-models/'

TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

RANDOM_SEED = 42

TEST_SPLIT_DATE = '2021-04-30'

## Funções auxiliares

Comentar que tá em outro arquivo

## Carga e Tratamento de Dados

In [5]:
# dataframes
dataset_names = {
    'Awards': 'awards.csv', 
    'Example': 'example_test.csv', 
    'Players': 'players.csv',
    'Seasons': 'seasons.csv', 
    'Teams': 'teams.csv', 
    'Train': 'train_updated.csv'
}
for key in dataset_names:
    dataset_names[key] = raw_data_path + dataset_names[key]

### Carregando o dataset Train Updated

In [23]:
%%time
if load_from_csv:
    df_train = pd.read_csv(dataset_names['Train'])
    pd.to_pickle(df_train, raw_data_path + 'train.pkl')
else:
    df_train = pd.read_pickle(raw_data_path + 'train.pkl')

Wall time: 2min 18s


#### Targets

In [7]:
%%time
if prepare_data:

    # criação do dataset de targets

    # unpack the data
    Y = unpack_data(df_train, dfs = ['nextDayPlayerEngagement'])['nextDayPlayerEngagement']

    # change datatypes
    Y = Y.astype({name: np.float32 for name in ["target1", "target2", "target3", "target4"]})

    # match target dates to feature dates and create date index
    Y = Y.rename(columns={'engagementMetricsDate': 'date'})

    # change datatypes
    Y['date'] = pd.to_datetime(Y['date'])

    # reset index
    Y = Y.set_index('date').to_period('D')
    Y.index = Y.index - 1
    Y = Y.reset_index()

    # rename and select columns
    cols_Y = {
        'date': 'Dt',
        'playerId': 'IdPlayer',
        'target1': 'target1',
        'target2': 'target2',
        'target3': 'target3',
        'target4': 'target4'
    }
    Y = Y[list(cols_Y)]
    Y.columns = list(cols_Y.values())
    Y['Dt'] = Y['Dt'].astype('datetime64[ns]')
    Y = create_id(Y, ['Dt', 'IdPlayer'], 'Player')

    pd.to_pickle(Y, processed_data_path + 'targets.pkl')

    del Y

Wall time: 5min 10s


#### PlayerBoxScores

In [24]:
%%time
if prepare_data:
    # load the data
    df_playerBoxScores = unpack_data(df_train, dfs = ['playerBoxScores'])['playerBoxScores']

    # Cria o dataset de jogos
    cols = {
        # columns related to other dimensions
        'gamePk': 'IdGame',
        'gameDate': 'DtGame',
        'gameTimeUTC': 'DtGameUTC',
        'playerId': 'IdPlayer',
        'teamId': 'IdTeam',
        'jerseyNum': 'NuJersey',
        'positionCode': 'CdPosition',
        # suggested column
        'strikeOutsPitching': 'NuStrikeOutsPitching',
    }  
    # numeric columns
    for numeric_col in list(df_playerBoxScores.columns[12:]):
        # skip the columns that contains data about pitching due the amount of Nan values
        if 'Pitching' not in numeric_col:
            cols[numeric_col] = 'Nu' + numeric_col[0].upper() + numeric_col[1:] + '_Player'

    df_playerBoxScores = df_playerBoxScores[list(cols)]
    df_playerBoxScores.columns = list(cols.values())

    # df_playerBoxScores['DtGame'] = df_playerBoxScores['DtGame'] + " 00:00:00"
    df_playerBoxScores['DtGame'] = df_playerBoxScores['DtGame'].astype('datetime64[ns]')
    df_playerBoxScores['DtGameUTC'] = df_playerBoxScores['DtGameUTC'].astype('datetime64[ns]')
    df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdPlayer'], 'Player')
    df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdTeam'], 'Team')
    df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdGame'], 'Game')

    # Salva o dataset
    pd.to_pickle(df_playerBoxScores, processed_data_path + 'playerBoxScores.pkl')

    del df_playerBoxScores

Wall time: 5min 57s
Compiler : 1.29 s
Parser   : 647 ms


#### Team Box Scores

In [25]:
%%time
if prepare_data:
    df_tbs = unpack_data(df_train, dfs = ['teamBoxScores'])['teamBoxScores']

    cols = {
        'gameDate': 'DtGame',
        'teamId': 'IdTeam',
        'home': 'FlgHome',
        'gamePk': 'IdGame',
    }

    for numeric_col in list(df_tbs.columns[4:]):
        cols[numeric_col] = 'Nu' + numeric_col[0].upper() + numeric_col[1:] + '_Team'

    df_tbs = df_tbs[list(cols)]
    df_tbs.columns = list(cols.values())

    df_tbs['DtGame'] = df_tbs['DtGame'].astype('datetime64[ns]')
    df_tbs = create_id(df_tbs, ['DtGame', 'IdTeam'], 'Team')
    df_tbs = create_id(df_tbs, ['DtGame', 'IdGame'], 'Game')
    df_tbs['FlgHome'] = df_tbs['FlgHome'].astype('bool')

    pd.to_pickle(df_tbs, processed_data_path + 'teamBoxScores.pkl')

    del df_tbs

Wall time: 4.34 s


#### Games

In [26]:
%%time
if prepare_data:
    df_games = unpack_data(df_train, dfs = ['games'])['games']

    cols = {
        'gamePk': 'IdGame',
        'gameType': 'CdGameType',
        'season': 'NuSeason',
        'gameDate': 'DtGame',
        'codedGameState': 'CdGameState',
        'detailedGameState': 'CdGameState',
        'isTie': 'FlgTie',
        'gameNumber': 'NuGame',
        'doubleHeader': 'CdDoubleHeader',
        'dayNight': 'CdDayNight',
        'scheduledInnings': 'NuScheduledInnings',
        'gamesInSeries': 'NuGamesInSeries',
        'homeId': 'IdHomeTeam',
        'homeWins': 'NuWinsHomeTeam',
        'homeLosses': 'NuLossesHomeTeam',
        'homeWinPct': 'NuWinPctHomeTeam',
        'homeWinner': 'FlgWinnerHomeTeam',
        'homeScore': 'NuScoreHomeTeam',
        'awayId': 'IdAwayTeam',
        'awayWins': 'NuWinsAwayTeam',
        'awayLosses': 'NuLossesAwayTeam',
        'awayWinPct': 'NuWinPctAwayTeam',
        'awayWinner': 'FlgWinnerAwayTeam',
        'awayScore': 'NuScoreAwayTeam',
    }

    df_games = df_games[list(cols)]
    df_games.columns = list(cols.values())

    df_games['DtGame'] = df_games['DtGame'].astype('datetime64[ns]')
    df_games = create_id(df_games, ['DtGame', 'IdGame'], 'Game')

    pd.to_pickle(df_games, processed_data_path + 'games.pkl')

    del df_games

Wall time: 4.45 s
Compiler : 370 ms


### Tratamento dos dados

#### Carregando os dados

In [37]:
%%time
if prepare_data:
    df_targets = pd.read_pickle(processed_data_path + 'targets.pkl')
    df_pbs = pd.read_pickle(processed_data_path + 'playerBoxScores.pkl')
    df_tbs = pd.read_pickle(processed_data_path + 'teamBoxScores.pkl')
    df_g = pd.read_pickle(processed_data_path + 'games.pkl')

#### Reduzindo memória

In [39]:
%%time
if prepare_data:
    df_targets = reduce_mem_usage(df_targets);
    df_pbs = reduce_mem_usage(df_pbs);
    df_tbs = reduce_mem_usage(df_tbs);

Memory usage of dataframe is 71.99 MB
Memory usage after optimization is: 71.99 MB
Decreased by 0.0%
Memory usage of dataframe is 33.74 MB
Memory usage after optimization is: 33.74 MB
Decreased by 0.0%
Memory usage of dataframe is 1.43 MB
Memory usage after optimization is: 1.43 MB
Decreased by 0.0%


#### Shifting

In [29]:
%%time
if prepare_data:
    sort_df(df_targets)
    df_train = shift_targets(df_targets, shift_vals=[1, 2, 3, 4, 5, 6, 7])

Wall time: 3min 40s


### Feature Engineering

#### Datetime

In [30]:
%%time
if prepare_data:
    # Transform the datetime col into new features
    df_train['DtYear'] = df_train['Dt'].dt.year
    df_train['DtMonth'] = df_train['Dt'].dt.month
    df_train['DtDay'] = df_train['Dt'].dt.day
    df_train['DtDayOfWeek'] = df_train['Dt'].dt.dayofweek
    df_train['DtDayOfYear'] = df_train['Dt'].dt.dayofyear
    df_train['DtQuarter'] = df_train['Dt'].dt.quarter
    # get the hour and minute from the PBS
    df_pbs['DtHour'] = df_pbs['DtGameUTC'].dt.hour
    df_pbs['DtMinute'] = df_pbs['DtGameUTC'].dt.minute
    df_pbs['DtHour'][df_pbs['DtHour'] == 0] = 24

### Unindo os datasets

#### Player Box Scores

In [31]:
%%time
if prepare_data:
    df_train = pd.merge(df_train, df_pbs, on=['IdDtPlayer'], how='left')

# Substitui os valores Nan das seguintes colunas por 0
    f = [c for c in df_train.columns if c not in ['IdGame', 'DtGame', 'DtGameUTC',
    'IdPlayer_y','IdTeam','NuJersey','CdPosition', 'target1_shift_1', 'target2_shift_1',
    'target3_shift_1','target1_shift_2', 'target3_shift_2', 'target4_shift_2',
    'target1_shift_3','target2_shift_3','target3_shift_3','target4_shift_3',
    'target1_shift_4','target2_shift_4','target3_shift_4','target4_shift_4',
    'target1_shift_5','target2_shift_5','target3_shift_5','target4_shift_5',
    'target1_shift_6','target2_shift_6','target3_shift_6','target4_shift_6',
    'target1_shift_7','target2_shift_7','target3_shift_7','target4_shift_7']]

    df_train[f] = df_train[f].fillna(0)        

    # Remove os na das seguintes colunas
    df_train = df_train.dropna(subset=[             
        'target1_shift_1', 'target2_shift_1', 'target3_shift_1', 'target1_shift_2',
        'target3_shift_2', 'target4_shift_2', 'target1_shift_3', 'target2_shift_3',
        'target3_shift_3', 'target4_shift_3', 'target1_shift_4', 'target2_shift_4',
        'target3_shift_4', 'target4_shift_4', 'target1_shift_5', 'target3_shift_5',
        'target4_shift_5', 'target1_shift_6', 'target2_shift_6', 'target3_shift_6',
        'target4_shift_6', 'target1_shift_7', 'target2_shift_7', 'target3_shift_7',
        'target4_shift_7'])

    df_train.rename(columns={'IdPlayer_x': 'IdPlayer'}, inplace=True)

#### Team Box Scores

In [32]:
%%time
if prepare_data:
    df_train = pd.merge(df_train, df_tbs, on = ['IdDtTeam'], how = 'left')

    # Substitui os valores Nan das seguintes colunas por 0
    f = [
        'FlgHome','NuFlyOuts_Team', 'NuGroundOuts_Team', 'NuRunsScored_Team',
        'NuDoubles_Team', 'NuTriples_Team', 'NuHomeRuns_Team', 'NuStrikeOuts_Team',
        'NuBaseOnBalls_Team', 'NuIntentionalWalks_Team', 'NuHits_Team', 'NuHitByPitch_Team',
        'NuAtBats_Team', 'NuCaughtStealing_Team', 'NuStolenBases_Team', 'NuGroundIntoDoublePlay_Team',
        'NuGroundIntoTriplePlay_Team', 'NuPlateAppearances_Team', 'NuTotalBases_Team', 'NuRbi_Team',
        'NuLeftOnBase_Team', 'NuSacBunts_Team', 'NuSacFlies_Team', 'NuCatchersInterference_Team',
        'NuPickoffs_Team', 'NuAirOutsPitching_Team', 'NuGroundOutsPitching_Team', 'NuRunsPitching_Team',
        'NuDoublesPitching_Team', 'NuTriplesPitching_Team', 'NuHomeRunsPitching_Team',
        'NuStrikeOutsPitching_Team', 'NuBaseOnBallsPitching_Team', 'NuIntentionalWalksPitching_Team',
        'NuHitsPitching_Team', 'NuHitByPitchPitching_Team', 'NuAtBatsPitching_Team',
        'NuCaughtStealingPitching_Team', 'NuStolenBasesPitching_Team', 'NuInningsPitched_Team',
        'NuEarnedRuns_Team', 'NuBattersFaced_Team', 'NuOutsPitching_Team', 'NuHitBatsmen_Team',
        'NuBalks_Team', 'NuWildPitches_Team', 'NuPickoffsPitching_Team', 'NuRbiPitching_Team',
        'NuInheritedRunners_Team', 'NuInheritedRunnersScored_Team', 'NuCatchersInterferencePitching_Team',
        'NuSacBuntsPitching_Team', 'NuSacFliesPitching_Team'
    ]

    df_train[f] = df_train[f].fillna(0)        

    df_train = df_train.rename(columns={'IdDtGame_y': 'IdDtGame'})

#### Games

In [33]:
%%time
if prepare_data:
    df_train = pd.merge(df_train, df_g, on = ['IdDtGame'], how = 'left')

    f = [
        'NuSeason', 'NuGame',
        'NuScheduledInnings', 'NuGamesInSeries', 'NuWinsHomeTeam',
        'NuLossesHomeTeam', 'NuWinPctHomeTeam', 'NuScoreHomeTeam',
        'NuWinsAwayTeam', 'NuLossesAwayTeam', 'NuWinPctAwayTeam', 'NuScoreAwayTeam'
    ]
    df_train[f] = df_train[f].fillna(0)        

    df_train = pd.get_dummies(df_train, columns = ['CdPosition', "CdGameType", "CdGameState", "CdDoubleHeader", 
                                        "CdDayNight", "FlgWinnerHomeTeam", "FlgWinnerAwayTeam",'FlgTie', 'FlgHome'])

### Drop colunas

In [34]:
# Dropa colunas com vários valores Nan
%%time
if prepare_data:
    df_train.drop([
    'IdGame_x', 'DtGame_x', 'DtGameUTC', 'IdPlayer_y',
    'IdTeam_x', 'IdTeam_y', "NuGameTimeUTC_Team", "IdDtGame", "DtGame", "IdGame",
    'IdGame_y', 'NuJersey', "DtGame_y", "IdHomeTeam", "IdAwayTeam", "IdDtPlayer",
    "IdDtTeam", "IdDtGame_x"], axis = 1, inplace = True)

### Salvando datasets

In [35]:
%%time
if prepare_data:
    df_train_reduced = reduce_mem_usage(df_train[df_train.columns.difference([TARGET_COLS])]);
    df_train_reduced[TARGET_COLS]= df_train[TARGET_COLS]

    df_train = df_train_reduced.copy()

    del df_train_reduced

    df_targets.to_pickle(processed_data_path + 'targets.pkl')
    df_pbs.to_pickle(processed_data_path + 'playerBoxScores.pkl')
    df_tbs.to_pickle(processed_data_path + 'teamBoxScores.pkl')
    df_g.to_pickle(processed_data_path + 'games.pkl')

    df_train.to_pickle(processed_data_path + 'train.pkl')
    
    del df_train, df_pbs, df_tbs, df_g, df_targets

Memory usage of dataframe is 2190.92 MB
Memory usage after optimization is: 981.16 MB
Decreased by 55.2%


## Modelos

### Lê os dados

In [4]:
df_train = pd.read_pickle(processed_dataset_path)

### Preparação e Feature Selection

In [5]:
# Arruma float16 para conseguir rodar baselines
df_train[TARGET_COLS] = df_train[TARGET_COLS].astype('float', copy = True)

In [6]:
if feature_selection:
    
    df_train = df_train.drop(columns = ['IdPlayer'])

    train, test, val = train_test_split(df_train)

    train['Dt'] = pd.to_numeric(pd.to_datetime(train['Dt']))
    test['Dt']= pd.to_numeric(pd.to_datetime(test['Dt']))
    val['Dt'] = pd.to_numeric(pd.to_datetime(val['Dt']))

    # defines the model and create the multioutput regressor
    model = GradientBoostingRegressor(random_state=RANDOM_SEED, loss = 'absolute_error')
    model_name = 'GradientBoostingRegressor | MultiOutput'
    regressor = MultiOutputRegressor(model, n_jobs=-1)

    # fit the model
    regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

    # make predictions
    y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

    for i in range(len(TARGET_COLS)):
        plt.bar(range(len(regressor.estimators_[i].feature_importances_)), regressor.estimators_[i].feature_importances_)
        plt.show()
        for i in range(len(TARGET_COLS)):
            plt.bar(range(len(regressor.estimators_[i].feature_importances_)), regressor.estimators_[i].feature_importances_)

    selected_features = pd.array([])
    for i in range(len(TARGET_COLS)):
        features = np.where(regressor.estimators_[i].feature_importances_ >= 0.005)
        selected_features = np.append(selected_features, train.drop(columns=TARGET_COLS).columns[features])
        selected_features.extend(list(np.where(regressor.estimators_[i].feature_importances_ > 0.0001)))
    np.where(regressor.estimators_[0].feature_importances_ > 0.0)
    len(np.unique(selected_features))

    with open('features.pkl', 'wb') as file:
         pickle.dump(selected_features, file)
    pickle.load(open('features.pkl', 'rb'))
    
else:
    with open('features.pkl', 'rb') as f:
        selected_features = pkl.load(f)
    df_train = df_train[np.append(np.append(TARGET_COLS, selected_features), 'IdPlayer')]

### Train Test Split

In [7]:
train, test, val = train_test_split(df_train)

train['Dt'] = pd.to_numeric(pd.to_datetime(train['Dt']))
test['Dt']= pd.to_numeric(pd.to_datetime(test['Dt']))
val['Dt'] = pd.to_numeric(pd.to_datetime(val['Dt']))

## Modelagem

In [8]:
df_results = pd.DataFrame(columns = ['model', 'target1 | AMAE', 'target2 | AMAE', 'target3 | AMAE', 'target4 | AMAE', 'average | AMAE', 'average | MAE'])

### Decisão de métrica

Texto sobre a decisão de métrica (não tem necessidade de incluir o código testando as métricas, acho)

### Baseline

In [11]:
%%time
# naive
naive = train[train['Dt']=='2021-04-30'].set_index('IdPlayer')[TARGET_COLS]

# media
media = train[TARGET_COLS].mean()
media_por_jogador = train.groupby('IdPlayer')[TARGET_COLS].mean()

# mediana
mediana = train[TARGET_COLS].median()
mediana_por_jogador = train.groupby('IdPlayer')[TARGET_COLS].median()

summary = pd.DataFrame()
temp = pd.DataFrame()

for target in TARGET_COLS:
    
    y_true = test[target]
   
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
  
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    temp.loc['Média',target]  = mean_absolute_error(y_true,media_pred)
    temp.loc['Média por Jogador',target]  = mean_absolute_error(y_true,mediapj_pred)
    temp.loc['Mediana',target]  = mean_absolute_error(y_true,mediana_pred)
    temp.loc['Mediana por Jogador',target]  = mean_absolute_error(y_true,medianapj_pred)
    #temp.loc['Naive',target]  = mean_absolute_error(y_true,naive_pred)
    summary.loc['Média',target + " | AMAE"]  =  AMAE(y_true,media_pred,show = False)
    summary.loc['Média por Jogador',target + " | AMAE"]  =  AMAE(y_true,mediapj_pred,show = False)
    summary.loc['Mediana',target + " | AMAE"]  =  AMAE(y_true,mediana_pred,show = False)
    summary.loc['Mediana por Jogador',target + " | AMAE"]  =  AMAE(y_true,medianapj_pred,show = False)
    #summary.loc['Naive',target + " | AMAE"]  =  AMAE(y_true,naive_pred,show = False)
    
summary['average | MAE'] = temp.mean(axis=1)
summary['average | AMAE'] = summary.mean(axis=1)

summary = summary.reset_index()
summary = summary.rename(columns = {"index": "model"})

df_results = df_results.append(summary, ignore_index = True)
df_results

Wall time: 8min 32s
Compiler : 1.07 s
Parser   : 158 ms


Unnamed: 0,model,target1 | AMAE,target2 | AMAE,target3 | AMAE,target4 | AMAE,average | AMAE,average | MAE
0,Média,6843.978819,6398.583724,7121.952787,6823.816072,5437.987665,1.606922
1,Média por Jogador,6414.161377,5524.151229,6761.46272,6119.308259,4964.079988,1.316354
2,Mediana,6900.77556,6587.051921,7190.445719,6912.163273,5518.29097,1.018378
3,Mediana por Jogador,6833.931392,6005.371015,7149.41096,6419.212379,5281.77421,0.945304
4,Média,6843.978819,6398.583724,7121.952787,6823.816072,5437.987665,1.606922
5,Média por Jogador,6414.161377,5524.151229,6761.46272,6119.308259,4964.079988,1.316354
6,Mediana,6900.77556,6587.051921,7190.445719,6912.163273,5518.29097,1.018378
7,Mediana por Jogador,6833.931392,6005.371015,7149.41096,6419.212379,5281.77421,0.945304


### Modelos Lineares

In [None]:
df_train = df_train.drop(columns = ['IdPlayer'])

#### LASSO

In [None]:
%%time
from sklearn.linear_model import Lasso

# defines the model and create the multioutput regressor
model = Lasso(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Lasso | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

#### Ridge

In [None]:
%%time
from sklearn.linear_model import Ridge

# defines the model and create the multioutput regressor
model = Ridge(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'Ridge | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)

# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

#### ElasticNet

In [None]:
%%time
from sklearn.linear_model import ElasticNet

# defines the model and create the multioutput regressor
model = ElasticNet(alpha=0.1, random_state=RANDOM_SEED)
model_name = 'ElasticNet | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)


# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

### Tree Models

#### Gradient Boosting

In [None]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

# defines the model and create the multioutput regressor
model = GradientBoostingRegressor(random_state=RANDOM_SEED, loss = 'absolute_error')
model_name = 'GradientBoostingRegressor | MultiOutput'
regressor = MultiOutputRegressor(model, n_jobs=-1)

# fit the model
regressor.fit(train.drop(columns=TARGET_COLS), train[TARGET_COLS])

# make predictions
y_pred = pd.DataFrame(regressor.predict(test.drop(columns=TARGET_COLS)), columns=TARGET_COLS)


# evaluate the model
mae = evaluate_mae(test[TARGET_COLS], y_pred)
amae = evaluate_amae(test[TARGET_COLS], y_pred)
result_dict = {**amae, **{f'average | MAE': mae['average | MAE']}}

# save the results
df_results = df_results.append({'model': model_name, **result_dict}, ignore_index=True)
df_results[df_results['model'] == model_name]
df_results

### Multi Output Chaining

[note] adicionar aqui o teste falho de multi output chaining, explicar porque deu errado e mostrar que os resultados de um target não dependem dos demais. Além disso adicionar a correlação.

###### Multi Output Regressor e Modelos Finais