> Fundação Getúlio Vargas - RJ <br>
> Escola de Matemática Aplicada (EMAp) <br>
> Graduação em Ciência de Dados e Inteligência Artificial <br>
> Alunos: Gianlucca Devigili e Maisa de O. Fraiz <br>
# Projetos em Ciência de Dados - A2

## Introdução

## Instruções de Execução

In [1]:
# Variáveis de configuração
# (#1) Variável que define se o dataset será carregado de um .csv ou de um .pkl
load_from_csv = False

# Caminho para o dataset
raw_data_path = '../data/raw-data/'
dataset_path = raw_data_path + 'train_updated.csv'

# Caminho onde serão salvos os dados processados
processed_data_path = '../data/processed-data/'

# Prepare data
# (#4) Variável que define se o dataset será processado ou se será carregado de um .pkl
prepare_data = True

save_files = True

## Setup Inicial

In [2]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

from pandas.api.types import is_datetime64_any_dtype as is_datetime


# disable warnings
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'


In [3]:
# variáveis globais

PROCESSED_DATA_PATH = '../data/processed-data/'
MODEL_PATH = '../models/trained-models/'

TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

RANDOM_SEED = 42

TEST_SPLIT_DATE = '2021-04-30'

## Funções auxiliares

In [4]:
# Funções auxiliares para carregar e tratar os dados
def unpack_json(json_str):
    return pd.DataFrame() if pd.isna(json_str) else pd.read_json(json_str)

def unpack_data(data, dfs=None, n_jobs=-1):
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    for name, column in data.iteritems():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for date, item in column.iteritems())
        df = pd.concat(daily_dfs)
        unnested_dfs[name] = df
    return unnested_dfs

def create_id(df, id_cols, id_col_name, dt_col_name = 'Dt'):
    df['Id' + dt_col_name + id_col_name] = df[id_cols].apply(lambda x: '_'.join(x.astype(str)), axis=1)
    return df


def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object and is_datetime(df[col]) == False and col_type != 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif is_datetime(df[col]) == True:
            df[col] = df[col].astype('datetime64[ns]')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

# Funções auxiliares para o pré-processamento dos dados
def sort_df(df: pd.DataFrame, columns: list = ['IdPlayer', 'Dt']) -> None:
    """Sort the dataframe by the columns passed as argument.
    
    Args:
        df (pd.DataFrame): Dataframe to be sorted.
        columns (list, optional): Columns to sort the dataframe. Defaults to ['IdPlayer', 'Dt'].
        
        Returns:
            None
    """
    df.sort_values(by=columns, inplace=True)
    # reset index
    df.reset_index(drop=True, inplace=True)


def shift_targets(df, shift_vals: list = [1, 2, 3, 4, 5, 6, 7, 14, 30]):
    """Shift the targets by the values passed as argument.

    Args:
        df (pd.DataFrame): Dataframe to be shifted.
        shift_vals (list, optional): Values to shift the targets. Defaults to [1, 2, 3, 4, 5, 6, 7, 14, 30].

    Returns:
        pd.DataFrame: Dataframe with the shifted targets.
    """
    df_aux = pd.DataFrame()
    # Iterate over players to make the shift only using the player data
    for player in df['IdPlayer'].unique():
        df_player = df[df['IdPlayer'] == player]
        # Iterate over the pre-defined shift values
        for shift_val in shift_vals:
            # Iterate over the targets
            for target in TARGET_COLS:
                # Make the shift
                df_player[f'{target}_shift_{shift_val}'] = df_player[target].shift(shift_val)
        # Concatenate the player data with the rest of the data
        df_aux = pd.concat([df_aux, df_player], axis=0)
        # Remove the player data from memory
        del df_player
    # df.dropna(inplace=True)
    return df_aux


def train_test_split(
    df: pd.DataFrame
    ,test_split_date: str = TEST_SPLIT_DATE
    ):
    """Split the dataframe into train and test sets.

    Args:
        df (pd.DataFrame): Dataframe to be split.
        test_split_date (str, optional): Date to split the dataframe. Defaults to TEST_SPLIT_DATE.
    """

    train = df[(df.Dt <= "2021-01-31") & (df.Dt >= "2018-01-01")] 
    val = df[(df.Dt <= "2021-04-30") & (df.Dt >= "2021-02-01")] 
    test = df[(df.Dt <= "2021-07-31") & (df.Dt >= "2021-05-01")]
    # train.to_csv('train.csv', index=None)
    # val.to_csv('validation.csv', index=None) 
    # test.to_csv('test.csv', index=None) 

    return train, test, val


def x_y_split(df: pd.DataFrame, target_cols: list = TARGET_COLS):
    """Split the dataframe into x and y sets.

    Args:
        df (pd.DataFrame): Dataframe to be split.
    """
    y = df[target_cols]
    x = df.drop(target_cols, axis=1)
    return x, y

## Carga e Tratamento de Dados

In [5]:
# dataframes
dataset_names = {
    'Awards': 'awards.csv', 
    'Example': 'example_test.csv', 
    'Players': 'players.csv',
    'Seasons': 'seasons.csv', 
    'Teams': 'teams.csv', 
    'Train': 'train_updated.csv'
}
for key in dataset_names:
    dataset_names[key] = raw_data_path + dataset_names[key]

### Carregando o dataset Train Updated

In [23]:
%%time
if load_from_csv:
    df_train = pd.read_csv(dataset_names['Train'])
    pd.to_pickle(df_train, raw_data_path + 'train.pkl')
else:
    df_train = pd.read_pickle(raw_data_path + 'train.pkl')

Wall time: 2min 18s


#### Targets

In [7]:
%%time
if prepare_data:

    # criação do dataset de targets

    # unpack the data
    Y = unpack_data(df_train, dfs = ['nextDayPlayerEngagement'])['nextDayPlayerEngagement']

    # change datatypes
    Y = Y.astype({name: np.float32 for name in ["target1", "target2", "target3", "target4"]})

    # match target dates to feature dates and create date index
    Y = Y.rename(columns={'engagementMetricsDate': 'date'})

    # change datatypes
    Y['date'] = pd.to_datetime(Y['date'])

    # reset index
    Y = Y.set_index('date').to_period('D')
    Y.index = Y.index - 1
    Y = Y.reset_index()

    # rename and select columns
    cols_Y = {
        'date': 'Dt',
        'playerId': 'IdPlayer',
        'target1': 'target1',
        'target2': 'target2',
        'target3': 'target3',
        'target4': 'target4'
    }
    Y = Y[list(cols_Y)]
    Y.columns = list(cols_Y.values())
    Y['Dt'] = Y['Dt'].astype('datetime64[ns]')
    Y = create_id(Y, ['Dt', 'IdPlayer'], 'Player')

    pd.to_pickle(Y, processed_data_path + 'targets.pkl')

    del Y

Wall time: 5min 10s


#### PlayerBoxScores

In [24]:
%%time
if prepare_data:
    # load the data
    df_playerBoxScores = unpack_data(df_train, dfs = ['playerBoxScores'])['playerBoxScores']

    # Cria o dataset de jogos
    cols = {
        # columns related to other dimensions
        'gamePk': 'IdGame',
        'gameDate': 'DtGame',
        'gameTimeUTC': 'DtGameUTC',
        'playerId': 'IdPlayer',
        'teamId': 'IdTeam',
        'jerseyNum': 'NuJersey',
        'positionCode': 'CdPosition',
        # suggested column
        'strikeOutsPitching': 'NuStrikeOutsPitching',
    }  
    # numeric columns
    for numeric_col in list(df_playerBoxScores.columns[12:]):
        # skip the columns that contains data about pitching due the amount of Nan values
        if 'Pitching' not in numeric_col:
            cols[numeric_col] = 'Nu' + numeric_col[0].upper() + numeric_col[1:] + '_Player'

    df_playerBoxScores = df_playerBoxScores[list(cols)]
    df_playerBoxScores.columns = list(cols.values())

    # df_playerBoxScores['DtGame'] = df_playerBoxScores['DtGame'] + " 00:00:00"
    df_playerBoxScores['DtGame'] = df_playerBoxScores['DtGame'].astype('datetime64[ns]')
    df_playerBoxScores['DtGameUTC'] = df_playerBoxScores['DtGameUTC'].astype('datetime64[ns]')
    df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdPlayer'], 'Player')
    df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdTeam'], 'Team')
    df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdGame'], 'Game')

    # Salva o dataset
    pd.to_pickle(df_playerBoxScores, processed_data_path + 'playerBoxScores.pkl')

    del df_playerBoxScores

Wall time: 5min 57s
Compiler : 1.29 s
Parser   : 647 ms


#### Team Box Scores

In [25]:
%%time
if prepare_data:
    df_tbs = unpack_data(df_train, dfs = ['teamBoxScores'])['teamBoxScores']

    cols = {
        'gameDate': 'DtGame',
        'teamId': 'IdTeam',
        'home': 'FlgHome',
        'gamePk': 'IdGame',
    }

    for numeric_col in list(df_tbs.columns[4:]):
        cols[numeric_col] = 'Nu' + numeric_col[0].upper() + numeric_col[1:] + '_Team'

    df_tbs = df_tbs[list(cols)]
    df_tbs.columns = list(cols.values())

    df_tbs['DtGame'] = df_tbs['DtGame'].astype('datetime64[ns]')
    df_tbs = create_id(df_tbs, ['DtGame', 'IdTeam'], 'Team')
    df_tbs = create_id(df_tbs, ['DtGame', 'IdGame'], 'Game')
    df_tbs['FlgHome'] = df_tbs['FlgHome'].astype('bool')

    pd.to_pickle(df_tbs, processed_data_path + 'teamBoxScores.pkl')

    del df_tbs

Wall time: 4.34 s


#### Games

In [26]:
%%time
if prepare_data:
    df_games = unpack_data(df_train, dfs = ['games'])['games']

    cols = {
        'gamePk': 'IdGame',
        'gameType': 'CdGameType',
        'season': 'NuSeason',
        'gameDate': 'DtGame',
        'codedGameState': 'CdGameState',
        'detailedGameState': 'CdGameState',
        'isTie': 'FlgTie',
        'gameNumber': 'NuGame',
        'doubleHeader': 'CdDoubleHeader',
        'dayNight': 'CdDayNight',
        'scheduledInnings': 'NuScheduledInnings',
        'gamesInSeries': 'NuGamesInSeries',
        'homeId': 'IdHomeTeam',
        'homeWins': 'NuWinsHomeTeam',
        'homeLosses': 'NuLossesHomeTeam',
        'homeWinPct': 'NuWinPctHomeTeam',
        'homeWinner': 'FlgWinnerHomeTeam',
        'homeScore': 'NuScoreHomeTeam',
        'awayId': 'IdAwayTeam',
        'awayWins': 'NuWinsAwayTeam',
        'awayLosses': 'NuLossesAwayTeam',
        'awayWinPct': 'NuWinPctAwayTeam',
        'awayWinner': 'FlgWinnerAwayTeam',
        'awayScore': 'NuScoreAwayTeam',
    }

    df_games = df_games[list(cols)]
    df_games.columns = list(cols.values())

    df_games['DtGame'] = df_games['DtGame'].astype('datetime64[ns]')
    df_games = create_id(df_games, ['DtGame', 'IdGame'], 'Game')

    pd.to_pickle(df_games, processed_data_path + 'games.pkl')

    del df_games

Wall time: 4.45 s
Compiler : 370 ms


### Tratamento dos dados

#### Carregando os dados

In [27]:
df_targets = pd.read_pickle(processed_data_path + 'targets.pkl')
df_pbs = pd.read_pickle(processed_data_path + 'playerBoxScores.pkl')
df_tbs = pd.read_pickle(processed_data_path + 'teamBoxScores.pkl')
df_g = pd.read_pickle(processed_data_path + 'games.pkl')

#### Reduzindo memória

In [28]:
# Apenas o player box scores apresenta redução de memória
df_pbs = reduce_mem_usage(df_pbs);

Memory usage of dataframe is 95.55 MB
Memory usage after optimization is: 33.32 MB
Decreased by 65.1%


#### Shifting

In [29]:
%%time
sort_df(df_targets)
df_train = shift_targets(df_targets, shift_vals=[1, 2, 3, 4, 5, 6, 7])

Wall time: 3min 40s


### Feature Engineering

#### Datetime

In [30]:
# Transform the datetime col into new features
df_train['DtYear'] = df_train['Dt'].dt.year
df_train['DtMonth'] = df_train['Dt'].dt.month
df_train['DtDay'] = df_train['Dt'].dt.day
df_train['DtDayOfWeek'] = df_train['Dt'].dt.dayofweek
df_train['DtDayOfYear'] = df_train['Dt'].dt.dayofyear
df_train['DtQuarter'] = df_train['Dt'].dt.quarter
# get the hour and minute from the PBS
df_pbs['DtHour'] = df_pbs['DtGameUTC'].dt.hour
df_pbs['DtMinute'] = df_pbs['DtGameUTC'].dt.minute
df_pbs['DtHour'][df_pbs['DtHour'] == 0] = 24

### Unindo os datasets

#### Player Box Scores

In [31]:
df_train = pd.merge(df_train, df_pbs, on=['IdDtPlayer'], how='left')

# Substitui os valores Nan das seguintes colunas por 0
f = [c for c in df_train.columns if c not in ['IdGame', 'DtGame', 'DtGameUTC',
'IdPlayer_y','IdTeam','NuJersey','CdPosition', 'target1_shift_1', 'target2_shift_1',
'target3_shift_1','target1_shift_2', 'target3_shift_2', 'target4_shift_2',
'target1_shift_3','target2_shift_3','target3_shift_3','target4_shift_3',
'target1_shift_4','target2_shift_4','target3_shift_4','target4_shift_4',
'target1_shift_5','target2_shift_5','target3_shift_5','target4_shift_5',
'target1_shift_6','target2_shift_6','target3_shift_6','target4_shift_6',
'target1_shift_7','target2_shift_7','target3_shift_7','target4_shift_7']]

df_train[f] = df_train[f].fillna(0)        

# Remove os na das seguintes colunas
df_train = df_train.dropna(subset=[             
    'target1_shift_1', 'target2_shift_1', 'target3_shift_1', 'target1_shift_2',
    'target3_shift_2', 'target4_shift_2', 'target1_shift_3', 'target2_shift_3',
    'target3_shift_3', 'target4_shift_3', 'target1_shift_4', 'target2_shift_4',
    'target3_shift_4', 'target4_shift_4', 'target1_shift_5', 'target3_shift_5',
    'target4_shift_5', 'target1_shift_6', 'target2_shift_6', 'target3_shift_6',
    'target4_shift_6', 'target1_shift_7', 'target2_shift_7', 'target3_shift_7',
    'target4_shift_7'])

df_train.rename(columns={'IdPlayer_x': 'IdPlayer'}, inplace=True)

#### Team Box Scores

In [32]:
df_train = pd.merge(df_train, df_tbs, on = ['IdDtTeam'], how = 'left')

# Substitui os valores Nan das seguintes colunas por 0
f = [
    'FlgHome','NuFlyOuts_Team', 'NuGroundOuts_Team', 'NuRunsScored_Team',
    'NuDoubles_Team', 'NuTriples_Team', 'NuHomeRuns_Team', 'NuStrikeOuts_Team',
    'NuBaseOnBalls_Team', 'NuIntentionalWalks_Team', 'NuHits_Team', 'NuHitByPitch_Team',
    'NuAtBats_Team', 'NuCaughtStealing_Team', 'NuStolenBases_Team', 'NuGroundIntoDoublePlay_Team',
    'NuGroundIntoTriplePlay_Team', 'NuPlateAppearances_Team', 'NuTotalBases_Team', 'NuRbi_Team',
    'NuLeftOnBase_Team', 'NuSacBunts_Team', 'NuSacFlies_Team', 'NuCatchersInterference_Team',
    'NuPickoffs_Team', 'NuAirOutsPitching_Team', 'NuGroundOutsPitching_Team', 'NuRunsPitching_Team',
    'NuDoublesPitching_Team', 'NuTriplesPitching_Team', 'NuHomeRunsPitching_Team',
    'NuStrikeOutsPitching_Team', 'NuBaseOnBallsPitching_Team', 'NuIntentionalWalksPitching_Team',
    'NuHitsPitching_Team', 'NuHitByPitchPitching_Team', 'NuAtBatsPitching_Team',
    'NuCaughtStealingPitching_Team', 'NuStolenBasesPitching_Team', 'NuInningsPitched_Team',
    'NuEarnedRuns_Team', 'NuBattersFaced_Team', 'NuOutsPitching_Team', 'NuHitBatsmen_Team',
    'NuBalks_Team', 'NuWildPitches_Team', 'NuPickoffsPitching_Team', 'NuRbiPitching_Team',
    'NuInheritedRunners_Team', 'NuInheritedRunnersScored_Team', 'NuCatchersInterferencePitching_Team',
    'NuSacBuntsPitching_Team', 'NuSacFliesPitching_Team'
]

df_train[f] = df_train[f].fillna(0)        

df_train = df_train.rename(columns={'IdDtGame_y': 'IdDtGame'})

#### Games

In [33]:
df_train = pd.merge(df_train, df_g, on = ['IdDtGame'], how = 'left')

f = [
    'NuSeason', 'NuGame',
    'NuScheduledInnings', 'NuGamesInSeries', 'NuWinsHomeTeam',
    'NuLossesHomeTeam', 'NuWinPctHomeTeam', 'NuScoreHomeTeam',
    'NuWinsAwayTeam', 'NuLossesAwayTeam', 'NuWinPctAwayTeam', 'NuScoreAwayTeam'
]
df_train[f] = df_train[f].fillna(0)        

df_train = pd.get_dummies(df_train, columns = ['CdPosition', "CdGameType", "CdGameState", "CdDoubleHeader", 
                                    "CdDayNight", "FlgWinnerHomeTeam", "FlgWinnerAwayTeam",'FlgTie', 'FlgHome'])

### Drop colunas

In [34]:
# Dropa colunas com vários valores Nan
df_train.drop([
    'IdGame_x', 'DtGame_x', 'DtGameUTC', 'IdPlayer_y',
    'IdTeam_x', 'IdTeam_y', "NuGameTimeUTC_Team", "IdDtGame", "DtGame", "IdGame",
    'IdGame_y', 'NuJersey', "DtGame_y", "IdHomeTeam", "IdAwayTeam", "IdDtPlayer",
    "IdDtTeam", "IdDtGame_x"], axis = 1, inplace = True)

### Salvando datasets

In [35]:
df_train = reduce_mem_usage(df_train);

df_targets.to_pickle(processed_data_path + 'targets.pkl')
df_pbs.to_pickle(processed_data_path + 'playerBoxScores.pkl')
df_tbs.to_pickle(processed_data_path + 'teamBoxScores.pkl')
df_g.to_pickle(processed_data_path + 'games.pkl')

df_train.to_pickle(processed_data_path + 'train.pkl')

Memory usage of dataframe is 2190.92 MB
Memory usage after optimization is: 981.16 MB
Decreased by 55.2%


## Modelos

### Modelagem até a A1

### Multi Output Chaining
[note] adicionar aqui o teste falho de multi output chaining, explicar porque deu errado e mostrar que os resultados de um target não dependem dos demais. Além disso adicionar a correlação.

###### Multi Output Regressor e Modelos Finais