# Data Preparation

## Pre Processing Datasets

Notebook que contém o código para o pré-processamento dos dados. Nenhum tratamento/higienização é realizado neste notebook; apenas a extração dos dados e a transformação dos datasets em `.pkl` são realizadas aqui.

Os dados "crus" estão localizados em `/src/data/raw-data` e os dados tratados estão salvos em `/src/data/processed-data`.

## Initial Setup

In [1]:
# Imports
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

import warnings
warnings.filterwarnings('ignore')


In [2]:
# Global variables
# Directories holding the raw inputs and the processed outputs
raw_data_path = '../data/raw-data/'
processed_data_path = '../data/processed-data/'

# Map each dataset label to the full path of its raw CSV file
dataset_names = {}
for key, file_name in (
    ('Awards', 'awards.csv'),
    ('Example', 'example_test.csv'),
    ('Players', 'players.csv'),
    ('Seasons', 'seasons.csv'),
    ('Teams', 'teams.csv'),
    ('Train', 'train_updated.csv'),
):
    dataset_names[key] = raw_data_path + file_name
dataset_names

{'Awards': '../data/raw-data/awards.csv',
 'Example': '../data/raw-data/example_test.csv',
 'Players': '../data/raw-data/players.csv',
 'Seasons': '../data/raw-data/seasons.csv',
 'Teams': '../data/raw-data/teams.csv',
 'Train': '../data/raw-data/train_updated.csv'}

## Auxiliary Functions

In [3]:
# Helper functions for loading the data
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text to parse. Anything for which ``pd.isna`` is true
        (``None``, ``NaN``) is treated as "no data".

    Returns
    -------
    pandas.DataFrame
        The parsed records, or an empty DataFrame for a missing value.
    """
    from io import StringIO

    if pd.isna(json_str):
        return pd.DataFrame()
    # Recent pandas versions deprecate passing literal JSON text to
    # `read_json`; a file-like wrapper is accepted by old and new versions.
    return pd.read_json(StringIO(json_str))

def unpack_data(data, dfs=None, n_jobs=-1):
    """Unnest the JSON-encoded columns of ``data`` into flat DataFrames.

    Every cell of every selected column is parsed with ``unpack_json`` and
    the per-cell frames of a column are concatenated into one DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells are passed to ``unpack_json``.
    dfs : list of str, optional
        Column labels to unpack. When ``None`` every column of ``data``
        is unpacked.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``.

    Returns
    -------
    dict
        Maps each column label to its concatenated DataFrame. The index of
        each per-cell frame is kept as-is (``pd.concat`` is called without
        ``ignore_index``), so index values may repeat.
    """
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    # `iteritems` was removed in pandas 2.0; `items` is the equivalent that
    # exists in both older and newer versions.
    for name, column in data.items():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for _, item in column.items())
        unnested_dfs[name] = pd.concat(daily_dfs)
    return unnested_dfs

def create_id(df, id_cols, id_col_name, dt_col_name = 'Dt'):
    """Add a composite string id column to ``df`` and return ``df``.

    The new column is named ``'Id' + dt_col_name + id_col_name``. Its value
    for each row is that row's ``id_cols`` values, each cast to ``str`` and
    joined with ``'_'``. ``df`` is modified in place.
    """
    def _join_row(row):
        # Cast the row's values to str and join them with underscores
        return '_'.join(row.astype(str))

    # Row-wise on purpose: values are stringified per row, not per column
    composite_ids = df[id_cols].apply(_join_row, axis=1)
    df['Id' + dt_col_name + id_col_name] = composite_ids
    return df

## Loading the Train Updated dataset

In [4]:
%%time
# Read the raw training CSV registered under the 'Train' label
df_train = pd.read_csv(filepath_or_buffer=dataset_names['Train'])

CPU times: total: 37.5 s
Wall time: 1min 22s


In [5]:
# Keep a pickled copy of the training frame next to the raw CSV files
train_pickle_path = raw_data_path + 'train.pkl'
pd.to_pickle(df_train, train_pickle_path)

### Targets

In [6]:
%%time
# Build the targets dataset
# Unpack the nested engagement records from the training frame
Y = unpack_data(df_train, dfs=['nextDayPlayerEngagement'])['nextDayPlayerEngagement']

# Store the four targets as 32-bit floats and rename the date column
target_dtypes = dict.fromkeys(["target1", "target2", "target3", "target4"], np.float32)
Y = Y.astype(target_dtypes).rename(columns={'engagementMetricsDate': 'date'})

# Parse the date strings
Y['date'] = pd.to_datetime(Y['date'])

# Index by daily period and move every date back by one day, so the
# target dates match the feature dates
Y = Y.set_index('date').to_period('D')
Y.index = Y.index - 1
Y = Y.reset_index()

# Source column -> output column; also fixes the output column order
cols_Y = {
    'date': 'Dt',
    'playerId': 'IdPlayer',
    'target1': 'target1',
    'target2': 'target2',
    'target3': 'target3',
    'target4': 'target4'
}
selected_columns = list(cols_Y)
Y = Y[selected_columns]
Y.columns = list(cols_Y.values())

# Convert the period column to timestamps and add the date/player id
Y['Dt'] = Y['Dt'].astype('datetime64[ns]')
Y = create_id(Y, ['Dt', 'IdPlayer'], 'Player')

# Save the dataset and release the memory
pd.to_pickle(Y, processed_data_path + 'targets.pkl')

del Y

CPU times: total: 2min 26s
Wall time: 3min 31s


### Player Box Scores

In [7]:
%%time
# Unpack the nested player box score records from the training frame
df_playerBoxScores = unpack_data(df_train, dfs=['playerBoxScores'])['playerBoxScores']

# Build the player game dataset: source column -> output column
cols = {
    # columns related to other dimensions
    'gamePk': 'IdGame',
    'gameDate': 'DtGame',
    'gameTimeUTC': 'DtGameUTC',
    'playerId': 'IdPlayer',
    'teamId': 'IdTeam',
    'jerseyNum': 'NuJersey',
    'positionCode': 'CdPosition',
    # suggested column
    'strikeOutsPitching': 'NuStrikeOutsPitching',
}
# Columns from position 12 onwards become Nu<Name>_Player. Columns with
# 'Pitching' in their name are skipped (the original author cites the
# amount of NaN values in them).
cols.update({
    source_col: 'Nu' + source_col[0].upper() + source_col[1:] + '_Player'
    for source_col in df_playerBoxScores.columns[12:]
    if 'Pitching' not in source_col
})

# Keep only the mapped columns, under their output names
selected_columns = list(cols)
df_playerBoxScores = df_playerBoxScores[selected_columns]
df_playerBoxScores.columns = list(cols.values())

# Convert both date columns to timestamps
for date_col in ('DtGame', 'DtGameUTC'):
    df_playerBoxScores[date_col] = df_playerBoxScores[date_col].astype('datetime64[ns]')

# Add the composite ids: date + player, date + team, date + game
for id_cols, id_suffix in (
    (['DtGame', 'IdPlayer'], 'Player'),
    (['DtGame', 'IdTeam'], 'Team'),
    (['DtGame', 'IdGame'], 'Game'),
):
    df_playerBoxScores = create_id(df_playerBoxScores, id_cols, id_suffix)

# Save the dataset and release the memory
pd.to_pickle(df_playerBoxScores, processed_data_path + 'playerBoxScores.pkl')

del df_playerBoxScores

CPU times: total: 34.8 s
Wall time: 59.6 s


### Team Box Scores

In [8]:
%%time
# Unpack the nested team box score records from the training frame
df_tbs = unpack_data(df_train, dfs=['teamBoxScores'])['teamBoxScores']

# Source column -> output column for the identifying fields
cols = {
    'gameDate': 'DtGame',
    'teamId': 'IdTeam',
    'home': 'FlgHome',
    'gamePk': 'IdGame',
}
# Every column from position 4 onwards becomes Nu<Name>_Team
cols.update({
    stat_col: 'Nu' + stat_col[0].upper() + stat_col[1:] + '_Team'
    for stat_col in df_tbs.columns[4:]
})

# Keep only the mapped columns, under their output names
selected_columns = list(cols)
df_tbs = df_tbs[selected_columns]
df_tbs.columns = list(cols.values())

# Convert the game date to timestamps and add the composite ids
df_tbs['DtGame'] = df_tbs['DtGame'].astype('datetime64[ns]')
for id_cols, id_suffix in (
    (['DtGame', 'IdTeam'], 'Team'),
    (['DtGame', 'IdGame'], 'Game'),
):
    df_tbs = create_id(df_tbs, id_cols, id_suffix)

# Store the home indicator as a boolean
df_tbs['FlgHome'] = df_tbs['FlgHome'].astype('bool')

# Save the dataset and release the memory
pd.to_pickle(df_tbs, processed_data_path + 'teamBoxScores.pkl')

del df_tbs

CPU times: total: 2.03 s
Wall time: 4.26 s


### Games

In [9]:
%%time
# Unpack the nested game records, then keep the single resulting frame
df_games = unpack_data(df_train, dfs=['games'])
df_games = df_games['games']

CPU times: total: 1.58 s
Wall time: 3.48 s


In [11]:
%%time

# Source column -> output column; also fixes the output column order
cols = {
    'gamePk': 'IdGame',
    'gameType': 'CdGameType',
    'season': 'NuSeason',
    'gameDate': 'DtGame',
    'codedGameState': 'CdGameState',
    # Previously mapped to 'CdGameState' as well, which produced two columns
    # with the same label; it now gets its own output name.
    'detailedGameState': 'CdDetailedGameState',
    'isTie': 'FlgTie',
    'gameNumber': 'NuGame',
    'doubleHeader': 'CdDoubleHeader',
    'dayNight': 'CdDayNight',
    'scheduledInnings': 'NuScheduledInnings',
    'gamesInSeries': 'NuGamesInSeries',
    'homeId': 'IdHomeTeam',
    'homeWins': 'NuWinsHomeTeam',
    'homeLosses': 'NuLossesHomeTeam',
    'homeWinPct': 'NuWinPctHomeTeam',
    'homeWinner': 'FlgWinnerHomeTeam',
    'homeScore': 'NuScoreHomeTeam',
    'awayId': 'IdAwayTeam',
    'awayWins': 'NuWinsAwayTeam',
    'awayLosses': 'NuLossesAwayTeam',
    'awayWinPct': 'NuWinPctAwayTeam',
    'awayWinner': 'FlgWinnerAwayTeam',
    'awayScore': 'NuScoreAwayTeam',
}

# Keep only the mapped columns, under their output names
df_games = df_games[list(cols)]
df_games.columns = list(cols.values())

# Convert the game date to timestamps and add the date + game composite id
df_games['DtGame'] = df_games['DtGame'].astype('datetime64[ns]')
df_games = create_id(df_games, ['DtGame', 'IdGame'], 'Game')

# Save the dataset and release the memory
pd.to_pickle(df_games, processed_data_path + 'games.pkl')

del df_games

## Loading the Other datasets