# Data Preparation

## Pre Processing Datasets

Notebook que contém o código para o pré-processamento dos dados. Nenhum tratamento/higienização é realizado neste notebook; apenas a extração dos dados e a transformação dos datasets em `.pkl` são realizadas aqui.

Os dados "crus" estão localizados em `/src/data/raw-data` e os dados pré-processados são salvos em `/src/data/processed-data`.

## Initial Setup

In [None]:
# Imports
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

In [1]:
# Global Variables
# file paths (relative paths; both end with a trailing slash so file names
# can be appended directly)
raw_data_path = '../data/raw-data/'
processed_data_path = '../data/processed-data/'

# when True, the processing cells below pickle their results into
# `processed_data_path`; set to False to run the notebook without writing
save_files = True

# dataframes: dataset name -> CSV file name
dataset_names = {
    'Awards': 'awards.csv', 
    'Example': 'example_test.csv', 
    'Players': 'players.csv',
    'Seasons': 'seasons.csv', 
    'Teams': 'teams.csv', 
    'Train': 'train_updated.csv'
}
# prefix every file name with the raw-data directory
for key in dataset_names:
    dataset_names[key] = raw_data_path + dataset_names[key]
dataset_names

## Auxiliary Functions

In [None]:
# Helper functions for loading the data
def unpack_json(json_str):
    """Parse one JSON string into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text to parse. Anything `pd.isna` reports as missing
        (``None``, ``NaN``) is treated as "no data".

    Returns
    -------
    pandas.DataFrame
        The parsed frame, or an empty DataFrame when `json_str` is missing.
    """
    # Local import keeps this helper self-contained.
    from io import StringIO

    if pd.isna(json_str):
        return pd.DataFrame()
    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas; a file-like wrapper is accepted by old and new versions alike.
    return pd.read_json(StringIO(json_str))

def unpack_data(data, dfs=None, n_jobs=-1):
    """Unnest the JSON strings stored in the columns of `data`.

    Each cell of every selected column is parsed with `unpack_json`
    (in parallel through joblib), and the resulting per-cell frames are
    concatenated into one DataFrame per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells hold JSON strings (or missing values).
    dfs : list of str, optional
        Column labels to unpack. When None, every column of `data` is unpacked.
    n_jobs : int, default -1
        Passed through to `joblib.Parallel`.

    Returns
    -------
    dict
        Maps each column name to the concatenated DataFrame of its cells.
    """
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    for name, column in data.items():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for item in column)
        unnested_dfs[name] = pd.concat(daily_dfs)
    return unnested_dfs

def create_id(df, id_cols, id_col_name, dt_col_name = 'Dt'):
    """Add a composite id column to `df` and return the same frame.

    The new column is named ``'Id' + dt_col_name + id_col_name`` and holds,
    for each row, the string form of the `id_cols` values joined by ``'_'``.
    `df` is modified in place.
    """
    def _join_row(row):
        # Stringify every value of the row, then glue them with underscores.
        return '_'.join(row.astype(str))

    new_col = 'Id' + dt_col_name + id_col_name
    df[new_col] = df[id_cols].apply(_join_row, axis=1)
    return df

## Loading the Train Updated dataset

In [None]:
%%time
df_train = pd.read_csv(dataset_names['Train'])

In [None]:
# Cache the loaded training frame as a pickle next to the raw data
df_train.to_pickle(raw_data_path + 'train.pkl')

### Targets

In [None]:
%%time
# criação do dataset de targets
# unpack the data
Y = unpack_data(df_train, dfs = ['nextDayPlayerEngagement'])['nextDayPlayerEngagement']

# change datatypes
Y = Y.astype({name: np.float32 for name in ["target1", "target2", "target3", "target4"]})

# match target dates to feature dates and create date index
Y = Y.rename(columns={'engagementMetricsDate': 'date'})

# change datatypes
Y['date'] = pd.to_datetime(Y['date'])

# reset index
Y = Y.set_index('date').to_period('D')
Y.index = Y.index - 1
Y = Y.reset_index()

# rename and select columns
cols_Y = {
    'date': 'Dt',
    'playerId': 'IdPlayer',
    'target1': 'target1',
    'target2': 'target2',
    'target3': 'target3',
    'target4': 'target4'
}
Y = Y[list(cols_Y)]
Y.columns = list(cols_Y.values())
Y['Dt'] = Y['Dt'].astype('datetime64[ns]')
Y = create_id(Y, ['Dt', 'IdPlayer'], 'Player')

if save_files:
    pd.to_pickle(Y, processed_data_path + 'targets.pkl')

del Y

### Player Box Scores

In [None]:
# Load the data: unnest the JSON stored in the `playerBoxScores` column
df_playerBoxScores = unpack_data(df_train, dfs = ['playerBoxScores'])['playerBoxScores']

# Build the games dataset: source column -> output column
cols = {
    # columns related to other dimensions
    'gamePk': 'IdGame',
    'gameDate': 'DtGame',
    'gameTimeUTC': 'DtGameUTC',
    'playerId': 'IdPlayer',
    'teamId': 'IdTeam',
    'jerseyNum': 'NuJersey',
    'positionCode': 'CdPosition',
    # suggested column
    'strikeOutsPitching': 'NuStrikeOutsPitching',
}  
# numeric columns (taken to be every column from position 12 onward)
for numeric_col in list(df_playerBoxScores.columns[12:]):
    # skip the columns that contain pitching data due to the amount of NaN values
    if 'Pitching' not in numeric_col:
        cols[numeric_col] = 'Nu' + numeric_col[0].upper() + numeric_col[1:]

# Append a midnight time to the game date. The column is still named
# `gameDate` at this point (it only becomes `DtGame` after the rename below),
# so it must be addressed by its source name.
# NOTE(review): presumably this makes the generated id match the
# datetime-based ids of the targets dataset; confirm.
df_playerBoxScores['gameDate'] = df_playerBoxScores['gameDate'] + " 00:00:00"

# Keep only the mapped columns and give them their output names
df_playerBoxScores = df_playerBoxScores[list(cols)]
df_playerBoxScores.columns = list(cols.values())
df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdPlayer'], 'Player')

# Save the dataset
if save_files:
    pd.to_pickle(df_playerBoxScores, processed_data_path + 'playerBoxScores.pkl')
del df_playerBoxScores

## Loading the Other datasets