# Data Preparation

## Transform Data

Notebook que irá conter o código para o processamento e transformações de dados. Este notebook deve ser rodado após o notebook `pre-process-data.ipynb`.

os dados "crus" estão localizados em `/src/data/raw-data` e os dados tratados estão salvos em `/src/data/processed-data`.

## Initial Setup

In [1]:
# Imports
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Global Variables
# file paths
raw_data_path = '../data/raw-data/'
processed_data_path = '../data/processed-data/'


TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

RANDOM_SEED = 42

TEST_SPLIT_DATE = '2021-04-30'

In [54]:
# Loading data
df_targets = pd.read_pickle(processed_data_path + 'targets.pkl')
df_pbs = pd.read_pickle(processed_data_path + 'playerBoxScores.pkl')
df_tbs = pd.read_pickle(processed_data_path + 'teamBoxScores.pkl')
df_g = pd.read_pickle(processed_data_path + 'games.pkl')

## Data Transformations

### Reduce Memory Usage

In [55]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object and is_datetime(df[col]) == False and col_type != 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif is_datetime(df[col]) == True:
            df[col] = df[col].astype('datetime64[ns]')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

#### Df Targets

In [56]:
#df_targets = reduce_mem_usage(df_targets);

#### Player Box Scores

In [57]:
df_pbs = reduce_mem_usage(df_pbs);

Memory usage of dataframe is 36.67 MB
Memory usage after optimization is: 33.74 MB
Decreased by 8.0%


#### Team Box Scores

In [58]:
df_tbs = reduce_mem_usage(df_tbs);

Memory usage of dataframe is 6.72 MB
Memory usage after optimization is: 1.43 MB
Decreased by 78.6%


#### Games

In [8]:
#df_g = reduce_mem_usage(df_g);

### Sorting and Shifting

In [59]:
# Funções auxiliares para o pré-processamento dos dados
def sort_df(df: pd.DataFrame, columns: list = ['IdPlayer', 'Dt']) -> None:
    """Sort the dataframe by the columns passed as argument.
    
    Args:
        df (pd.DataFrame): Dataframe to be sorted.
        columns (list, optional): Columns to sort the dataframe. Defaults to ['IdPlayer', 'Dt'].
        
        Returns:
            None
    """
    df.sort_values(by=columns, inplace=True)
    # reset index
    df.reset_index(drop=True, inplace=True)


def shift_targets(df, shift_vals: list = [1, 2, 3, 4, 5, 6, 7, 14, 30]):
    """Shift the targets by the values passed as argument.

    Args:
        df (pd.DataFrame): Dataframe to be shifted.
        shift_vals (list, optional): Values to shift the targets. Defaults to [1, 2, 3, 4, 5, 6, 7, 14, 30].

    Returns:
        pd.DataFrame: Dataframe with the shifted targets.
    """
    df_aux = pd.DataFrame()
    # Iterate over players to make the shift only using the player data
    for player in df['IdPlayer'].unique():
        df_player = df[df['IdPlayer'] == player]
        # Iterate over the pre-defined shift values
        for shift_val in shift_vals:
            # Iterate over the targets
            for target in TARGET_COLS:
                # Make the shift
                df_player[f'{target}_shift_{shift_val}'] = df_player[target].shift(shift_val)
        # Concatenate the player data with the rest of the data
        df_aux = pd.concat([df_aux, df_player], axis=0)
        # Remove the player data from memory
        del df_player
    # df.dropna(inplace=True)
    return df_aux


def train_test_split(
    df: pd.DataFrame
    ,test_split_date: str = TEST_SPLIT_DATE
    ):
    """Split the dataframe into train and test sets.

    Args:
        df (pd.DataFrame): Dataframe to be split.
        test_split_date (str, optional): Date to split the dataframe. Defaults to TEST_SPLIT_DATE.
    """

    train = df[(df.Dt <= "2021-01-31") & (df.Dt >= "2018-01-01")] 
    val = df[(df.Dt <= "2021-04-30") & (df.Dt >= "2021-02-01")] 
    test = df[(df.Dt <= "2021-07-31") & (df.Dt >= "2021-05-01")]
    # train.to_csv('train.csv', index=None)
    # val.to_csv('validation.csv', index=None) 
    # test.to_csv('test.csv', index=None) 

    return train, test, val


def x_y_split(df: pd.DataFrame, target_cols: list = TARGET_COLS):
    """Split the dataframe into x and y sets.

    Args:
        df (pd.DataFrame): Dataframe to be split.
    """
    y = df[target_cols]
    x = df.drop(target_cols, axis=1)
    return x, y

In [60]:
%%time
sort_df(df_targets)
df_train = shift_targets(df_targets, shift_vals=[1, 2, 3, 4, 5, 6, 7])

Wall time: 3min 29s


## Feature Engineering

### Datetime

In [61]:
# Transform the datetime col into new features
df_train['DtYear'] = df_train['Dt'].dt.year
df_train['DtMonth'] = df_train['Dt'].dt.month
df_train['DtDay'] = df_train['Dt'].dt.day
df_train['DtDayOfWeek'] = df_train['Dt'].dt.dayofweek
df_train['DtDayOfYear'] = df_train['Dt'].dt.dayofyear
df_train['DtQuarter'] = df_train['Dt'].dt.quarter
# get the hour and minute from the PBS
df_pbs['DtHour'] = df_pbs['DtGameUTC'].dt.hour
df_pbs['DtMinute'] = df_pbs['DtGameUTC'].dt.minute
df_pbs['DtHour'][df_pbs['DtHour'] == 0] = 24

In [62]:
media_por_jogador = df_train.groupby('IdPlayer')[TARGET_COLS].mean()
mediana_por_jogador = df_train.groupby('IdPlayer')[TARGET_COLS].median()

In [63]:
media_por_jogador = media_por_jogador.rename(columns={
    'target1': 'target1_pmean',
    'target2': 'target2_pmean',
    'target3': 'target3_pmean',
    'target4': 'target4_pmean'
})

mediana_por_jogador = mediana_por_jogador.rename(columns={
    'target1': 'target1_pmedian',
    'target2': 'target2_pmedian',
    'target3': 'target3_pmedian',
    'target4': 'target4_pmedian'
})

## Joining Datasets

### Média e mediana por jogador

In [64]:
df_train = pd.merge(df_train, media_por_jogador, on=['IdPlayer'], how='left')
df_train = pd.merge(df_train, mediana_por_jogador, on=['IdPlayer'], how='left')

In [65]:
df_train

Unnamed: 0,Dt,IdPlayer,target1,target2,target3,target4,IdDtPlayer,target1_shift_1,target2_shift_1,target3_shift_1,...,DtDayOfYear,DtQuarter,target1_pmean,target2_pmean,target3_pmean,target4_pmean,target1_pmedian,target2_pmedian,target3_pmedian,target4_pmedian
0,2018-01-01,112526,0.055277,5.496109,0.025839,16.176470,2018-01-01 00:00:00_112526,,,,...,1,1,0.890797,4.631284,2.095763,15.132475,0.035116,1.479535,0.107288,10.270016
1,2018-01-02,112526,0.060625,3.252914,0.030486,8.541353,2018-01-02 00:00:00_112526,0.055277,5.496109,0.025839,...,2,1,0.890797,4.631284,2.095763,15.132475,0.035116,1.479535,0.107288,10.270016
2,2018-01-03,112526,0.029341,1.648352,0.032613,10.490111,2018-01-03 00:00:00_112526,0.060625,3.252914,0.030486,...,3,1,0.890797,4.631284,2.095763,15.132475,0.035116,1.479535,0.107288,10.270016
3,2018-01-04,112526,0.014799,2.665894,0.087422,19.091467,2018-01-04 00:00:00_112526,0.029341,1.648352,0.032613,...,4,1,0.890797,4.631284,2.095763,15.132475,0.035116,1.479535,0.107288,10.270016
4,2018-01-05,112526,0.083916,1.161002,0.024759,6.643879,2018-01-05 00:00:00_112526,0.014799,2.665894,0.087422,...,5,1,0.890797,4.631284,2.095763,15.132475,0.035116,1.479535,0.107288,10.270016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695783,2021-07-27,685503,0.000642,0.182785,0.000482,0.574713,2021-07-27 00:00:00_685503,0.000661,0.109511,0.000882,...,208,3,0.036275,0.730502,0.048631,0.268987,0.000000,0.000000,0.000000,0.027349
2695784,2021-07-28,685503,0.000515,0.144995,0.000681,0.068164,2021-07-28 00:00:00_685503,0.000642,0.182785,0.000482,...,209,3,0.036275,0.730502,0.048631,0.268987,0.000000,0.000000,0.000000,0.027349
2695785,2021-07-29,685503,0.000536,0.139173,0.000389,0.079939,2021-07-29 00:00:00_685503,0.000515,0.144995,0.000681,...,210,3,0.036275,0.730502,0.048631,0.268987,0.000000,0.000000,0.000000,0.027349
2695786,2021-07-30,685503,0.000452,0.193631,0.002074,0.217098,2021-07-30 00:00:00_685503,0.000536,0.139173,0.000389,...,211,3,0.036275,0.730502,0.048631,0.268987,0.000000,0.000000,0.000000,0.027349


### Player Box Scores

In [66]:
df_train = pd.merge(df_train, df_pbs, on=['IdDtPlayer'], how='left')

# Substitui os valores Nan das seguintes colunas por 0
f = [c for c in df_train.columns if c not in ['IdGame', 'DtGame', 'DtGameUTC',
'IdPlayer_y','IdTeam','NuJersey','CdPosition', 'target1_shift_1', 'target2_shift_1',
'target3_shift_1','target1_shift_2', 'target3_shift_2', 'target4_shift_2',
'target1_shift_3','target2_shift_3','target3_shift_3','target4_shift_3',
'target1_shift_4','target2_shift_4','target3_shift_4','target4_shift_4',
'target1_shift_5','target2_shift_5','target3_shift_5','target4_shift_5',
'target1_shift_6','target2_shift_6','target3_shift_6','target4_shift_6',
'target1_shift_7','target2_shift_7','target3_shift_7','target4_shift_7']]

df_train[f] = df_train[f].fillna(0)        

# Remove os na das seguintes colunas
df_train = df_train.dropna(subset=[             
    'target1_shift_1', 'target2_shift_1', 'target3_shift_1', 'target1_shift_2',
    'target3_shift_2', 'target4_shift_2', 'target1_shift_3', 'target2_shift_3',
    'target3_shift_3', 'target4_shift_3', 'target1_shift_4', 'target2_shift_4',
    'target3_shift_4', 'target4_shift_4', 'target1_shift_5', 'target3_shift_5',
    'target4_shift_5', 'target1_shift_6', 'target2_shift_6', 'target3_shift_6',
    'target4_shift_6', 'target1_shift_7', 'target2_shift_7', 'target3_shift_7',
    'target4_shift_7'])

df_train.rename(columns={'IdPlayer_x': 'IdPlayer'}, inplace=True)
df_train.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2684030 entries, 7 to 2698456
Columns: 106 entries, Dt to DtMinute
dtypes: datetime64[ns](3), float16(46), float32(40), float64(6), int64(7), object(4)
memory usage: 1.0+ GB


### Team Box Scores

In [67]:
df_train = pd.merge(df_train, df_tbs, on = ['IdDtTeam'], how = 'left')

In [68]:
# Substitui os valores Nan das seguintes colunas por 0
f = [
    'FlgHome','NuFlyOuts_Team', 'NuGroundOuts_Team', 'NuRunsScored_Team',
    'NuDoubles_Team', 'NuTriples_Team', 'NuHomeRuns_Team', 'NuStrikeOuts_Team',
    'NuBaseOnBalls_Team', 'NuIntentionalWalks_Team', 'NuHits_Team', 'NuHitByPitch_Team',
    'NuAtBats_Team', 'NuCaughtStealing_Team', 'NuStolenBases_Team', 'NuGroundIntoDoublePlay_Team',
    'NuGroundIntoTriplePlay_Team', 'NuPlateAppearances_Team', 'NuTotalBases_Team', 'NuRbi_Team',
    'NuLeftOnBase_Team', 'NuSacBunts_Team', 'NuSacFlies_Team', 'NuCatchersInterference_Team',
    'NuPickoffs_Team', 'NuAirOutsPitching_Team', 'NuGroundOutsPitching_Team', 'NuRunsPitching_Team',
    'NuDoublesPitching_Team', 'NuTriplesPitching_Team', 'NuHomeRunsPitching_Team',
    'NuStrikeOutsPitching_Team', 'NuBaseOnBallsPitching_Team', 'NuIntentionalWalksPitching_Team',
    'NuHitsPitching_Team', 'NuHitByPitchPitching_Team', 'NuAtBatsPitching_Team',
    'NuCaughtStealingPitching_Team', 'NuStolenBasesPitching_Team', 'NuInningsPitched_Team',
    'NuEarnedRuns_Team', 'NuBattersFaced_Team', 'NuOutsPitching_Team', 'NuHitBatsmen_Team',
    'NuBalks_Team', 'NuWildPitches_Team', 'NuPickoffsPitching_Team', 'NuRbiPitching_Team',
    'NuInheritedRunners_Team', 'NuInheritedRunnersScored_Team', 'NuCatchersInterferencePitching_Team',
    'NuSacBuntsPitching_Team', 'NuSacFliesPitching_Team'
]

df_train[f] = df_train[f].fillna(0)        

In [69]:
df_train = df_train.rename(columns={'IdDtGame_y': 'IdDtGame'})

#### Games

In [70]:
df_train = pd.merge(df_train, df_g, on = ['IdDtGame'], how = 'left')

In [71]:
f = [
    'NuSeason', 'NuGame',
    'NuScheduledInnings', 'NuGamesInSeries', 'NuWinsHomeTeam',
    'NuLossesHomeTeam', 'NuWinPctHomeTeam', 'NuScoreHomeTeam',
    'NuWinsAwayTeam', 'NuLossesAwayTeam', 'NuWinPctAwayTeam', 'NuScoreAwayTeam'
]
df_train[f] = df_train[f].fillna(0)        

In [72]:
df_train.info(null_counts=True, max_cols = 150)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2693260 entries, 0 to 2693259
Columns: 188 entries, Dt to NuScoreAwayTeam
dtypes: datetime64[ns](5), float16(48), float32(40), float64(75), int64(7), object(13)
memory usage: 2.7+ GB


In [73]:
df_train = pd.get_dummies(df_train, columns = ['CdPosition', "CdGameType", "CdGameState", "CdDoubleHeader", 
                                    "CdDayNight", "FlgWinnerHomeTeam", "FlgWinnerAwayTeam",'FlgTie'])

In [74]:
# Dropa colunas com vários valores Nan
df_train.drop([
    'IdGame_x', 'DtGame_x', 'DtGameUTC', 'IdPlayer_y',
    'IdTeam_x', 'IdTeam_y', "NuGameTimeUTC_Team", "IdDtGame", "DtGame", "IdGame",
    'IdGame_y', 'NuJersey', "DtGame_y", "IdHomeTeam", "IdAwayTeam"], axis = 1, inplace = True)

### Reduce memory usage

In [80]:
df_train_reduced = reduce_mem_usage(df_train[df_train.columns.difference(TARGET_COLS)]);
df_train_reduced[TARGET_COLS] = df_train[TARGET_COLS]
df_train = df_train_reduced.copy()
del df_train_reduced

Memory usage of dataframe is 2278.25 MB
Memory usage after optimization is: 1058.22 MB
Decreased by 53.6%


  df_train_reduced[TARGET_COLS] = df_train[TARGET_COLS]
  df_train_reduced[TARGET_COLS] = df_train[TARGET_COLS]
  df_train_reduced[TARGET_COLS] = df_train[TARGET_COLS]
  df_train_reduced[TARGET_COLS] = df_train[TARGET_COLS]


In [81]:
df_train.info(max_cols = 200)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2693260 entries, 0 to 2693259
Data columns (total 195 columns):
 #    Column                               Dtype         
---   ------                               -----         
 0    CdDayNight_day                       float16       
 1    CdDayNight_night                     float16       
 2    CdDoubleHeader_N                     float16       
 3    CdDoubleHeader_S                     float16       
 4    CdDoubleHeader_Y                     float16       
 5    CdGameState_Completed Early          float16       
 6    CdGameState_F                        float16       
 7    CdGameState_Final                    float16       
 8    CdGameType_A                         float16       
 9    CdGameType_D                         float16       
 10   CdGameType_F                         float16       
 11   CdGameType_L                         float16       
 12   CdGameType_R                         float16       
 13   CdGameType

## Save Prepared Datasets

In [82]:
df_targets.to_pickle(processed_data_path + 'targets.pkl')
df_pbs.to_pickle(processed_data_path + 'playerBoxScores.pkl')
df_tbs.to_pickle(processed_data_path + 'teamBoxScores.pkl')
df_g.to_pickle(processed_data_path + 'games.pkl')

df_train.to_pickle(processed_data_path + 'train.pkl')

In [83]:
pd.read_pickle(processed_data_path + 'train.pkl').info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2693260 entries, 0 to 2693259
Columns: 195 entries, CdDayNight_day to target4
dtypes: datetime64[ns](1), float16(180), float32(4), int16(2), int32(1), int8(4), object(3)
memory usage: 1.1+ GB
