# Data Preparation

## Transform Data

Notebook com o código de processamento e transformação dos dados. Este notebook deve ser executado após o notebook `pre-process-data.ipynb`.

Os dados "crus" estão localizados em `/src/data/raw-data`, e os dados tratados são salvos em `/src/data/processed-data`.

## Initial Setup

In [1]:
# Imports
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed

In [2]:
# Global Variables
# File paths. Both are relative, so they resolve against the kernel's current
# working directory; each ends with '/' because file names are appended to
# them by plain string concatenation.
raw_data_path = '../data/raw-data/'  # untouched ("raw") input files
processed_data_path = '../data/processed-data/'  # cleaned / transformed files

In [3]:
# Loading data
# Both frames are read from the processed-data folder (presumably written by
# `pre-process-data.ipynb`, which must run before this notebook).
# NOTE: unpickling executes arbitrary code; only load pickles produced by this
# project's own pipeline.
df_targets = pd.read_pickle(processed_data_path + 'targets.pkl')
df_pbs = pd.read_pickle(processed_data_path + 'playerBoxScores.pkl')

## Data Transformations

### Reduce Memory Usage

In [4]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime

def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    For every NumPy integer, unsigned-integer or float column, the column's
    min and max are compared against the limits of progressively wider dtypes
    of the same family and the first (narrowest) one that holds the whole
    range is used.  A column is never widened.  Timezone-naive datetime
    columns are normalised to ``datetime64[ns]``.  Every other column (object,
    category, bool, timedelta, complex, tz-aware datetimes, pandas extension
    dtypes) is left untouched.

    The frame is modified **in place** and also returned, and a short
    before/after memory report is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    use_float16 : bool, default True
        Allow ``float16`` as a target for float columns.  ``float16`` keeps
        only about 3 significant decimal digits, so values are rounded even
        when they are inside its range; pass ``False`` to stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per family, ordered from narrowest to widest.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32, np.float64) if use_float16 else (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if is_datetime(df[col]):
            # tz-aware columns cannot be cast to a naive datetime64 without
            # dropping (or failing on) the timezone, so leave them alone.
            if getattr(col_type, 'tz', None) is None:
                df[col] = df[col].astype('datetime64[ns]')
            continue

        # Only plain NumPy int / uint / float columns are downcast.  Dispatch
        # on dtype.kind rather than on the dtype's name so that 'uint*' and
        # 'bool' columns are not mistaken for floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        if col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_ints, np.iinfo
        else:
            candidates, limits = signed_ints, np.iinfo

        # For empty or all-NaN columns these are NaN, every comparison below
        # is False and the column keeps its dtype.
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Nothing narrower is left to try; never upcast.
                break
            bounds = limits(candidate)
            # Bounds are inclusive: the extreme values are representable.
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

#### Df Targets

In [5]:
# Downcast the targets frame; reduce_mem_usage modifies it in place and
# returns the same object, so rebinding the name is only for readability.
df_targets = reduce_mem_usage(df_targets)

Memory usage of dataframe is 71.99 MB
Memory usage after optimization is: 71.99 MB
Decreased by 0.0%


In [6]:
# Column dtypes and total memory only; show_counts=False omits the
# per-column non-null counts.
df_targets.info(show_counts=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2695788 entries, 0 to 2695787
Data columns (total 7 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Dt          datetime64[ns]
 1   IdPlayer    int32         
 2   target1     float16       
 3   target2     float16       
 4   target3     float16       
 5   target4     float16       
 6   IdDtPlayer  object        
dtypes: datetime64[ns](1), float16(4), int32(1), object(1)
memory usage: 72.0+ MB


#### Player Box Scores

In [7]:
# Downcast the player box scores frame; reduce_mem_usage modifies it in place
# and returns the same object.
df_pbs = reduce_mem_usage(df_pbs)

Memory usage of dataframe is 29.97 MB
Memory usage after optimization is: 29.97 MB
Decreased by 0.0%


In [8]:
# Full summary: dtype and non-null count for each column.
df_pbs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219727 entries, 0 to 451
Data columns (total 54 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   IdGame                    219727 non-null  int32         
 1   DtGame                    219727 non-null  datetime64[ns]
 2   DtGameUTC                 219727 non-null  object        
 3   IdPlayer                  219727 non-null  int32         
 4   IdTeam                    219727 non-null  int16         
 5   NuJersey                  219690 non-null  object        
 6   CdPosition                219727 non-null  int8          
 7   NuStrikeOutsPitching      65466 non-null   float16       
 8   NuBattingOrder            183390 non-null  float16       
 9   NuGamesPlayedBatting      183395 non-null  float16       
 10  NuFlyOuts                 183395 non-null  float16       
 11  NuGroundOuts              183395 non-null  float16       
 12  NuRun

## Save Prepared Datasets

In [9]:
# NOTE: these are the same paths the frames were loaded from at the top of the
# notebook, so the input pickles are overwritten with the downcast versions.
df_targets.to_pickle(processed_data_path + 'targets.pkl')
df_pbs.to_pickle(processed_data_path + 'playerBoxScores.pkl')