> Fundação Getúlio Vargas - RJ <br>
> Escola de Matemática Aplicada (EMAp) <br>
> Graduação em Ciência de Dados e Inteligência Artificial <br>
> Alunos: Gianlucca Devigili e Maisa O. Fraiz <br>
# Projetos em Ciência de Dados - A1

## Instruções de execução

**(#1)** De modo a tornar mais rápida a carga dos dados, o projeto utiliza uma cópia dos arquivos em formato pickle (`.pkl`). Para executar o projeto utilizando o dataset em formato `.csv`, atribua o valor `True` à variável `load_from_csv`.

**(#2)** Redefina a variável `raw_data_path` para o caminho até o arquivo `train_updated.csv`. Substitua o nome do arquivo caso necessário.

**(#3)** Atribua o valor `True` para a variável `save_files` caso ainda não tenha os arquivos `.pkl` dos datasets auxiliares salvos. (Necessário apenas para a primeira execução da seção de preparação de dados.)

In [1]:
# Configuration variables
# (#1) Selects the loading branch: True reads the .csv, False reads the .pkl copy
load_from_csv = False

# (#2) Directory of the raw data and full path to the .csv dataset
raw_data_path = '../data/raw-data/'
dataset_path = raw_data_path + 'train_updated.csv'

# Directory where the processed data is saved
processed_data_path = '../data/processed-data/'

# (#3) When True, the auxiliary dataframes are pickled into processed_data_path
save_files = False

## 0. Setup Inicial

In [19]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed


# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Global variables

# NOTE(review): same value as processed_data_path in the configuration cell;
# not referenced by the cells visible here.
PROCESSED_DATA_PATH = '../data/processed-data/'
# NOTE(review): not referenced by the cells visible here.
MODEL_PATH = '../models/trained-models/'
# Names of the target columns (used by shift_targets and x_y_split)
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']
# Default cut-off for train_test_split: rows with Dt <= this date go to train
TEST_SPLIT_DATE = '2021-04-30'

## Preparação dos Dados

### Funções Auxiliares

Algumas funções utilizadas para a preparação dos dados

In [4]:
# Funções auxiliares para carregar os dados
def unpack_json(json_str):
    """Parse one JSON cell of the raw dataset into a dataframe.

    Args:
        json_str: JSON text to parse, or a missing value (None / NaN).

    Returns:
        pd.DataFrame: The parsed records, or an empty dataframe when the
        cell is missing.
    """
    # Imported here so the function stays self-contained when it is shipped
    # to joblib workers.
    from io import StringIO

    if pd.isna(json_str):
        return pd.DataFrame()
    # Passing the literal JSON text straight to read_json is deprecated in
    # recent pandas; a file-like wrapper works on every version.
    return pd.read_json(StringIO(json_str))

def unpack_data(data, dfs=None, n_jobs=-1):
    """Unnest the JSON columns of the raw dataset into flat dataframes.

    Args:
        data (pd.DataFrame): Raw dataframe whose cells hold JSON text
            (or missing values).
        dfs (list, optional): Names of the columns to unnest. Defaults to
            None, which unnests every column of ``data``.
        n_jobs (int, optional): Number of joblib workers used to parse the
            cells of a column. Defaults to -1.

    Returns:
        dict: Maps each column name to the concatenation of the dataframes
        parsed from its cells. The per-cell indices are kept as they are,
        so the resulting index is generally not unique.
    """
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    # .items() replaces .iteritems(), which was removed in pandas 2.0
    for name, column in data.items():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for item in column)
        # pd.concat raises on an empty list, which happens when data has no rows
        unnested_dfs[name] = pd.concat(daily_dfs) if daily_dfs else pd.DataFrame()
    return unnested_dfs

def create_id(df, id_cols, id_col_name, dt_col_name = 'Dt'):
    """Add a composite string key built from several columns.

    The values of ``id_cols`` are converted to text row by row and joined
    with '_'. The result is stored in a column named
    'Id' + dt_col_name + id_col_name.

    Args:
        df (pd.DataFrame): Dataframe to receive the key; modified in place.
        id_cols (list): Columns whose values make up the key, in order.
        id_col_name (str): Suffix of the new column name.
        dt_col_name (str, optional): Middle part of the new column name.
            Defaults to 'Dt'.

    Returns:
        pd.DataFrame: The same ``df`` object, with the key column added.
    """
    def _row_key(row):
        # Each row arrives as a Series; stringify its values and glue them
        return '_'.join(row.astype(str))

    key_column = 'Id' + dt_col_name + id_col_name
    key_values = df[id_cols].apply(_row_key, axis=1)
    df[key_column] = key_values
    return df

### Carregando os dados

In [5]:
%%time
# (#1) Load the raw dataset, from the .csv or from the pickled copy
if load_from_csv:
    df_train = pd.read_csv(dataset_path)
else: 
    # dataset_path is repointed to the pickled copy in the raw-data directory
    dataset_path = raw_data_path + 'train.pkl'
    df_train = pd.read_pickle(dataset_path)

CPU times: user 940 ms, sys: 2.83 s, total: 3.77 s
Wall time: 6.15 s


In [6]:
# Inspect the raw dataset: column summary and first rows
df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308 entries, 0 to 1307
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   date                     1308 non-null   int64 
 1   nextDayPlayerEngagement  1308 non-null   object
 2   games                    729 non-null    object
 3   rosters                  1307 non-null   object
 4   playerBoxScores          627 non-null    object
 5   teamBoxScores            627 non-null    object
 6   transactions             1194 non-null   object
 7   standings                623 non-null    object
 8   awards                   309 non-null    object
 9   events                   624 non-null    object
 10  playerTwitterFollowers   43 non-null     object
 11  teamTwitterFollowers     43 non-null     object
dtypes: int64(1), object(11)
memory usage: 122.8+ KB


Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20180101,"[{""engagementMetricsDate"":""2018-01-02"",""player...",,"[{""playerId"":400121,""gameDate"":""2018-01-01"",""t...",,,"[{""transactionId"":340732,""playerId"":547348,""pl...",,,,"[{""date"":""2018-01-01"",""playerId"":545361,""playe...","[{""date"":""2018-01-01"",""teamId"":147,""teamName"":..."
1,20180102,"[{""engagementMetricsDate"":""2018-01-03"",""player...",,"[{""playerId"":134181,""gameDate"":""2018-01-02"",""t...",,,"[{""transactionId"":339458,""playerId"":621173,""pl...",,,,,
2,20180103,"[{""engagementMetricsDate"":""2018-01-04"",""player...",,"[{""playerId"":425492,""gameDate"":""2018-01-03"",""t...",,,"[{""transactionId"":347527,""playerId"":572389,""pl...",,,,,
3,20180104,"[{""engagementMetricsDate"":""2018-01-05"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-04"",""t...",,,"[{""transactionId"":339549,""playerId"":545343,""pl...",,,,,
4,20180105,"[{""engagementMetricsDate"":""2018-01-06"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-05"",""t...",,,"[{""transactionId"":341195,""playerId"":628336,""pl...",,,,,


#### Cria o Dataframe Targets

Dataframe contendo as 4 variáveis _target_ bem como suas respectivas chaves `IdPlayer` e `Dt`.

In [7]:
%%time 
# Build the targets dataset
# Unnest the JSON stored in the 'nextDayPlayerEngagement' column
Y = unpack_data(df_train, dfs = ['nextDayPlayerEngagement'])['nextDayPlayerEngagement']

# Store the four targets as float32
Y = Y.astype({name: np.float32 for name in ["target1", "target2", "target3", "target4"]})

# Rename the engagement date column to 'date'
Y = Y.rename(columns={'engagementMetricsDate': 'date'})

# Parse the date text into datetimes
Y['date'] = pd.to_datetime(Y['date'])

# Move every date back by one day: in the raw data the engagement date is the
# day after the row's own date (the 20180101 row carries 2018-01-02).
# The date goes through the index as a daily period, is decremented, and is
# then turned back into a column, leaving a fresh integer index.
Y = Y.set_index('date').to_period('D')
Y.index = Y.index - 1
Y = Y.reset_index()

# Select the columns to keep and rename them (source name -> final name)
cols_Y = {
    'date': 'Dt',
    'playerId': 'IdPlayer',
    'target1': 'target1',
    'target2': 'target2',
    'target3': 'target3',
    'target4': 'target4'
}
Y = Y[list(cols_Y)]
Y.columns = list(cols_Y.values())
# Convert the daily periods back to datetime64[ns]
Y['Dt'] = Y['Dt'].astype('datetime64[ns]')
# Add the composite key column 'IdDtPlayer' ("<Dt>_<IdPlayer>")
Y = create_id(Y, ['Dt', 'IdPlayer'], 'Player')

CPU times: user 2min 26s, sys: 1.86 s, total: 2min 28s
Wall time: 2min 32s


In [8]:
# Inspect the targets dataframe: column summary and first rows
Y.info()
Y.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2695788 entries, 0 to 2695787
Data columns (total 7 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Dt          datetime64[ns]
 1   IdPlayer    int64         
 2   target1     float32       
 3   target2     float32       
 4   target3     float32       
 5   target4     float32       
 6   IdDtPlayer  object        
dtypes: datetime64[ns](1), float32(4), int64(1), object(1)
memory usage: 102.8+ MB


Unnamed: 0,Dt,IdPlayer,target1,target2,target3,target4,IdDtPlayer
0,2018-01-01,628317,0.011167,4.474708,0.005168,5.735294,2018-01-01 00:00:00_628317
1,2018-01-01,547989,0.042993,5.593385,0.045033,2.794118,2018-01-01 00:00:00_547989
2,2018-01-01,519317,0.974327,56.177044,13.693746,64.166664,2018-01-01 00:00:00_519317
3,2018-01-01,607625,0.0067,2.675097,0.005168,1.862745,2018-01-01 00:00:00_607625
4,2018-01-01,592547,0.001117,0.632296,0.002953,0.931373,2018-01-01 00:00:00_592547


In [9]:
# (#3) Persist the targets when requested
if save_files:
    pd.to_pickle(Y, processed_data_path + 'targets.pkl')
# Y is deleted either way; a later cell reads targets.pkl back from disk
del Y

#### Cria o Dataframe Player Box Scores

In [10]:
%%time
# Unnest the JSON stored in the 'playerBoxScores' column
df_playerBoxScores = unpack_data(df_train, dfs = ['playerBoxScores'])['playerBoxScores']

CPU times: user 2 s, sys: 206 ms, total: 2.2 s
Wall time: 6.42 s


In [11]:
# Inspect the player box scores: column summary and first rows
df_playerBoxScores.info()
df_playerBoxScores.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219727 entries, 0 to 451
Data columns (total 85 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   home                          219727 non-null  int64  
 1   gamePk                        219727 non-null  int64  
 2   gameDate                      219727 non-null  object 
 3   gameTimeUTC                   219727 non-null  object 
 4   teamId                        219727 non-null  int64  
 5   teamName                      219727 non-null  object 
 6   playerId                      219727 non-null  int64  
 7   playerName                    219727 non-null  object 
 8   jerseyNum                     219690 non-null  object 
 9   positionCode                  219727 non-null  int64  
 10  positionName                  219727 non-null  object 
 11  positionType                  219727 non-null  object 
 12  battingOrder                  183390 non-null  

Unnamed: 0,home,gamePk,gameDate,gameTimeUTC,teamId,teamName,playerId,playerName,jerseyNum,positionCode,...,catchersInterferencePitching,sacBuntsPitching,sacFliesPitching,saves,holds,blownSaves,assists,putOuts,errors,chances
0,1,529418,2018-03-29,2018-03-29T23:08:00Z,119,Los Angeles Dodgers,605131,Austin Barnes,15,12,...,,,,,,,,,,
1,1,529406,2018-03-29,2018-03-29T20:00:00Z,139,Tampa Bay Rays,605480,Mallex Smith,0,7,...,,,,,,,0.0,0.0,0.0,0.0
2,0,529416,2018-03-29,2018-03-29T20:10:00Z,143,Philadelphia Phillies,546318,Odubel Herrera,37,8,...,,,,,,,0.0,0.0,0.0,0.0
3,0,529412,2018-03-29,2018-03-29T20:05:00Z,108,Los Angeles Angels,527043,Jefry Marte,19,3,...,,,,,,,0.0,1.0,0.0,1.0
4,1,529408,2018-03-29,2018-03-29T20:15:00Z,118,Kansas City Royals,449181,Paulo Orlando,16,8,...,,,,,,,0.0,2.0,0.0,2.0


In [12]:
# Columns to keep, mapped from their source name to their final name
cols = {
    # columns related to other dimensions
    'gamePk': 'IdGame',
    'gameDate': 'DtGame',
    'gameTimeUTC': 'DtGameUTC',
    'playerId': 'IdPlayer',
    'teamId': 'IdTeam',
    'jerseyNum': 'NuJersey',
    'positionCode': 'CdPosition',
    # suggested column
    'strikeOutsPitching': 'NuStrikeOutsPitching',
}  
# Numeric columns: everything from position 12 onwards ('battingOrder' in the
# info() listing) is treated as numeric and renamed to 'Nu' + capitalized name
for numeric_col in list(df_playerBoxScores.columns[12:]):
    # skip the columns that contain pitching data because of the number of NaN values
    # ('strikeOutsPitching' is still kept through its explicit entry above)
    if 'Pitching' not in numeric_col:
        cols[numeric_col] = 'Nu' + numeric_col[0].upper() + numeric_col[1:]

# Append a midnight time to the date text, giving the 'YYYY-MM-DD 00:00:00'
# form seen in the targets' IdDtPlayer values
# NOTE(review): presumably so both keys can be matched later - confirm
df_playerBoxScores['gameDate'] = df_playerBoxScores['gameDate'] + " 00:00:00"

# Keep only the selected columns and apply the final names
df_playerBoxScores = df_playerBoxScores[list(cols)]
df_playerBoxScores.columns = list(cols.values())
# Add the composite key column 'IdDtPlayer' ("<DtGame>_<IdPlayer>")
df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdPlayer'], 'Player')

In [13]:
# (#3) Persist the player box scores when requested
if save_files:
    pd.to_pickle(df_playerBoxScores, processed_data_path + 'playerBoxScores.pkl')
# The dataframe is deleted either way
del df_playerBoxScores

In [14]:
# The raw dataset is no longer used by the cells below
del df_train

## Preparação dos dados

In [15]:
# Load the pickled targets; the file must already exist (see save_files)
df = pd.read_pickle(processed_data_path + 'targets.pkl')

### Funções Auxiliares

In [26]:
def sort_df(df: pd.DataFrame, columns: list = ['IdPlayer', 'Dt']) -> None:
    """Order the rows of a dataframe in place and renumber its index.

    Args:
        df (pd.DataFrame): Dataframe to reorder; it is modified in place.
        columns (list, optional): Sort keys, in order of priority.
            Defaults to ['IdPlayer', 'Dt'].

    Returns:
        None
    """
    sort_keys = columns
    df.sort_values(by=sort_keys, inplace=True)
    # Discard the old index labels so the index follows the new row order
    df.reset_index(inplace=True, drop=True)


def shift_targets(df, shift_vals: list = [1, 2, 3, 4, 5, 6, 7, 14, 30]):
    """Add lagged copies of every target column, computed per player.

    For each player, each value in ``shift_vals`` and each column in
    ``TARGET_COLS``, a column named ``{target}_shift_{shift_val}`` is added
    holding the target moved down by ``shift_val`` rows within that player's
    rows. The shift is positional, so the rows are expected to be ordered by
    date within each player before calling this function.

    Args:
        df (pd.DataFrame): Dataframe with an 'IdPlayer' column and the
            columns listed in ``TARGET_COLS``. It is not modified.
        shift_vals (list, optional): Row offsets to shift the targets by.
            Defaults to [1, 2, 3, 4, 5, 6, 7, 14, 30].

    Returns:
        pd.DataFrame: New dataframe with the original columns plus the
        shifted ones. Rows are grouped by player in order of first
        appearance, keeping their index labels and their order within each
        player. Rows with a missing 'IdPlayer' are left out. When no player
        is present, an empty dataframe with the same columns is returned.
    """
    # (new column name, source target, offset), in the order the columns are added
    shift_specs = [
        (f'{target}_shift_{shift_val}', target, shift_val)
        for shift_val in shift_vals
        for target in TARGET_COLS
    ]

    # One groupby pass replaces a full-frame boolean mask per player, and the
    # pieces are concatenated once instead of growing a dataframe in the loop
    player_frames = []
    for _, df_player in df.groupby('IdPlayer', sort=False):
        # Work on a copy so the new columns never touch the caller's data
        df_player = df_player.copy()
        for col_name, target, shift_val in shift_specs:
            df_player[col_name] = df_player[target].shift(shift_val)
        player_frames.append(df_player)

    if not player_frames:
        # pd.concat raises on an empty list; keep the expected columns instead
        df_empty = df.iloc[0:0].copy()
        for col_name, target, _ in shift_specs:
            df_empty[col_name] = df_empty[target]
        return df_empty

    return pd.concat(player_frames, axis=0)


def train_test_split(df: pd.DataFrame, test_split_date: str = TEST_SPLIT_DATE):
    """Separate a dataframe into train and test parts by date.

    Args:
        df (pd.DataFrame): Dataframe with a 'Dt' column to compare against
            the split date.
        test_split_date (str, optional): Last date kept in the train part.
            Defaults to TEST_SPLIT_DATE.

    Returns:
        tuple: ``(train, test)``, where ``train`` has the rows with
        Dt <= test_split_date and ``test`` the rows with Dt > test_split_date.
    """
    dates = df['Dt']
    # Two explicit comparisons: a row whose date satisfies neither stays out of both parts
    is_train = dates <= test_split_date
    is_test = dates > test_split_date
    return df[is_train], df[is_test]


def x_y_split(df: pd.DataFrame, target_cols: list = TARGET_COLS):
    """Separate a dataframe into features and targets.

    Args:
        df (pd.DataFrame): Dataframe holding both features and targets.
        target_cols (list, optional): Names of the target columns.
            Defaults to TARGET_COLS.

    Returns:
        tuple: ``(x, y)``, where ``y`` holds only ``target_cols`` and ``x``
        holds every other column.
    """
    targets = df[target_cols]
    features = df.drop(columns=target_cols)
    return features, targets

### Ordenação dos valores

Os dados serão ordenados por jogador e então por data, para facilitar a criação das variáveis de _shift_.

In [23]:
# Sort in place by the default keys (IdPlayer, then Dt) and show the first rows
sort_df(df)
df.head()

Unnamed: 0,Dt,IdPlayer,target1,target2,target3,target4
0,2018-01-01,112526,0.055277,5.496109,0.025839,16.17647
1,2018-01-02,112526,0.060625,3.252914,0.030486,8.541353
2,2018-01-03,112526,0.029341,1.648352,0.032613,10.490111
3,2018-01-04,112526,0.014799,2.665894,0.087422,19.091467
4,2018-01-05,112526,0.083916,1.161002,0.024759,6.643879


### Shifts

In [27]:
# Add the per-player shifted targets for offsets of 1 to 7 rows and show the first rows
df = shift_targets(df, shift_vals=[1, 2, 3, 4, 5, 6, 7])
df.head()