# Data Preparation

Notebook que irá conter o código utilizado para a preparação dos dados do projeto.

Os dados "crus" estão localizados em **/src/data/raw-data**, e os dados tratados devem ser salvos em **/src/data/processed-data**.

In [2]:
import pickle as pkl
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

In [3]:
# Relative data folders used when the notebook runs locally:
# treated outputs go to processed-data, untouched inputs are read from raw-data.
processed_data_path = '../data/processed-data/'
raw_data_path = '../data/raw-data/'

### Auxiliary functions

These functions were taken from Kaggle's getting-started notebook.

In [4]:
# Helper function to unpack json found in daily data
def unpack_json(json_str):
    """Parse one nested-JSON cell of the daily data into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text held in a single cell. A missing value (anything for which
        ``pd.isna`` is true, e.g. NaN or None) is accepted.

    Returns
    -------
    pandas.DataFrame
        The parsed JSON, or an empty DataFrame when the cell is missing.
    """
    if pd.isna(json_str):
        return pd.DataFrame()
    # Passing literal JSON text straight to read_json is deprecated since
    # pandas 2.1; a file-like wrapper works on every pandas version.
    return pd.read_json(StringIO(json_str))


def unpack_data(data, dfs=None, n_jobs=-1):
    """Unnest the JSON columns of the daily data into one DataFrame per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells hold JSON text (or missing values).
    dfs : list of column labels, optional
        Subset of columns to unpack; all columns are processed when None.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel`` to control the number of workers.

    Returns
    -------
    dict
        Maps each processed column name to the concatenation of the
        DataFrames parsed from every row of that column.
    """
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    # iteritems() was removed in pandas 2.0; items() is the supported equivalent.
    for name, column in data.items():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for _, item in column.items())
        # pd.concat raises on an empty list, which happens when `data` has no rows.
        unnested_dfs[name] = pd.concat(daily_dfs) if daily_dfs else pd.DataFrame()
    return unnested_dfs

## Loading the Datasets

In [5]:
# Build the label -> relative CSV path lookup by prefixing each raw file name
# with the raw-data folder.
dataset_names = {
    label: raw_data_path + file_name
    for label, file_name in [
        ('Awards', 'awards.csv'),
        ('Example', 'example_test.csv'),
        ('Players', 'players.csv'),
        ('Seasons', 'seasons.csv'),
        ('Teams', 'teams.csv'),
        ('Train', 'train.csv'),
    ]
}
dataset_names

{'Awards': '../data/raw-data/awards.csv',
 'Example': '../data/raw-data/example_test.csv',
 'Players': '../data/raw-data/players.csv',
 'Seasons': '../data/raw-data/seasons.csv',
 'Teams': '../data/raw-data/teams.csv',
 'Train': '../data/raw-data/train.csv'}

In [6]:
# Read the example test file; its nested JSON columns are inspected one by one below.
df_test = pd.read_csv(filepath_or_buffer=dataset_names['Example'])

In [7]:
# Print dtypes and non-null counts, then display the first five rows.
df_test.info()
df_test.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date                    5 non-null      int64  
 1   games                   5 non-null      object 
 2   rosters                 5 non-null      object 
 3   playerBoxScores         5 non-null      object 
 4   teamBoxScores           5 non-null      object 
 5   transactions            4 non-null      object 
 6   standings               5 non-null      object 
 7   awards                  1 non-null      object 
 8   events                  5 non-null      object 
 9   playerTwitterFollowers  0 non-null      float64
 10  teamTwitterFollowers    0 non-null      float64
dtypes: float64(2), int64(1), object(8)
memory usage: 568.0+ bytes


Unnamed: 0,date,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20210426,"[{""gamePk"":634374,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-26"",""t...","[{""home"":1,""gamePk"":634377,""gameDate"":""2021-04...","[{""home"":1,""teamId"":139,""gamePk"":634343,""gameD...","[{""transactionId"":480386,""playerId"":543685,""pl...","[{""season"":2021,""gameDate"":""2021-04-26"",""divis...",,"[{""gamePk"":634433,""gameDate"":""2021-04-26"",""gam...",,
1,20210427,"[{""gamePk"":634318,""gameType"":""R"",""season"":2021...","[{""playerId"":443558,""gameDate"":""2021-04-27"",""t...","[{""home"":1,""gamePk"":634320,""gameDate"":""2021-04...","[{""home"":1,""teamId"":117,""gamePk"":634333,""gameD...","[{""transactionId"":480456,""playerId"":642162,""pl...","[{""season"":2021,""gameDate"":""2021-04-27"",""divis...",,"[{""gamePk"":634332,""gameDate"":""2021-04-27"",""gam...",,
2,20210428,"[{""gamePk"":634309,""gameType"":""R"",""season"":2021...","[{""playerId"":429722,""gameDate"":""2021-04-28"",""t...","[{""home"":1,""gamePk"":634310,""gameDate"":""2021-04...","[{""home"":0,""teamId"":111,""gamePk"":634310,""gameD...","[{""transactionId"":480728,""playerId"":545358,""pl...","[{""season"":2021,""gameDate"":""2021-04-28"",""divis...",,"[{""gamePk"":634317,""gameDate"":""2021-04-28"",""gam...",,
3,20210429,"[{""gamePk"":634330,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-29"",""t...","[{""home"":1,""gamePk"":634330,""gameDate"":""2021-04...","[{""home"":0,""teamId"":119,""gamePk"":634346,""gameD...","[{""transactionId"":480993,""playerId"":606965,""pl...","[{""season"":2021,""gameDate"":""2021-04-29"",""divis...",,"[{""gamePk"":634346,""gameDate"":""2021-04-29"",""gam...",,
4,20210430,"[{""gamePk"":634287,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-30"",""t...","[{""home"":1,""gamePk"":634305,""gameDate"":""2021-04...","[{""home"":1,""teamId"":135,""gamePk"":634303,""gameD...",,"[{""season"":2021,""gameDate"":""2021-04-30"",""divis...","[{""awardId"":""NLRRELMON"",""awardName"":""NL Reliev...","[{""gamePk"":634327,""gameDate"":""2021-04-30"",""gam...",,


#### Column: games

In [8]:
# Unpack the first day's `games` JSON into a table.
unpack_json(df_test.loc[0, 'games'])


Unnamed: 0,gamePk,gameType,season,gameDate,gameTimeUTC,resumeDate,resumedFrom,codedGameState,detailedGameState,isTie,...,homeWinner,homeScore,awayId,awayName,awayAbbrev,awayWins,awayLosses,awayWinPct,awayWinner,awayScore
0,634374,R,2021,2021-04-26,2021-04-27T01:45:00Z,,,F,Final,False,...,True,12,115,Colorado Rockies,COL,8,14,0.364,False,0
1,634377,R,2021,2021-04-26,2021-04-27T00:05:00Z,,,F,Final,False,...,False,4,108,Los Angeles Angels,LAA,11,10,0.524,True,9
2,634433,R,2021,2021-04-26,2021-04-26T23:05:00Z,,,F,Final,False,...,True,4,147,New York Yankees,NYY,9,13,0.409,False,2
3,634363,R,2021,2021-04-26,2021-04-27T02:10:00Z,,,F,Final,False,...,False,3,113,Cincinnati Reds,CIN,10,12,0.455,True,5
4,634402,R,2021,2021-04-26,2021-04-26T23:10:00Z,,,F,Final,False,...,True,8,112,Chicago Cubs,CHC,10,12,0.455,False,7
5,634461,R,2021,2021-04-26,2021-04-26T23:40:00Z,,,F,Final,False,...,False,0,146,Miami Marlins,MIA,10,12,0.455,True,8
6,634343,R,2021,2021-04-26,2021-04-26T23:10:00Z,,,F,Final,False,...,False,1,133,Oakland Athletics,OAK,15,8,0.652,True,2
7,634345,R,2021,2021-04-26,2021-04-26T23:45:00Z,,,F,Final,False,...,False,1,143,Philadelphia Phillies,PHI,11,11,0.5,True,2
8,634382,R,2021,2021-04-26,2021-04-27T00:10:00Z,,,F,Final,False,...,True,5,136,Seattle Mariners,SEA,13,10,0.565,False,2
9,634394,R,2021,2021-04-26,2021-04-26T22:10:00Z,,,F,Final,False,...,True,5,142,Minnesota Twins,MIN,7,14,0.333,False,3


#### Column: rosters

In [9]:
# Unpack the first day's `rosters` JSON and preview its first rows.
unpack_json(df_test.loc[0, 'rosters']).head()

Unnamed: 0,playerId,gameDate,teamId,statusCode,status
0,405395,2021-04-26,108,A,Active
1,408234,2021-04-26,116,A,Active
2,444482,2021-04-26,109,A,Active
3,445276,2021-04-26,119,A,Active
4,446334,2021-04-26,137,A,Active


In [10]:
# NOTE(review): this cell repeats the playerBoxScores preview that the next cell
# (under "Column: playerBoxScores") also runs; it looks like a leftover duplicate
# sitting in the rosters section. Confirm and remove one of the two.
unpack_json(df_test['playerBoxScores'][0]).head()

Unnamed: 0,home,gamePk,gameDate,gameTimeUTC,teamId,teamName,playerId,playerName,jerseyNum,positionCode,...,catchersInterferencePitching,sacBuntsPitching,sacFliesPitching,saves,holds,blownSaves,assists,putOuts,errors,chances
0,1,634377,2021-04-26,2021-04-27T00:05:00Z,140,Texas Rangers,624431,Jose Trevino,23,2,...,,,,,,,0,5,0,5
1,0,634451,2021-04-26,2021-04-26T17:10:00Z,118,Kansas City Royals,641531,Hunter Dozier,17,9,...,,,,,,,0,1,0,1
2,1,634377,2021-04-26,2021-04-27T00:05:00Z,140,Texas Rangers,669256,Nick Solak,15,4,...,,,,,,,1,4,0,5
3,0,634382,2021-04-26,2021-04-27T00:10:00Z,136,Seattle Mariners,641786,Kyle Lewis,1,8,...,,,,,,,0,1,0,1
4,1,634377,2021-04-26,2021-04-27T00:05:00Z,140,Texas Rangers,643396,Isiah Kiner-Falefa,9,6,...,,,,,,,3,1,0,4


#### Column: playerBoxScores

In [11]:
# Unpack the first day's `playerBoxScores` JSON and preview its first rows.
unpack_json(df_test.loc[0, 'playerBoxScores']).head()

Unnamed: 0,home,gamePk,gameDate,gameTimeUTC,teamId,teamName,playerId,playerName,jerseyNum,positionCode,...,catchersInterferencePitching,sacBuntsPitching,sacFliesPitching,saves,holds,blownSaves,assists,putOuts,errors,chances
0,1,634377,2021-04-26,2021-04-27T00:05:00Z,140,Texas Rangers,624431,Jose Trevino,23,2,...,,,,,,,0,5,0,5
1,0,634451,2021-04-26,2021-04-26T17:10:00Z,118,Kansas City Royals,641531,Hunter Dozier,17,9,...,,,,,,,0,1,0,1
2,1,634377,2021-04-26,2021-04-27T00:05:00Z,140,Texas Rangers,669256,Nick Solak,15,4,...,,,,,,,1,4,0,5
3,0,634382,2021-04-26,2021-04-27T00:10:00Z,136,Seattle Mariners,641786,Kyle Lewis,1,8,...,,,,,,,0,1,0,1
4,1,634377,2021-04-26,2021-04-27T00:05:00Z,140,Texas Rangers,643396,Isiah Kiner-Falefa,9,6,...,,,,,,,3,1,0,4


#### Column: teamBoxScores

In [12]:
# Unpack the first day's `teamBoxScores` JSON and preview its first rows.
unpack_json(df_test.loc[0, 'teamBoxScores']).head()

Unnamed: 0,home,teamId,gamePk,gameDate,gameTimeUTC,flyOuts,groundOuts,runsScored,doubles,triples,...,hitBatsmen,balks,wildPitches,pickoffsPitching,rbiPitching,inheritedRunners,inheritedRunnersScored,catchersInterferencePitching,sacBuntsPitching,sacFliesPitching
0,1,139,634343,2021-04-26,2021-04-26T23:10:00Z,5,7,1,0,0,...,2,0,0,0,2,0,0,0,0,0
1,1,117,634382,2021-04-26,2021-04-27T00:10:00Z,4,10,5,1,0,...,1,0,0,0,2,0,0,0,0,0
2,1,138,634345,2021-04-26,2021-04-26T23:45:00Z,3,7,1,0,0,...,0,0,0,0,2,0,0,0,0,0
3,1,116,634451,2021-04-26,2021-04-26T17:10:00Z,5,6,2,2,2,...,0,0,0,0,3,0,0,0,1,1
4,0,147,634433,2021-04-26,2021-04-26T23:05:00Z,6,9,2,2,0,...,1,0,0,0,3,0,0,0,0,0


#### Column: transactions

In [13]:
# Unpack the first day's `transactions` JSON and preview its first rows.
unpack_json(df_test.loc[0, 'transactions']).head()

Unnamed: 0,transactionId,playerId,playerName,date,fromTeamId,fromTeamName,toTeamId,toTeamName,effectiveDate,resolutionDate,typeCode,typeDesc,description
0,480386,543685,Anthony Rendon,2021-04-26,,,108,Los Angeles Angels,2021-04-26,2021-04-26,SC,Status Change,Los Angeles Angels activated 3B Anthony Rendon...
1,479907,664852,Jay Flaa,2021-04-26,3282.0,Orioles Alternate Training Site,110,Baltimore Orioles,2021-04-26,,SE,Selected,Baltimore Orioles selected the contract of RHP...
2,480224,592858,Rowan Wick,2021-04-26,,,112,Chicago Cubs,2021-04-26,2021-04-26,SC,Status Change,Chicago Cubs placed RHP Rowan Wick on the 60-d...
3,480378,519222,Austin Romine,2021-04-26,,,112,Chicago Cubs,2021-04-26,2021-04-26,SC,Status Change,Chicago Cubs placed C Austin Romine on the 10-...
4,480380,656547,Jonathan Holder,2021-04-26,,,112,Chicago Cubs,2021-04-26,2021-04-26,SC,Status Change,Chicago Cubs placed RHP Jonathan Holder on the...


#### Column: standings

In [14]:
# Unpack the first day's `standings` JSON and preview its first rows.
unpack_json(df_test.loc[0, 'standings']).head()

Unnamed: 0,season,gameDate,divisionId,teamId,teamName,streakCode,divisionRank,leagueRank,wildCardRank,leagueGamesBack,...,grassLosses,turfWins,turfLosses,divWins,divLosses,alWins,alLosses,nlWins,nlLosses,xWinLossPct
0,2021,2021-04-26,205,112,Chicago Cubs,L3,4,11,9.0,4.5,...,12,0,0,6,9,,,6.0,9.0,0.4
1,2021,2021-04-26,204,146,Miami Marlins,W1,4,13,11.0,4.5,...,12,0,0,4,2,1.0,3.0,1.0,3.0,0.25
2,2021,2021-04-26,204,121,New York Mets,W1,1,5,,3.0,...,8,0,0,7,4,,,0.0,3.0,0.0
3,2021,2021-04-26,200,140,Texas Rangers,L4,5,13,10.0,6.0,...,6,6,8,2,2,1.0,5.0,1.0,5.0,0.167
4,2021,2021-04-26,204,144,Atlanta Braves,W1,3,10,8.0,4.5,...,12,0,0,5,8,3.0,1.0,3.0,1.0,0.75


#### Column: awards

In [15]:
# Row 4 is used here because it is the only row of the example file with a non-null `awards` value.
unpack_json(df_test.loc[4, 'awards']).head()

Unnamed: 0,awardId,awardName,awardDate,awardSeason,playerId,playerName,awardPlayerTeamId
0,NLRRELMON,NL Reliever of the Month,2021-04-30,2021,453343,Mark Melancon,135
1,NLROM,NL Rookie of the Month,2021-04-30,2021,669432,Trevor Rogers,146
2,NLPITOM,NL Pitcher of the Month,2021-04-30,2021,594798,Jacob deGrom,121
3,NLPOM,NL Player of the Month,2021-04-30,2021,660670,Ronald Acuna Jr.,144
4,ALRRELMON,AL Reliever of the Month,2021-04-30,2021,598264,Matt Barnes,111


#### Column: events

In [16]:
# Unpack the first day's `events` JSON and preview its first rows.
unpack_json(df_test.loc[0, 'events']).head()

Unnamed: 0,gamePk,gameDate,gameTimeUTC,season,gameType,playId,eventId,inning,halfInning,homeScore,...,vX0,vY0,vZ0,x,y,x0,y0,z0,type,zone
0,634433,2021-04-26,2021-04-26T23:05:00Z,2021,R,,8,6,bottom,2,...,,,,,,,,,action,
1,634451,2021-04-26,2021-04-26T17:10:00Z,2021,R,,0,7,bottom,2,...,,,,,,,,,action,
2,634343,2021-04-26,2021-04-26T23:10:00Z,2021,R,,0,7,bottom,1,...,,,,,,,,,action,
3,634433,2021-04-26,2021-04-26T23:05:00Z,2021,R,,0,8,bottom,4,...,,,,,,,,,action,
4,634394,2021-04-26,2021-04-26T22:10:00Z,2021,R,,4,8,top,2,...,,,,,,,,,action,
