# Pré-Processamento dos dados

## Sumário
a) Aquisição dos dados

b) Importando as bibliotecas

c) Leitura dos arquivos

d) Valores faltantes

e) Feature engineering

f) Encoding das variáveis categóricas

g) Normalização das variáveis numéricas

h) Salvando arquivos essenciais para as próximas etapas

---

#### a) Aquisição dos dados

Dados adquiridos através do RStudio, utilizando uma API própria para fazer a raspagem desses tipo de dados. Pode ser encontrada neste link: https://github.com/JaseZiv/worldfootballR

#### b) Importando as bibliotecas

In [1]:
import sys
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, '../') # set home directory
from src.features.funcs import *

import warnings
warnings.filterwarnings("ignore")

#### c) Leitura dos arquivos

In [2]:
df_xg = pd.read_csv('../data/intermediate/top5_xg.csv')
print(df_xg.shape)
df_xg.head()

(362781, 20)


Unnamed: 0,league,minute,result,X,Y,xG,player,h_a,player_id,situation,season,shotType,match_id,home_team,away_team,home_goals,away_goals,date,player_assisted,lastAction
0,Bundesliga,11,SavedShot,0.903,0.239,0.034118,Thomas Müller,h,224,OpenPlay,2014,RightFoot,5447,Bayern Munich,Wolfsburg,2,1,2014-08-22 19:30:00,Philipp Lahm,Chipped
1,Bundesliga,17,SavedShot,0.852,0.277,0.030941,Arjen Robben,h,392,OpenPlay,2014,LeftFoot,5447,Bayern Munich,Wolfsburg,2,1,2014-08-22 19:30:00,Philipp Lahm,Pass
2,Bundesliga,26,BlockedShot,0.803,0.277,0.021718,Arjen Robben,h,392,OpenPlay,2014,LeftFoot,5447,Bayern Munich,Wolfsburg,2,1,2014-08-22 19:30:00,Holger Badstuber,Chipped
3,Bundesliga,28,SavedShot,0.871,0.324,0.050345,Thomas Müller,h,224,OpenPlay,2014,LeftFoot,5447,Bayern Munich,Wolfsburg,2,1,2014-08-22 19:30:00,nobody,
4,Bundesliga,29,SavedShot,0.918,0.531,0.111078,Robert Lewandowski,h,227,OpenPlay,2014,RightFoot,5447,Bayern Munich,Wolfsburg,2,1,2014-08-22 19:30:00,Gianluca Gaudino,Chipped


#### d) Valores faltantes

In [3]:
check_nan(df_xg)
#df_xg.dropna(inplace=True)

Unnamed: 0,feature,missing_total,percentage
0,lastAction,42464,0.117


#### e) Feature engineering

In [4]:
# Criação de features que possam trazer ganho durante a modelagem e/ou análises
df_xg['player_team'] = np.where(df_xg.h_a == 'h',df_xg.home_team, df_xg.away_team)
df_xg['opponent_team'] = np.where(df_xg.h_a == 'a',df_xg.home_team, df_xg.away_team)
df_xg['winner_team'] = np.where(df_xg.home_goals > df_xg.away_goals,df_xg.home_team, 
                                np.where(df_xg.home_goals > df_xg.away_goals,df_xg.away_team,'Tie'))
df_xg['half'] = np.where(df_xg.minute <= 45,'first', 'second')

# Remoção de algumas features
df_model = df_xg.drop(['result','match_id','date'], axis=1)
df_model

Unnamed: 0,league,minute,X,Y,xG,player,h_a,player_id,situation,season,...,home_team,away_team,home_goals,away_goals,player_assisted,lastAction,player_team,opponent_team,winner_team,half
0,Bundesliga,11,0.903,0.239,0.034118,Thomas Müller,h,224,OpenPlay,2014,...,Bayern Munich,Wolfsburg,2,1,Philipp Lahm,Chipped,Bayern Munich,Wolfsburg,Bayern Munich,first
1,Bundesliga,17,0.852,0.277,0.030941,Arjen Robben,h,392,OpenPlay,2014,...,Bayern Munich,Wolfsburg,2,1,Philipp Lahm,Pass,Bayern Munich,Wolfsburg,Bayern Munich,first
2,Bundesliga,26,0.803,0.277,0.021718,Arjen Robben,h,392,OpenPlay,2014,...,Bayern Munich,Wolfsburg,2,1,Holger Badstuber,Chipped,Bayern Munich,Wolfsburg,Bayern Munich,first
3,Bundesliga,28,0.871,0.324,0.050345,Thomas Müller,h,224,OpenPlay,2014,...,Bayern Munich,Wolfsburg,2,1,nobody,,Bayern Munich,Wolfsburg,Bayern Munich,first
4,Bundesliga,29,0.918,0.531,0.111078,Robert Lewandowski,h,227,OpenPlay,2014,...,Bayern Munich,Wolfsburg,2,1,Gianluca Gaudino,Chipped,Bayern Munich,Wolfsburg,Bayern Munich,first
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362776,Ligue_1,69,0.976,0.387,0.301148,Nicholas Gioacchini,a,9973,OpenPlay,2021,...,Angers,Montpellier,2,0,Béni Makouana,Pass,Montpellier,Angers,Angers,second
362777,Ligue_1,81,0.843,0.747,0.020374,Ambroise Oyongo,a,6734,OpenPlay,2021,...,Angers,Montpellier,2,0,Pedro Mendes,Chipped,Montpellier,Angers,Angers,second
362778,Ligue_1,81,0.948,0.723,0.023237,Nicholas Gioacchini,a,9973,OpenPlay,2021,...,Angers,Montpellier,2,0,nobody,Rebound,Montpellier,Angers,Angers,second
362779,Ligue_1,84,0.744,0.291,0.010909,Junior Sambia,a,6239,OpenPlay,2021,...,Angers,Montpellier,2,0,nobody,BallRecovery,Montpellier,Angers,Angers,second


#### f) Encoding das variáveis categóricas

Segundo um [estudo](https://towardsdatascience.com/one-hot-encoding-is-making-your-tree-based-ensembles-worse-heres-why-d64b282b5769?gi=36d45aec2dab) feito por um aluno de mestrado em ciência de dados na Universidade da Virgínia, o tradicional One-hot Encoding não funciona tão bem para modelos de árvores de decisão, dito isso utilizaremos o Target Encoding. 

In [5]:
df_model = cat_targetencoding(df_model, ['league', 'h_a', 'situation', 'shotType', 'home_team',
       'away_team', 'lastAction', 'player_team', 'opponent_team', 'winner_team', 'half'], 'xG')
df_model

Unnamed: 0,minute,X,Y,xG,player,player_id,season,home_goals,away_goals,player_assisted,...,h_a_encoded,situation_encoded,shotType_encoded,home_team_encoded,away_team_encoded,lastAction_encoded,player_team_encoded,opponent_team_encoded,winner_team_encoded,half_encoded
0,11,0.903,0.239,0.034118,Thomas Müller,224,2014,2,1,Philipp Lahm,...,0.111529,0.105887,0.110808,0.131004,0.108075,0.105594,0.134900,0.109731,0.136038,0.105012
1,17,0.852,0.277,0.030941,Arjen Robben,392,2014,2,1,Philipp Lahm,...,0.111529,0.105887,0.103835,0.131004,0.108075,0.084854,0.134900,0.109731,0.136038,0.105012
2,26,0.803,0.277,0.021718,Arjen Robben,392,2014,2,1,Holger Badstuber,...,0.111529,0.105887,0.103835,0.131004,0.108075,0.105594,0.134900,0.109731,0.136038,0.105012
3,28,0.871,0.324,0.050345,Thomas Müller,224,2014,2,1,nobody,...,0.111529,0.105887,0.103835,0.131004,0.108075,0.082744,0.134900,0.109731,0.136038,0.105012
4,29,0.918,0.531,0.111078,Robert Lewandowski,227,2014,2,1,Gianluca Gaudino,...,0.111529,0.105887,0.110808,0.131004,0.108075,0.105594,0.134900,0.109731,0.136038,0.105012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362776,69,0.976,0.387,0.301148,Nicholas Gioacchini,9973,2021,2,0,Béni Makouana,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.084854,0.099829,0.109946,0.113160,0.113286
362777,81,0.843,0.747,0.020374,Ambroise Oyongo,6734,2021,2,0,Pedro Mendes,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.105594,0.099829,0.109946,0.113160,0.113286
362778,81,0.948,0.723,0.023237,Nicholas Gioacchini,9973,2021,2,0,nobody,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.221350,0.099829,0.109946,0.113160,0.113286
362779,84,0.744,0.291,0.010909,Junior Sambia,6239,2021,2,0,nobody,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.058744,0.099829,0.109946,0.113160,0.113286


#### g) Normalização das variáveis numéricas

In [6]:
cols = list(df_model.columns)
cols.remove('minute')

df_model = norm_features(df_model, cols)
df_model

Unnamed: 0,minute,X,Y,xG,player,player_id,season,home_goals,away_goals,player_assisted,...,h_a_encoded,situation_encoded,shotType_encoded,home_team_encoded,away_team_encoded,lastAction_encoded,player_team_encoded,opponent_team_encoded,winner_team_encoded,half_encoded
0,0.104762,0.903,0.239,0.034118,Thomas Müller,224,2014,2,1,Philipp Lahm,...,0.111529,0.105887,0.110808,0.131004,0.108075,0.105594,0.134900,0.109731,0.136038,0.105012
1,0.161905,0.852,0.277,0.030941,Arjen Robben,392,2014,2,1,Philipp Lahm,...,0.111529,0.105887,0.103835,0.131004,0.108075,0.084854,0.134900,0.109731,0.136038,0.105012
2,0.247619,0.803,0.277,0.021718,Arjen Robben,392,2014,2,1,Holger Badstuber,...,0.111529,0.105887,0.103835,0.131004,0.108075,0.105594,0.134900,0.109731,0.136038,0.105012
3,0.266667,0.871,0.324,0.050345,Thomas Müller,224,2014,2,1,nobody,...,0.111529,0.105887,0.103835,0.131004,0.108075,0.082744,0.134900,0.109731,0.136038,0.105012
4,0.276190,0.918,0.531,0.111078,Robert Lewandowski,227,2014,2,1,Gianluca Gaudino,...,0.111529,0.105887,0.110808,0.131004,0.108075,0.105594,0.134900,0.109731,0.136038,0.105012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362776,0.657143,0.976,0.387,0.301148,Nicholas Gioacchini,9973,2021,2,0,Béni Makouana,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.084854,0.099829,0.109946,0.113160,0.113286
362777,0.771429,0.843,0.747,0.020374,Ambroise Oyongo,6734,2021,2,0,Pedro Mendes,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.105594,0.099829,0.109946,0.113160,0.113286
362778,0.771429,0.948,0.723,0.023237,Nicholas Gioacchini,9973,2021,2,0,nobody,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.221350,0.099829,0.109946,0.113160,0.113286
362779,0.800000,0.744,0.291,0.010909,Junior Sambia,6239,2021,2,0,nobody,...,0.107097,0.105887,0.110808,0.106130,0.102067,0.058744,0.099829,0.109946,0.113160,0.113286


#### h) Salvando arquivos essenciais para as próximas etapas

In [7]:
df_xg.to_csv('../data/processed/clean_data.csv', decimal=',', index=False)
df_model.to_csv('../data/processed/model_data.csv', decimal=',', index=False)