In [1]:
import pandas as pd
import numpy as np

## <font color=black>Leitura de Dados</font> 

In [2]:
# Load each raw table of the dataset from its CSV export.
# The targets on the left line up one-to-one with the file names below.
(country, league, match, player,
 player_attributes, team, team_attributes) = map(
    pd.read_csv,
    (
        'Country_Original.csv',
        'League_Original.csv',
        'Match_Original.csv',
        'Player_Original.csv',
        'Player_Attributes_Original.csv',
        'Team_Original.csv',
        'Team_Attributes_Original.csv',
    ),
)

# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

## <font color=black>Organização e Limpeza de Dados</font> 

Nesta secção do trabalho, reduzimos e tratamos os dados de forma a eliminar a informação irrelevante para o modelo que pretendemos construir.

### <font color=black> - Eliminação de Valores Nulos</font>

In [3]:
# Keep only the columns that come before 'potential': 'potential' itself and
# every column to its right are removed from the player attributes table.
columns_from_potential = player_attributes.loc[:, "potential":].columns
player_attributes = player_attributes.drop(columns=columns_from_potential)

In [4]:
# Drop the player-position columns, i.e. the contiguous column range
# from 'home_player_X1' through 'away_player_Y11'.
position_columns = match.loc[:, "home_player_X1":"away_player_Y11"].columns
match = match.drop(columns=position_columns)

# Drop the contiguous column range from 'goal' through 'possession'
# (shoton, shotoff, foulcommit, card, cross, corner, possession; the slice
# is inclusive, so 'goal' itself is removed as well).
event_columns = match.loc[:, "goal":"possession"].columns
match = match.drop(columns=event_columns)

In [5]:
# Convert 'date' to datetime64[ns], then order the matches chronologically
# (oldest first).
match = (
    match
    .assign(date=match['date'].astype('datetime64[ns]'))
    .sort_values(by=['date'], ascending=True)
)

In [6]:
# Discard matches that carry no player information at all, i.e. rows where
# every column from 'home_player_1' through 'away_player_11' is missing.
lineup_columns = match.loc[:, "home_player_1":"away_player_11"].columns.tolist()
match = match.dropna(axis=0, how="all", subset=lineup_columns)

In [7]:
# Discard matches without a single betting odd, i.e. rows where every
# column from 'B365H' up to the last column is missing.
odds_columns = match.loc[:, "B365H":].columns.tolist()
match = match.dropna(axis=0, how="all", subset=odds_columns)

In [8]:
# Keep only matches where at least 8 of the home-side player columns
# ('home_player_1' through 'home_player_11') are filled in.
home_player_columns = match.loc[:, "home_player_1":"home_player_11"].columns.tolist()
match = match.dropna(axis=0, thresh=8, subset=home_player_columns)

# Check: list any remaining row with fewer than 8 known home players
# (an empty frame means the filter worked).
known_home_players = match[home_player_columns].notna().sum(axis=1)
match.loc[known_home_players < 8]

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA


In [9]:
# Keep only matches where at least 8 of the away-side player columns
# ('away_player_1' through 'away_player_11') are filled in.
away_player_columns = match.loc[:, "away_player_1":"away_player_11"].columns.tolist()
match = match.dropna(axis=0, thresh=8, subset=away_player_columns)

# Check: list any remaining row with fewer than 8 known away players
# (an empty frame means the filter worked).
known_away_players = match[away_player_columns].notna().sum(axis=1)
match.loc[known_away_players < 8]

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA


### <font color=black> - Substituição de Valores Nulos </font>

In [10]:
# Make sure the 'date' column of both tables is stored as datetime64[ns].
for dated_frame in (match, player_attributes):
    dated_frame['date'] = dated_frame['date'].astype('datetime64[ns]')


In [43]:
# Prototype of the rating lookup on a small sample: for each of the first
# 10 matches, find the most recent attribute record of 'home_player_1'
# that is dated strictly before the match date.
match_copy_test = match.head(10).copy()  # explicit copy, independent of `match`

for index, row in match_copy_test.iterrows():
    # All attribute records belonging to this match's 'home_player_1'.
    player_skill = player_attributes.loc[player_attributes['player_api_id'] == row['home_player_1']]
    # Only records dated before the match are eligible.
    date_season = player_skill.loc[player_skill['date'] < row['date']]

    # A missing player id, or a player with no earlier record, leaves nothing
    # to pick from; idxmax() raises on an empty selection, so skip the row.
    if date_season.empty:
        continue

    # Latest eligible record; displayed so the lookup can be inspected.
    # (`display` is provided by the IPython/Jupyter kernel.)
    recent_date = date_season.loc[date_season['date'].idxmax()]
    display(recent_date)

# TODO: replace each away_player_<i> id (i = 1..11) in `match` with that
# player's overall_rating, e.g. through a left merge on player_api_id.

id                                  57706
player_fifa_api_id                    268
player_api_id                       33826
date                  2007-08-30 00:00:00
overall_rating                         68
Name: 57705, dtype: object

id                                 170990
player_fifa_api_id                 135594
player_api_id                       23794
date                  2007-08-30 00:00:00
overall_rating                         74
Name: 170989, dtype: object

id                                  39732
player_fifa_api_id                    185
player_api_id                       36286
date                  2007-08-30 00:00:00
overall_rating                         81
Name: 39731, dtype: object

id                                 158432
player_fifa_api_id                 157804
player_api_id                       36373
date                  2007-08-30 00:00:00
overall_rating                         77
Name: 158431, dtype: object

id                                 153893
player_fifa_api_id                 162473
player_api_id                       22978
date                  2007-08-30 00:00:00
overall_rating                         61
Name: 153892, dtype: object

id                                 171760
player_fifa_api_id                  16254
player_api_id                       31465
date                  2007-08-30 00:00:00
overall_rating                         79
Name: 171759, dtype: object

id                                  70626
player_fifa_api_id                 135451
player_api_id                       30455
date                  2007-08-30 00:00:00
overall_rating                         81
Name: 70625, dtype: object

id                                  48277
player_fifa_api_id                  51539
player_api_id                       30726
date                  2007-08-30 00:00:00
overall_rating                         85
Name: 48276, dtype: object

id                                 171979
player_fifa_api_id                  53012
player_api_id                       27313
date                  2007-08-30 00:00:00
overall_rating                         82
Name: 171978, dtype: object

id                                  63083
player_fifa_api_id                  29183
player_api_id                       33339
date                  2007-08-30 00:00:00
overall_rating                         72
Name: 63082, dtype: object

In [15]:
# Preview the first five rows of player_attributes.
player_attributes.head()


Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating
0,1,218353,505942,2016-02-18,67.0
1,2,218353,505942,2015-11-19,67.0
2,3,218353,505942,2015-09-21,62.0
3,4,218353,505942,2015-03-20,61.0
4,5,218353,505942,2007-02-22,61.0


In [16]:
# Latest 'date' among the first five rows of player_attributes.
# Series.max() is vectorised and skips missing dates (skipna=True by default),
# whereas the builtin max() gives an order-dependent result when NaT is present.
player_attributes.head()['date'].max()

Timestamp('2016-02-18 00:00:00')