In [1]:
import pandas as pd
import numpy as np

## <font color=black>Leitura de Dados</font> 

In [2]:
# Read all files 
country = pd.read_csv('Country_Original.csv')
league = pd.read_csv('League_Original.csv')
match = pd.read_csv('Match_Original.csv')
player = pd.read_csv('Player_Original.csv')
player_attributes = pd.read_csv('Player_Attributes_Original.csv')
team = pd.read_csv('Team_Original.csv')
team_attributes = pd.read_csv('Team_Attributes_Original.csv')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## <font color=black>Organização e Limpeza de Dados</font> 

Pretendemos nesta secção do trabalho reduzir e tratar os dados de forma a que não exista informação irrelevante para o modelo que pretendemos construir.

### <font color=black> - Eliminação de Valores Nulos</font>

In [3]:
# Eliminar atributos relativos à posição dos jogadores
player_attributes = player_attributes.drop(player_attributes.columns.to_series()["potential":], axis=1)


In [4]:
# Eliminar duplicados caso existam
player_attributes.drop_duplicates(keep = False, inplace = True)

In [5]:
# Verficar se existe overall_ratings null
player_attributes.dropna(axis=0, how="any", subset =['overall_rating'], inplace=True)
player_attributes['overall_rating'].isnull().sum()

0

In [6]:
# Eliminar atributos relativos à posição dos jogadores
match = match.drop(match.columns.to_series()["home_player_X1":"away_player_Y11"], axis=1)
# Eliminar os seguintes atributos -> shoton, shotoff, foulcommit, card, cross, corner, possession
match = match.drop(match.columns.to_series()["goal":"possession"], axis=1)

In [7]:
#Organizar dados por datas, do passado para o presente
match['date'] = match['date'].astype('datetime64[ns]')
match = match.sort_values(by=['date'], ascending=True)

In [8]:
#Eliminar todos as linhas que não tenham nenhuma info sobre os jogadores
match = match.dropna(axis=0, how="all", subset=match.columns.to_series()["home_player_1":"away_player_11"])

In [9]:
#Eliminar todas as linhas que não tenham nenhuma odd relativa ao jogo
match.dropna(axis=0, how="all", subset=match.columns.to_series()["B365H":], inplace=True)

In [10]:
#Eliminar partidas em que não são conhecidos pelo menos 8 jogadores que jogaram (para a Equipa da Casa)
match = match.dropna(axis=0, thresh=8, subset=match.columns.to_series()["home_player_1":"home_player_11"])
#Verificação
match.loc[match.loc[:,'home_player_1':'home_player_11'].count(axis=1) < 8]

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA


In [11]:
#Eliminar partidas em que não são conhecidos pelo menos 8 jogadores que jogaram (para a Equipa de Fora)
match = match.dropna(axis=0, thresh=8, subset=match.columns.to_series()["away_player_1":"away_player_11"])
#Verificação
match.loc[match.loc[:,'away_player_1':'away_player_11'].count(axis=1) < 8]

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA


### <font color=black> - Substituição de Valores Nulos </font>

In [12]:
#Passar o formato dos atributos 'date' para formato de tempo
match['date'] = match['date'].astype('datetime64[ns]') 
player_attributes['date'] = player_attributes['date'].astype('datetime64[ns]') 


In [13]:
#Cópia do dataframe original
match_copy_test=  match.copy()

match_copy_test = match_copy_test.head(100)


for i in range(1,12):
    for index, row in match_copy_test.iterrows():
        
        if(row["home_player_"+"%s" % i] == np.nan):
            continue
        if(row["away_player_"+"%s" % i]== np.nan):
            continue
        try:    
            player_skill_home = player_attributes.loc[player_attributes['player_api_id'] == row["home_player_"+"%s" % i]]
            date_season_home = player_skill_home.loc[player_skill_home['date'] <= row['date']]

            recent_date_home = date_season_home.loc[[date_season_home.date.idxmax()]]
            match_copy_test.at[row['id']-1,'home_player_'+'%s' % i] = recent_date_home['overall_rating']

            player_skill_away = player_attributes.loc[player_attributes['player_api_id'] == row["away_player_"+"%s" % i]]
            date_season_away = player_skill_away.loc[player_skill_away['date'] <= row['date']]

            recent_date_away = date_season_away.loc[[date_season_away.date.idxmax()]]
            match_copy_test.at[row['id']-1,'away_player_'+'%s' % i] = recent_date_away['overall_rating'].tolist()[0]
            
            print(row['id']-1,"-------",recent_date_away['player_api_id'].tolist()[0] ,recent_date_away['overall_rating'].tolist()[0])
                
        except ValueError:
            pass
  
    
    
#Put away players rating in match    
#for i in range(1,12):
    #result_away = pd.merge(match, player_attributes[['player_api_id','overall_rating']], left_on="away_player_"+"%s" % i, right_on='player_api_id', how='left')
    #match["away_player_"+"%s" % i] = result_away["overall_rating"]

21888 ------- 32657 82.0
1756 ------- 23686 75.0
1755 ------- 34421 77.0
1754 ------- 30622 81.0
1751 ------- 36374 77.0
1750 ------- 30633 77.0
1749 ------- 30660 84.0
1748 ------- 23021 68.0
7831 ------- 27299 80.0
7832 ------- 25524 84.0
19710 ------- 69259 63.0
19709 ------- 25199 65.0
19708 ------- 32693 67.0
10269 ------- 30989 79.0
7834 ------- 37377 74.0
21896 ------- 37579 73.0
21895 ------- 33845 67.0
10273 ------- 39351 76.0
19706 ------- 11024 62.0
19705 ------- 32679 63.0
7829 ------- 26173 78.0
1757 ------- 24527 68.0
21887 ------- 41466 74.0
4803 ------- 26343 80.0
4804 ------- 30458 81.0
4805 ------- 30742 82.0
4807 ------- 11321 74.0
4801 ------- 26190 71.0
4800 ------- 26252 66.0
4798 ------- 41301 79.0
7826 ------- 36058 65.0
17860 ------- 40604 68.0
1788 ------- 36374 77.0
4806 ------- 26359 78.0
10274 ------- 41243 64.0
10272 ------- 37503 74.0
10271 ------- 42042 70.0
10270 ------- 39477 72.0
4802 ------- 26126 79.0
10268 ------- 39204 73.0
19707 ------- 30974 82.

7828 ------- 17312 76.0
10266 ------- 41882 66.0
4799 ------- 38729 69.0
10267 ------- 24504 73.0
1753 ------- 23939 79.0
21889 ------- 33990 80.0
21894 ------- 34520 86.0
21893 ------- 11778 68.0
21891 ------- 38561 68.0
21892 ------- 34104 77.0
8092 ------- 36076 73.0
1752 ------- 43280 75.0
19910 ------- 32670 71.0
19912 ------- 32786 71.0
19911 ------- 23151 64.0
2086 ------- 24163 75.0
19909 ------- 23729 60.0
21863 ------- 30750 83.0
8095 ------- 26383 60.0
8094 ------- 38215 71.0
8090 ------- 28480 83.0
8088 ------- 30693 80.0
8087 ------- 28435 60.0
21857 ------- 30689 82.0
10602 ------- 41874 57.0
10600 ------- 39185 64.0
2084 ------- 26777 77.0
5128 ------- 46877 53.0
5129 ------- 40677 70.0
5130 ------- 40016 71.0
5131 ------- 26119 83.0
5137 ------- 30823 78.0
2078 ------- 38835 85.0
2083 ------- 30865 85.0
5132 ------- 11327 66.0
5134 ------- 26345 70.0
19915 ------- 32661 70.0
5135 ------- 26155 82.0
8093 ------- 37787 78.0
21859 ------- 38561 68.0
21860 ------- 80497 56.

8089 ------- 25483 74.0
5133 ------- 35499 75.0
10603 ------- 56585 68.0
10601 ------- 31288 82.0
10599 ------- 30721 82.0
10598 ------- 18816 75.0
10597 ------- 27694 76.0
10596 ------- 30940 72.0
8091 ------- 31290 71.0
21864 ------- 30666 79.0
21888 ------- 154257 60.0
1756 ------- 23678 72.0
1755 ------- 25150 71.0
1754 ------- 30849 74.0
1751 ------- 24747 73.0
1750 ------- 30338 72.0
1749 ------- 38818 80.0
1748 ------- 30595 79.0
7830 ------- 27336 68.0
7831 ------- 38913 72.0
7832 ------- 39106 76.0
19710 ------- 43275 66.0
19709 ------- 34447 71.0
19708 ------- 102777 64.0
10269 ------- 30460 86.0
7834 ------- 37389 69.0
21896 ------- 32999 60.0
21895 ------- 38160 71.0
10273 ------- 27695 75.0
19706 ------- 32924 64.0
19705 ------- 32664 63.0
7829 ------- 27550 72.0
1757 ------- 24655 78.0
21887 ------- 30958 80.0
4803 ------- 26353 75.0
4804 ------- 39963 65.0
4805 ------- 41174 73.0
4807 ------- 26176 67.0
4801 ------- 5217 74.0
4800 ------- 31318 70.0
4798 ------- 41303 63

19910 ------- 41592 66.0
19912 ------- 39660 79.0
19911 ------- 37290 68.0
2086 ------- 38491 77.0
19909 ------- 37243 55.0
21863 ------- 22234 79.0
8095 ------- 69864 65.0
8094 ------- 38848 69.0
8090 ------- 25366 79.0
8088 ------- 36788 82.0
8087 ------- 96209 64.0
21857 ------- 38700 83.0
10602 ------- 32749 84.0
10600 ------- 24456 78.0
2084 ------- 24741 76.0
5128 ------- 111068 59.0
5129 ------- 37950 72.0
5130 ------- 37756 78.0
5131 ------- 39968 64.0
5137 ------- 34037 78.0
2078 ------- 30960 87.0
2083 ------- 30829 90.0
5132 ------- 35410 77.0
5134 ------- 41238 65.0
19915 ------- 133997 64.0
5135 ------- 26347 77.0
8093 ------- 30496 81.0
21859 ------- 38526 70.0
21860 ------- 33892 74.0
21862 ------- 75310 71.0
21858 ------- 32764 84.0
5136 ------- 51371 69.0
10604 ------- 92666 75.0
8089 ------- 23819 76.0
5133 ------- 26312 70.0
10603 ------- 35421 75.0
10601 ------- 32747 79.0
10599 ------- 46554 83.0
10598 ------- 37520 72.0
10597 ------- 38822 80.0
10596 ------- 27690

In [14]:
match_copy_test.head(100)


Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
21888,21889,21518,21518,2008/2009,9,2008-01-11,530224,9864,8634,1,4,68.0,56.0,68.0,62.0,69.0,76.0,50.0,67.0,58.0,73.0,63.0,82.0,85.0,73.0,85.0,88.0,86.0,82.0,60.0,86.0,89.0,84.0,9.0,4.5,1.36,8.1,3.75,1.4,7.0,3.6,1.45,6.0,3.75,1.44,,,,8.0,3.75,1.36,9.0,4.5,1.36,8.0,4.2,1.4,8.5,4.0,1.4,7.5,3.75,1.44
1756,1757,1729,1729,2008/2009,11,2008-01-11,489150,10194,9825,2,1,74.0,68.0,76.0,58.0,61.0,77.0,73.0,65.0,65.0,63.0,67.0,75.0,76.0,86.0,76.0,77.0,75.0,86.0,72.0,74.0,79.0,73.0,8.0,4.5,1.4,7.5,4.0,1.4,6.0,3.9,1.45,8.0,3.75,1.36,,,,8.0,4.5,1.4,7.5,4.33,1.44,8.0,4.8,1.4,8.0,4.0,1.45,8.5,4.0,1.4
1755,1756,1729,1729,2008/2009,11,2008-01-11,489149,8462,8528,1,2,81.0,75.0,79.0,81.0,74.0,77.0,73.0,79.0,63.0,76.0,84.0,77.0,76.0,72.0,69.0,69.0,76.0,67.0,71.0,78.0,77.0,71.0,1.91,3.4,4.2,1.8,3.3,4.15,1.8,3.2,4.0,1.91,3.2,3.5,,,,2.0,3.3,3.8,1.91,3.25,4.33,1.95,3.45,4.1,1.95,3.3,4.0,1.91,3.4,3.75
1754,1755,1729,1729,2008/2009,11,2008-01-11,489148,8659,8655,2,2,77.0,68.0,71.0,70.0,69.0,72.0,71.0,73.0,70.0,72.0,68.0,81.0,70.0,78.0,79.0,58.0,77.0,58.0,74.0,74.0,76.0,86.0,2.5,3.3,2.88,2.35,3.15,2.8,2.5,3.2,2.5,2.38,3.2,2.6,,,,2.4,3.3,2.9,2.5,3.25,2.8,2.55,3.35,2.8,2.5,3.3,2.75,2.5,3.2,2.7
1751,1752,1729,1729,2008/2009,11,2008-01-11,489145,8549,8654,1,1,61.0,63.0,75.0,70.0,66.0,77.0,75.0,71.0,82.0,77.0,84.0,77.0,77.0,73.0,79.0,75.0,77.0,51.0,73.0,78.0,79.0,66.0,2.0,3.3,4.0,2.05,3.15,3.35,1.9,3.3,3.5,1.91,3.25,3.4,,,,2.05,3.3,3.6,2.0,3.25,4.0,1.95,3.35,4.2,2.0,3.3,3.75,1.91,3.4,3.75
1750,1751,1729,1729,2008/2009,11,2008-01-11,489144,8668,9879,1,0,79.0,78.0,80.0,73.0,79.0,76.0,73.0,73.0,82.0,78.0,81.0,77.0,67.0,71.0,77.0,74.0,78.0,80.0,72.0,77.0,76.0,85.0,1.67,3.75,5.5,1.7,3.35,4.65,1.7,3.4,4.3,1.73,3.2,4.33,,,,1.73,3.5,5.0,1.73,3.3,5.5,1.65,3.7,5.8,1.73,3.5,4.75,1.7,3.4,5.0
1749,1750,1729,1729,2008/2009,11,2008-01-11,489143,8586,8650,2,1,81.0,72.0,84.0,82.0,72.0,83.0,80.0,77.0,52.0,71.0,85.0,84.0,74.0,88.0,83.0,60.0,84.0,89.0,80.0,84.0,79.0,84.0,3.5,3.3,2.1,3.65,3.2,1.95,3.2,3.1,2.1,3.2,3.2,2.0,,,,3.5,3.3,2.1,3.6,3.4,2.05,3.35,3.5,2.15,3.6,3.35,2.05,3.4,3.2,2.1
1748,1749,1729,1729,2008/2009,11,2008-01-11,489142,10260,8667,4,3,85.0,80.0,88.0,85.0,78.0,91.0,83.0,82.0,83.0,85.0,90.0,68.0,74.0,60.0,67.0,55.0,68.0,71.0,79.0,78.0,76.0,71.0,1.17,7.0,17.0,1.17,6.0,13.0,1.17,5.7,12.0,1.2,5.0,11.0,,,,1.17,6.5,19.0,1.17,6.5,19.0,1.17,7.0,19.0,1.18,6.0,17.0,1.17,6.5,13.0
7830,7831,7809,7809,2008/2009,11,2008-01-11,499411,8697,8177,5,1,82.0,71.0,87.0,83.0,79.0,76.0,86.0,71.0,88.0,79.0,79.0,,73.0,84.0,80.0,67.0,64.0,72.0,68.0,74.0,81.0,82.0,1.75,3.5,4.75,1.7,3.4,4.6,1.8,3.2,4.0,1.67,3.4,4.0,,,,1.67,3.4,4.33,1.62,3.75,5.5,1.7,3.4,4.35,1.7,3.4,4.75,1.7,3.4,4.5
7831,7832,7809,7809,2008/2009,11,2008-01-11,499412,8398,10189,0,2,72.0,67.0,71.0,74.0,67.0,72.0,64.0,66.0,71.0,62.0,75.0,80.0,81.0,78.0,88.0,77.0,73.0,81.0,72.0,75.0,83.0,79.0,4.5,3.5,1.8,4.8,3.3,1.7,4.0,3.2,1.8,4.0,3.4,1.73,,,,4.33,3.4,1.67,4.75,3.4,1.8,4.35,3.4,1.7,4.5,3.4,1.75,4.5,3.4,1.7
