In [1]:
import pandas as pd
import numpy as np

## <font color=black>Leitura de Dados</font> 

In [2]:
# Read all files 
country = pd.read_csv('Country_Original.csv')
league = pd.read_csv('League_Original.csv')
match = pd.read_csv('Match_Original.csv')
player = pd.read_csv('Player_Original.csv')
player_attributes = pd.read_csv('Player_Attributes_Original.csv')
team = pd.read_csv('Team_Original.csv')
team_attributes = pd.read_csv('Team_Attributes_Original.csv')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## <font color=black>Organização e Limpeza de Dados</font> 

Pretendemos nesta secção do trabalho reduzir e tratar os dados de forma a que não exista informação irrelevante para o modelo que pretendemos construir.

### <font color=black> - Eliminação de Valores Nulos</font>

In [3]:
# Eliminar atributos relativos à posição dos jogadores
player_attributes.drop(player_attributes.columns.to_series()["potential":], axis=1, inplace = True)


In [4]:
# Eliminar duplicados caso existam
player_attributes.drop_duplicates(keep = False, inplace = True)

In [5]:
# Verficar se existe overall_ratings null
player_attributes.dropna(axis=0, how="any", subset =['overall_rating'], inplace=True)
player_attributes['overall_rating'].isnull().sum()

0

In [6]:
# Eliminar atributos relativos à posição dos jogadores
match.drop(match.columns.to_series()["home_player_X1":"away_player_Y11"], axis=1, inplace = True)
# Eliminar os seguintes atributos -> shoton, shotoff, foulcommit, card, cross, corner, possession
match.drop(match.columns.to_series()["goal":"possession"], axis=1,inplace = True)

In [7]:
#Organizar dados por datas, do passado para o presente
match['date'] = match['date'].astype('datetime64[ns]')
match = match.sort_values(by=['date'], ascending=True)

In [8]:
#Eliminar todos as linhas que não tenham nenhuma info sobre os jogadores
match = match.dropna(axis=0, how="all", subset=match.columns.to_series()["home_player_1":"away_player_11"])

In [9]:
#Eliminar todas as linhas que não tenham nenhuma odd relativa ao jogo
match.dropna(axis=0, how="all", subset=match.columns.to_series()["B365H":], inplace=True)

In [10]:
#Eliminar partidas em que não são conhecidos pelo menos 8 jogadores que jogaram (para a Equipa da Casa)
match = match.dropna(axis=0, thresh=8, subset=match.columns.to_series()["home_player_1":"home_player_11"])
#Verificação
match.loc[match.loc[:,'home_player_1':'home_player_11'].count(axis=1) < 8]

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA


In [11]:
#Eliminar partidas em que não são conhecidos pelo menos 8 jogadores que jogaram (para a Equipa de Fora)
match = match.dropna(axis=0, thresh=8, subset=match.columns.to_series()["away_player_1":"away_player_11"])
#Verificação
match.loc[match.loc[:,'away_player_1':'away_player_11'].count(axis=1) < 8]

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA


### <font color=black> - Substituição de Valores Nulos </font>

In [12]:
#Passar o formato dos atributos 'date' para formato de tempo
match['date'] = match['date'].astype('datetime64[ns]') 
player_attributes['date'] = player_attributes['date'].astype('datetime64[ns]') 


In [13]:
match = pd.append(player['player_api_id','heigth','weigth'], how='left', left_on='home_player_1', right_on='player_api_id')

KeyError: ('player_api_id', 'heigth', 'weigth')

In [None]:
match.head()

In [None]:
#Cópia do dataframe original
match_copy_test=  match.copy()

match_copy_test = match_copy_test.head(100)


for i in range(1,12):
    for index, row in match_copy_test.iterrows():
        
        if(row["home_player_"+"%s" % i] == np.nan):
            continue
        if(row["away_player_"+"%s" % i]== np.nan):
            continue
        try:    
            player_skill_home = player_attributes.loc[player_attributes['player_api_id'] == row["home_player_"+"%s" % i]]
            date_season_home = player_skill_home.loc[player_skill_home['date'] <= row['date']]

            recent_date_home = date_season_home.loc[[date_season_home.date.idxmax()]]
            match_copy_test.at[row['id']-1,'home_player_'+'%s' % i] = recent_date_home['overall_rating']

            player_skill_away = player_attributes.loc[player_attributes['player_api_id'] == row["away_player_"+"%s" % i]]
            date_season_away = player_skill_away.loc[player_skill_away['date'] <= row['date']]

            recent_date_away = date_season_away.loc[[date_season_away.date.idxmax()]]
            match_copy_test.at[row['id']-1,'away_player_'+'%s' % i] = recent_date_away['overall_rating'].tolist()[0]
            
            #print(row['id']-1,"-------",recent_date_away['player_api_id'].tolist()[0] ,recent_date_away['overall_rating'].tolist()[0])
                
        except ValueError:
            pass
  
    
    
#Put away players rating in match    
#for i in range(1,12):
    #result_away = pd.merge(match, player_attributes[['player_api_id','overall_rating']], left_on="away_player_"+"%s" % i, right_on='player_api_id', how='left')
    #match["away_player_"+"%s" % i] = result_away["overall_rating"]

In [None]:
# Dada uma lista de timeStamps, retorna o valor mais baixo!

def getTimeStampsMinValue(df):
    aux = df.tolist()[0]
    for i in range (0, df.size):
        if(aux > df.tolist()[i]):
            aux = df.tolist()[i]
    return aux;

In [None]:
# Dada uma lista de todas as datas, escolhe-se a data mais recente, da perspetiva da data do jogo
# isto é, escolhe-se a data mais recente, sendo que esta tem de ser menor ou igual da data do jogo

def getMostRecentMatchDate(dates, dates_length, match_date):
    aux_date = getTimeStampsMinValue(dates)
    for i in range(0,dates_length):
        if dates.tolist()[i] <= match_date:
            if dates.tolist()[i] > aux_date:
                aux_date = dates.tolist()[i]
    return aux_date    

In [None]:
print(match.iloc[0]["date"])

In [None]:
# Dada a data mais recente, vai-se buscar o respetivo "overall_rating" do jogador que se atribuir (Fora)
def getMostRecentOverallFromAwayPlayer(index, player_number):
    if pd.isna(match.iloc[index]["away_player_" + "%s" % player_number]):
            return 
    new = player_attributes["player_api_id"].isin([match.iloc[index]["away_player_" + "%s" % player_number]])
    dates_length = player_attributes[new]["date"].size
    ai = getMostRecentMatchDate(player_attributes[new]["date"], dates_length, match.iloc[index]["date"])
    player_attributes[new].loc[player_attributes["date"] == ai]
    overall_rating = player_attributes[new].loc[player_attributes["date"] == ai]["overall_rating"].tolist()[0]
    return overall_rating

In [None]:
# Dada a data mais recente, vai-se buscar o respetivo "overall_rating" do jogador que se atribuir (Casa)
def getMostRecentOverallFromHomePlayer(index, player_number):
    if pd.isna(match.iloc[index]["home_player_" + "%s" % player_number]):
            return 
    new = player_attributes["player_api_id"].isin([match.iloc[index]["home_player_" + "%s" % player_number]])
    dates_length = player_attributes[new]["date"].size
    ai = getMostRecentMatchDate(player_attributes[new]["date"], dates_length, match.iloc[index]["date"])
    player_attributes[new].loc[player_attributes["date"] == ai]
    overall_rating = player_attributes[new].loc[player_attributes["date"] == ai]["overall_rating"].tolist()[0]
    return overall_rating

In [None]:
# Dada uma lista com todas as ratings de uma equipa, para um jogo, calcula-se a media, ignorando os null values
def calculateAverageOverallRating(teamRatings):
    numberOfValidArguments = 0
    totalSum = 0
    for i in range(0,len(teamRatings)):
        if pd.isna(teamRatings[i]):
             continue
        else:
            numberOfValidArguments += 1
            totalSum += teamRatings[i]    
    return round(totalSum/numberOfValidArguments, 1)


In [None]:
def generateDataframeWithAverageOveralls():
    newListBig = []
    match_depth = match.shape[0]
    for index in range (0, 100):
    #for index in range (0, match_depth):
        newListHome = []
        newListAway = []
        for i in range(1,12):
                newListHome.append(getMostRecentOverallFromHomePlayer(index,i))
                newListAway.append(getMostRecentOverallFromAwayPlayer(index,i))

        newListBig.append([match.id.tolist()[index],calculateAverageOverallRating(newListHome) ,calculateAverageOverallRating(newListAway)])


    new_dataset = pd.DataFrame(newListBig, columns =[ "match_id","home_overall","away_overall"])
    display(new_dataset)

In [None]:
#generateDataframeWithAverageOveralls()

In [None]:
#new_dataset.to_csv("OverallAverages.csv", index=False)

In [None]:
print(match.shape[0])

In [None]:
def analyseHomePlayer(row, number, index):
    new = player["player_api_id"].isin([row["home_player_"+"%s" % number]])
    new2 = player_attributes["player_api_id"].isin(
        [row["home_player_"+"%s" % number]])

    return player_attributes[new2]['overall_rating'].tolist()[index]

def analyseAwayPlayer(row, number, index):
    new = player["player_api_id"].isin([row["away_player_"+"%s" % number]])
    new2 = player_attributes["player_api_id"].isin(
        [row["away_player_"+"%s" % number]])

    return player_attributes[new2]['overall_rating'].tolist()[index]


def analyseHomePlayers(row, index):
    sum = 0
    for i in range(1, 12):
       sum += analyseHomePlayer(row, i, index)
    return sum


def analyseAwayPlayers(row):
    sum = 0
    for i in range(1, 12):
        sum += analyseAwayPlayer(row, i, index)
    return sum


In [None]:
def analyseHomePlayersHeight(row):
    sum = 0
    for i in range(1, 12):
       new = player["player_api_id"].isin([row["home_player_"+"%s" % i]])
       sum += player[new]["height"].tolist()[0]
    return round(sum/11, 1)


def analyseAwayPlayersHeight(row):
    sum = 0
    for i in range(1, 12):
       new = player["player_api_id"].isin([row["away_player_"+"%s" % i]])
       sum += player[new]["height"].tolist()[0]
    return round(sum/11, 1)


def analyseHomePlayersWeight(row):
    sum = 0
    for i in range(1, 12):
       new = player["player_api_id"].isin([row["home_player_"+"%s" % i]])
       sum += player[new]["weight"].tolist()[0]
    return round(sum/11, 1)


def analyseAwayPlayersWeight(row):
    sum = 0
    for i in range(1, 12):
       new = player["player_api_id"].isin([row["away_player_"+"%s" % i]])
       sum += player[new]["weight"].tolist()[0]
    return round(sum/11, 1)

In [None]:
def analyseTeamOdd(row, string, index):
    #10 websites
    sum = 0
    sum += match["B365"+"%s" % string].tolist()[index]
    sum += match["BW"+"%s" % string].tolist()[index]
    sum += match["IW"+"%s" % string].tolist()[index]
    sum += match["LB"+"%s" % string].tolist()[index]
    sum += match["PS"+"%s" % string].tolist()[index]
    sum += match["WH"+"%s" % string].tolist()[index]
    sum += match["SJ"+"%s" % string].tolist()[index]
    sum += match["VC"+"%s" % string].tolist()[index]
    sum += match["GB"+"%s" % string].tolist()[index]
    sum += match["BS"+"%s" % string].tolist()[index]
    return round(sum/10, 2)

In [None]:
def analyseOffenseRates(row,index,string):
    new = team_attributes["team_api_id"].isin([row["%s" %string]])
    passing = team_attributes[new]["chanceCreationPassing"].tolist()[0]
    crossing = team_attributes[new]["chanceCreationCrossing"].tolist()[0]
    shooting = team_attributes[new]["chanceCreationShooting"].tolist()[0]
    return round((passing+shooting+crossing)/3, 1)

def analyseCenterRates(row,index,string):
    new = team_attributes["team_api_id"].isin([row["%s" % string]])
    dribling = team_attributes[new]["buildUpPlayDribbling"].tolist()[0]
    BUpassing = team_attributes[new]["buildUpPlayPassing"].tolist()[0]
    speed = team_attributes[new]["buildUpPlaySpeed"].tolist()[0]
    return round((dribling+BUpassing+speed)/3, 1)

def analyseDefenceRates(row,index,string):
    new = team_attributes["team_api_id"].isin([row["%s" % string]])
    agression = team_attributes[new]["defenceAggression"].tolist()[0]
    width = team_attributes[new]["defenceTeamWidth"].tolist()[0]
    pressure = team_attributes[new]["defencePressure"].tolist()[0]
    return round((agression+width+pressure)/3, 1)

In [None]:
def analyseDifGoals(row, index):
    return match["home_team_goal"].tolist()[index] - match["away_team_goal"].tolist()[index]

In [None]:
'''def generateDataframeWithAverageOveralls1():
    newListBig = []
    match_depth = match.shape[0]
    for index in range (0, 100):
    #for index in range (0, match_depth):
        newListHome = []
        newListAway = []
        for i in range(1,12):
                newListHome.append(getMostRecentOverallFromHomePlayer(index,i))
                newListAway.append(getMostRecentOverallFromAwayPlayer(index,i))

        newListBig.append([match.id.tolist()[index],calculateAverageOverallRating(newListHome) ,calculateAverageOverallRating(newListAway),
                            analyseDifGoals(match.iloc[index], index),
                            analyseOffenseRates(match.iloc[index],index, "home_team_api_id"),
                            analyseCenterRates(match.iloc[index],index, "home_team_api_id"),
                            analyseDefenceRates(match.iloc[index],index, "home_team_api_id"),
            
                              ])


    new_dataset = pd.DataFrame(newListBig, columns =[ "match_id","home_overall","away_overall", "dif_goals", 
                                                    'home_team_offense_rates','home_team_center_rates','home_team_defense_rates'
                                                   ])
    display(new_dataset)'''''