In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-1.2.2-cp38-cp38-macosx_10_9_x86_64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 4.3 MB/s eta 0:00:01
Collecting numpy>=1.16.5
  Downloading numpy-1.20.1-cp38-cp38-macosx_10_9_x86_64.whl (16.0 MB)
[K     |████████████████████████████████| 16.0 MB 14.2 MB/s eta 0:00:01
Installing collected packages: numpy, pandas
Successfully installed numpy-1.20.1 pandas-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [21]:
import pandas as pd
import datetime
import warnings

warnings.filterwarnings('ignore')

# Scouts

In [4]:
# Filename prefix of the per-round CSV files; the round number and ".csv"
# are appended to it when the files are read.
matches_2020_path = "data/2020/rodada-"

In [24]:
def read_all_files(path, first_round=1, last_round=37):
    """Read one CSV per round and return all of them concatenated.

    The file for round ``n`` is expected at ``path + "<n>.csv"``.

    Args:
        path: Filename prefix shared by every round file
            (e.g. ``"data/2020/rodada-"``).
        first_round: First round number to read, inclusive. Defaults to 1.
        last_round: Last round number to read, inclusive. Defaults to 37,
            the number of rounds available for this dataset.

    Returns:
        A single DataFrame with the rows of every round, in round order.
        The index of each file is kept as read (it is not reset).

    Raises:
        ValueError: If ``first_round`` is greater than ``last_round``.
        FileNotFoundError: If one of the round files does not exist.
    """
    if first_round > last_round:
        raise ValueError(
            "first_round ({}) must not be greater than last_round ({})".format(
                first_round, last_round
            )
        )

    df_rounds = [
        pd.read_csv(path + "{round_number}.csv".format(round_number=round_number))
        for round_number in range(first_round, last_round + 1)
    ]
    return pd.concat(df_rounds)

In [27]:
# Load every round file into one DataFrame.
df_rounds = read_all_files(matches_2020_path)

In [80]:
# Remove all players that didn't play the match: a row is dropped only when
# every one of the columns listed below is NaN.
df_rounds = df_rounds.dropna(
    how='all',
    subset=[
        'FF', 'FS', 'G', 'PI', 'CA', 'FC', 'DS', 'FT', 'DD',
        'GS', 'FD', 'GC', 'SG', 'A', 'I', 'CV', 'PP', 'DP',
    ],
)

array(['Provável', 'Nulo', 'Contundido', 'Dúvida', 'Suspenso'],
      dtype=object)

In [56]:
# Columns kept for the rest of the analysis, in output order.
useful_columns = [
    'atletas.atleta_id',
    'atletas.clube_id',
    'atletas.rodada_id',
    'atletas.posicao_id',
    'atletas.preco_num',
    'atletas.pontos_num',
    'atletas.variacao_num',
    'atletas.media_num',
    'A', 'G', 'FD', 'FF', 'FT', 'PI', 'PP', 'DS', 'DD', 'DP', 'GS',
]
df_useful_columns = df_rounds[useful_columns]
# Column DS == tackle ("Desarme")
# Column PI == incomplete pass ("Passe Incompleto")

In [63]:
# Rename the columns according to our documentation.
columns_map = {
    "atletas.atleta_id": "id_player",
    "atletas.clube_id": "id_team",
    "atletas.rodada_id": "round",
    "atletas.posicao_id": "position",
    "atletas.preco_num": "cartola_price",
    "atletas.variacao_num": "cartola_price_update",
    "atletas.pontos_num": "cartola_score",
    "atletas.media_num": "cartola_score_mean",
    "A": "goal_assistance",
    "G": "scored_goals",
    "FD": "saved_kicks",
    "FF": "wrong_kicks",
    "FT": "crossbar_kicks",
    "PI": "wrong_passes",
    "PP": "missed_penalties",
    "DS": "stolen_ball",
    "DD": "difficult_gk_saves",
    "DP": "gk_penalty_saves",
    "GS": "conceded_goals",
}

# rename() returns a new frame; df_useful_columns keeps the original names.
df_final = df_useful_columns.rename(columns=columns_map)

In [64]:
# Preview the first 10 rows of the renamed frame.
df_final.head(10)

Unnamed: 0,id_player,id_team,round,position,cartola_price,cartola_score,cartola_price_update,cartola_score_mean,goal_assistance,scored_goals,saved_kicks,wrong_kicks,crossbar_kicks,wrong_passes,missed_penalties,stolen_ball,difficult_gk_saves,gk_penalty_saves,conceded_goals
0,77544,373,1,mei,6.0,0.0,0.0,0.0,,,,,,,,,,,
1,39850,373,1,tec,2.0,0.0,0.0,0.0,,,,,,,,,,,
2,60858,373,1,mei,3.0,0.0,0.0,0.0,,,,,,,,,,,
3,70986,293,1,mei,12.9,9.8,4.9,9.8,,1.0,,1.0,,,,,,,
4,79066,294,1,gol,5.0,0.0,0.0,0.0,,,,,,,,,,,
5,101832,294,1,mei,1.23,0.7,0.23,0.7,,,,1.0,,1.0,,,,,
6,79578,294,1,ata,2.46,-2.6,-3.54,-2.6,,,,,,1.0,,,,,
7,72595,373,1,gol,4.0,0.0,0.0,0.0,,,,,,,,,,,
8,82634,294,1,mei,7.9,5.9,2.9,5.9,,,,,1.0,6.0,,3.0,,,
9,69012,293,1,gol,10.0,0.0,0.0,0.0,,,,,,,,,,,


# Matches

In [4]:
# Path of the CSV file with the 2020 matches.
team_matches_2020_path = "data/2020/2020_partidas.csv"

In [5]:
# Load the matches file into a DataFrame.
matches_2020_df = pd.read_csv(team_matches_2020_path)

In [6]:
# Preview the first rows of the matches frame.
matches_2020_df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,round
0,2020-08-09,262,282,0.0,1.0,1
1,2020-09-30,263,265,1.0,2.0,12
2,2020-08-09,275,267,,,1
3,2020-08-09,277,280,1.0,1.0,1
4,2020-09-30,264,373,0.0,0.0,12


In [23]:
# Remove matches without scores.
# how='all' drops a row only when BOTH scores are missing; a row with a single
# missing score is kept. NOTE(review): confirm this is intended, since NaN
# scores compare as False in the has_won / was_draw columns built later.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not operate on a flagged slice of matches_2020_df
# (the source of SettingWithCopyWarning).
df_matches_without_nan_scores = matches_2020_df.dropna(subset=['home_score', 'away_score'], how='all').copy()

In [24]:
# Convert the 'date' column from string to datetime.
# pd.to_datetime parses the whole column at once instead of calling strptime
# once per row. It also accepts a column that is already datetime, so this
# cell can be re-run safely. Missing dates become NaT instead of raising.
df_matches_without_nan_scores["date"] = pd.to_datetime(df_matches_without_nan_scores["date"], format='%Y-%m-%d')

In [25]:
# Create 'week_day' column: day of the week of the match, Monday=0 ... Sunday=6.
# The .dt accessor computes it for the whole datetime column at once instead
# of calling weekday() row by row.
df_matches_without_nan_scores["week_day"] = df_matches_without_nan_scores["date"].dt.weekday

In [32]:
# Create has_won column
def col_has_won(row, col_score_1, col_score_2):
    """Tell whether the score in ``col_score_1`` beats the one in ``col_score_2``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by column name.
        col_score_1: Name of the column holding the score being evaluated.
        col_score_2: Name of the column holding the opposing score.

    Returns:
        True when ``row[col_score_1]`` is strictly greater than
        ``row[col_score_2]``; False otherwise (including ties and NaN scores).
    """
    return bool(row[col_score_1] > row[col_score_2])
    
# Element-wise comparison of the two score columns. It yields the same values
# as applying col_has_won to every row (NaN scores compare as False), without
# the per-row Python call, and it also works on an empty frame.
df_matches_without_nan_scores["home_has_won"] = df_matches_without_nan_scores["home_score"] > df_matches_without_nan_scores["away_score"]
df_matches_without_nan_scores["away_has_won"] = df_matches_without_nan_scores["away_score"] > df_matches_without_nan_scores["home_score"]

In [36]:
# Create draw column
def col_is_draw(row):
    """Tell whether a match ended in a draw.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            "home_score" and "away_score".

    Returns:
        True when both scores are equal; False otherwise (a NaN score never
        equals anything, so it yields False).
    """
    return bool(row["home_score"] == row["away_score"])
    
# Element-wise equality of the two score columns. It yields the same values
# as applying col_is_draw to every row (NaN never equals NaN, so it gives
# False), without the per-row Python call.
df_matches_without_nan_scores["was_draw"] = df_matches_without_nan_scores["home_score"] == df_matches_without_nan_scores["away_score"]

In [40]:
# Create the was_home_team flags, one constant column per side of the match.
# Each one is renamed to "was_home_team" when the home and away views of the
# matches are built.
df_matches_without_nan_scores["home_was_home_team"] = True
df_matches_without_nan_scores["away_was_home_team"] = False

In [52]:
def _team_perspective(matches, side, other):
    """Return the matches as seen by the team on one side of the fixture.

    Selects the columns that belong to ``side`` and to its opponent ``other``
    and renames them to side-agnostic names, so that the home and the away
    views share the same schema and can be concatenated.

    Args:
        matches: DataFrame with the "<side>_team", "<side>_score",
            "<side>_has_won" and "<side>_was_home_team" columns for both
            sides, plus "round", "week_day", "date" and "was_draw".
        side: Column prefix of the team the row is about ("home" or "away").
        other: Column prefix of the opposing team ("away" or "home").

    Returns:
        A new DataFrame with the columns id_team, id_opponent_team, round,
        week_day, date, has_won, was_draw, team_goals, opponent_team_goals
        and was_home_team, in that order.
    """
    selected = matches[[
        side + "_team", other + "_team", "round", "week_day", "date",
        side + "_has_won", "was_draw",
        side + "_score", other + "_score", side + "_was_home_team",
    ]]
    return selected.rename(columns={
        side + "_team": "id_team",
        other + "_team": "id_opponent_team",
        side + "_has_won": "has_won",
        side + "_score": "team_goals",
        other + "_score": "opponent_team_goals",
        side + "_was_home_team": "was_home_team",
    })


home_teams = _team_perspective(df_matches_without_nan_scores, "home", "away")
away_teams = _team_perspective(df_matches_without_nan_scores, "away", "home")

# Stack both views: every match contributes two rows, one per team.
# The original row labels are kept, so each label appears twice.
teams = pd.concat([home_teams, away_teams])