In [6]:
pip install pandas

Collecting pandas
  Using cached pandas-1.2.2-cp38-cp38-macosx_10_9_x86_64.whl (10.5 MB)
Collecting numpy>=1.16.5
  Using cached numpy-1.20.1-cp38-cp38-macosx_10_9_x86_64.whl (16.0 MB)
Installing collected packages: numpy, pandas
Successfully installed numpy-1.20.1 pandas-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import datetime
import os
import warnings

import numpy as np
import pandas as pd

# NOTE(review): called without a category or module, this silences every warning
# for the rest of the session — consider narrowing it so real problems stay visible.
warnings.filterwarnings('ignore')

In [2]:
# Import only the helpers this notebook calls, so their origin is explicit.
from transform import (
    apply_match_points,
    apply_championship_score,
    calcule_championship_position,
)

# Players

In [3]:
# Filename prefix shared by the per-round CSVs; the round number and ".csv"
# are appended to it when the files are read.
matches_2020_path = "../data/raw/2020/rodada-"

In [4]:
def read_all_files(path, first_round=1, last_round=37):
    """Read one CSV per round and return them concatenated into one DataFrame.

    Parameters
    ----------
    path : str
        Filename prefix; the file for round ``n`` is ``path + "n.csv"``.
    first_round, last_round : int, optional
        Inclusive range of round numbers to load. The defaults (1 to 37)
        are the rounds this notebook has always read.

    Returns
    -------
    pandas.DataFrame
        The rows of every round file stacked in round order. Each file keeps
        its own row labels, so index values repeat across rounds.

    Raises
    ------
    ValueError
        If the requested range contains no rounds.
    """
    if last_round < first_round:
        raise ValueError(
            "last_round ({}) must be >= first_round ({})".format(last_round, first_round)
        )

    round_frames = [
        pd.read_csv("{prefix}{round_number}.csv".format(prefix=path, round_number=round_number))
        for round_number in range(first_round, last_round + 1)
    ]
    return pd.concat(round_frames)

In [5]:
# Load every round file found under the 2020 prefix into a single frame.
df_rounds = read_all_files(matches_2020_path)

In [6]:
# Remove players who did not play the match: rows where every one of the
# scout columns listed below is missing.
df_rounds = df_rounds.dropna(
    how='all',
    subset=[
        'FF', 'FS', 'G', 'PI', 'CA', 'FC', 'DS', 'FT', 'DD',
        'GS', 'FD', 'GC', 'SG', 'A', 'I', 'CV', 'PP', 'DP',
    ],
)

In [7]:
# Keep only the columns that are carried into the cleaned players table.
df_useful_columns = df_rounds[[
    'atletas.atleta_id',
    'atletas.clube_id',
    'atletas.rodada_id',
    'atletas.posicao_id',
    'atletas.preco_num',
    'atletas.pontos_num',
    'atletas.variacao_num',
    'atletas.media_num',
    'A', 'G', 'FD', 'FF', 'FT', 'PI', 'PP', 'DS', 'DD', 'DP', 'GS',
]]
# Column DS == tackle ("Desarme")
# Column PI == incomplete pass ("Passe Incompleto")

In [8]:
# Rename the raw column names to the ones used in our documentation.
columns_map = {
    "atletas.atleta_id": "id_player",
    "atletas.clube_id": "id_team",
    "atletas.rodada_id": "round",
    "atletas.posicao_id": "position",
    "atletas.preco_num": "cartola_price",
    "atletas.variacao_num": "cartola_price_update",
    "atletas.pontos_num": "cartola_score",
    "atletas.media_num": "cartola_score_mean",
    "A": "goal_assistance",
    "G": "scored_goals",
    "FD": "saved_kicks",
    "FF": "wrong_kicks",
    "FT": "crossbar_kicks",
    "PI": "wrong_passes",
    "PP": "missed_penalties",
    "DS": "stolen_ball",
    "DD": "difficult_gk_saves",
    "DP": "gk_penalty_saves",
    "GS": "conceded_goals",
}

df_players_final = df_useful_columns.rename(columns=columns_map)

In [9]:
# Show the players table after column selection and renaming.
df_players_final

Unnamed: 0,id_player,id_team,round,position,cartola_price,cartola_score,cartola_price_update,cartola_score_mean,goal_assistance,scored_goals,saved_kicks,wrong_kicks,crossbar_kicks,wrong_passes,missed_penalties,stolen_ball,difficult_gk_saves,gk_penalty_saves,conceded_goals
3,70986,293,1,mei,12.90,9.8,4.90,9.80,,1.0,,1.0,,,,,,,
5,101832,294,1,mei,1.23,0.7,0.23,0.70,,,,1.0,,1.0,,,,,
6,79578,294,1,ata,2.46,-2.6,-3.54,-2.60,,,,,,1.0,,,,,
8,82634,294,1,mei,7.90,5.9,2.90,5.90,,,,,1.0,6.0,,3.0,,,
11,38509,262,1,gol,14.21,5.6,0.21,5.60,,,,,,9.0,,,2.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,107252,290,36,lat,0.99,0.0,0.00,1.00,,,1.0,,,2.0,,,,,
795,105411,265,36,ata,0.89,0.5,-0.11,0.50,,,,,,,,,,,
797,106076,290,36,lat,3.41,6.0,-0.62,3.29,1.0,,3.0,7.0,,30.0,,25.0,,,
798,74159,285,36,ata,6.99,0.0,0.00,2.28,1.0,3.0,6.0,10.0,,40.0,,16.0,,,


# Teams

In [10]:
# Location of the 2020 matches CSV, relative to this notebook.
team_matches_2020_path = "../data/raw/2020/2020_partidas.csv"

In [11]:
# Load the 2020 matches file into a DataFrame.
matches_2020_df = pd.read_csv(team_matches_2020_path)

In [12]:
# Peek at the first rows of the raw matches table.
matches_2020_df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,round
0,2020-08-09,262,282,0.0,1.0,1
1,2020-09-30,263,265,1.0,2.0,12
2,2020-08-09,275,267,,,1
3,2020-08-09,277,280,1.0,1.0,1
4,2020-09-30,264,373,0.0,0.0,12


In [13]:
# Remove matches without scores. With how='all' a row is dropped only when
# BOTH scores are missing.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the following cells are unambiguous and no longer trigger
# pandas' SettingWithCopyWarning.
df_matches_without_nan_scores = (
    matches_2020_df
    .dropna(subset=['home_score', 'away_score'], how='all')
    .copy()
)

In [14]:
# Convert the 'date' column from 'YYYY-MM-DD' strings to datetimes in one
# vectorized call. Unlike a row-wise strptime, this cell is safe to re-run:
# values that are already datetimes are accepted as they are.
df_matches_without_nan_scores["date"] = pd.to_datetime(
    df_matches_without_nan_scores["date"], format='%Y-%m-%d'
)

In [15]:
# Create 'week_day' column (Monday=0 ... Sunday=6) using the vectorized
# datetime accessor instead of calling .weekday() once per row.
df_matches_without_nan_scores["week_day"] = df_matches_without_nan_scores["date"].dt.weekday

In [16]:
# Create has_won column
def col_has_won(row, col_score_1, col_score_2):
    """Tell whether the first score in ``row`` strictly beats the second.

    ``row`` is anything indexable by column name (for example a DataFrame
    row); ``col_score_1`` and ``col_score_2`` are the keys of the two scores.
    Returns True only when ``row[col_score_1] > row[col_score_2]``; a tie
    gives False.
    """
    return bool(row[col_score_1] > row[col_score_2])
    
# A side has won when its score is strictly greater than the opponent's.
# The comparison is done on whole columns rather than by calling col_has_won
# once per row; the resulting boolean columns are the same.
df_matches_without_nan_scores["home_has_won"] = (
    df_matches_without_nan_scores["home_score"] > df_matches_without_nan_scores["away_score"]
)
df_matches_without_nan_scores["away_has_won"] = (
    df_matches_without_nan_scores["away_score"] > df_matches_without_nan_scores["home_score"]
)

In [17]:
# Create draw column
def col_is_draw(row):
    """Tell whether the match in ``row`` ended level.

    ``row`` is anything indexable by the keys "home_score" and "away_score".
    Returns True only when the two scores compare equal.
    """
    return bool(row["home_score"] == row["away_score"])
    
# A match was a draw when both scores are equal. Compared on whole columns
# rather than by calling col_is_draw once per row; the result is the same.
df_matches_without_nan_scores["was_draw"] = (
    df_matches_without_nan_scores["home_score"] == df_matches_without_nan_scores["away_score"]
)

In [18]:
# Create the was_home_team flags: constant True for the home side's column,
# constant False for the away side's column.
for flag_column, hosted in (("home_was_home_team", True), ("away_was_home_team", False)):
    df_matches_without_nan_scores[flag_column] = hosted

In [19]:
def _team_perspective(matches, side, opponent):
    """Return ``matches`` seen from one side, with generic column names.

    ``side`` and ``opponent`` are the column prefixes ("home" or "away").
    The side-specific columns are selected and renamed to id_team,
    id_opponent_team, has_won, team_goals, opponent_team_goals and
    was_home_team; round, week_day, date and was_draw keep their names.
    """
    selected = [
        f"{side}_team", f"{opponent}_team", "round", "week_day", "date",
        f"{side}_has_won", "was_draw", f"{side}_score", f"{opponent}_score",
        f"{side}_was_home_team",
    ]
    generic_names = {
        f"{side}_team": "id_team",
        f"{opponent}_team": "id_opponent_team",
        f"{side}_has_won": "has_won",
        f"{side}_score": "team_goals",
        f"{opponent}_score": "opponent_team_goals",
        f"{side}_was_home_team": "was_home_team",
    }
    return matches[selected].rename(columns=generic_names)


home_teams = _team_perspective(df_matches_without_nan_scores, "home", "away")
away_teams = _team_perspective(df_matches_without_nan_scores, "away", "home")

# One row per (match, team): home rows first, then away rows. Row labels are
# kept from the matches frame, so each label appears twice.
df_teams = pd.concat([home_teams, away_teams])

In [20]:
# Each step feeds the previous result into a helper that is not defined in this
# notebook (presumably provided by the `transform` star import — TODO confirm).
# Judging by the columns of the displayed result, they add match_points,
# championship_score and championship_position — verify against the helpers' source.
# NOTE(review): df_teams is overwritten on the first line, so re-running this cell
# passes the already-processed frame back into apply_match_points.
df_teams = apply_match_points(df_teams)
df_teams_score = apply_championship_score(df_teams)
df_teams_final = calcule_championship_position(df_teams_score)

In [21]:
# Show the final per-team match table.
df_teams_final

Unnamed: 0,id_team,id_opponent_team,round,week_day,date,has_won,was_draw,team_goals,opponent_team_goals,was_home_team,match_points,championship_score,championship_position
0,262,282,1,6,2020-08-09,False,False,0.0,1.0,True,0,0,8.0
1,263,265,12,2,2020-09-30,False,False,1.0,2.0,True,0,11,18.0
3,277,280,1,6,2020-08-09,False,True,1.0,1.0,True,1,1,6.0
4,264,373,12,2,2020-09-30,False,True,0.0,0.0,True,1,13,14.0
5,284,266,1,6,2020-08-09,True,False,1.0,0.0,True,3,3,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,282,265,17,0,2020-10-19,False,False,1.0,3.0,False,0,31,3.0
167,277,294,17,5,2020-10-17,True,False,2.0,1.0,False,3,27,5.0
168,275,356,17,6,2020-10-18,False,False,0.0,2.0,False,0,22,8.0
169,293,373,17,5,2020-10-17,False,True,1.0,1.0,False,1,16,21.0


In [22]:
players_clean_path = "../data/clean/players/"

# Create the output directory if needed, so the export does not fail on a
# fresh checkout where it does not exist yet.
os.makedirs(players_clean_path, exist_ok=True)
df_players_final.to_csv(players_clean_path + "2020.csv", index=False)

In [23]:
teams_clean_path = "../data/clean/teams/"

# Create the output directory if needed, so the export does not fail on a
# fresh checkout where it does not exist yet.
os.makedirs(teams_clean_path, exist_ok=True)
df_teams_final.to_csv(teams_clean_path + "2020.csv", index=False)