In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


In [6]:
# Carregar dataset
df = pd.read_csv("understat_per_game.csv")

# Separar jogos de casa e fora
home = df[df["h_a"] == "h"].copy()
away = df[df["h_a"] == "a"].copy()


In [7]:
# Juntar os dois lados da partida
matches = home.merge(
    away,
    on=["date", "league", "year"],
    suffixes=("_home", "_away")
)

print("Formato do dataframe final:", matches.shape)
matches.head()


Formato do dataframe final: (32762, 55)


Unnamed: 0,league,year,h_a_home,xG_home,xGA_home,npxG_home,npxGA_home,deep_home,deep_allowed_home,scored_home,...,ppda_coef_away,ppda_att_away,ppda_def_away,oppda_coef_away,oppda_att_away,oppda_def_away,team_away,xG_diff_away,xGA_diff_away,xpts_diff_away
0,Bundesliga,2014,h,2.57012,1.19842,2.57012,1.19842,5,4,2,...,21.85,437,20,9.625,231,24,Wolfsburg,0.19842,0.57012,0.47
1,Bundesliga,2014,h,1.22987,0.310166,1.22987,0.310166,13,3,2,...,7.954545,175,22,4.243902,174,41,Mainz 05,-0.63357,0.71631,-1.1342
2,Bundesliga,2014,h,1.22987,0.310166,1.22987,0.310166,13,3,2,...,12.2,183,15,6.307692,246,39,Wolfsburg,0.63553,0.15845,0.7179
3,Bundesliga,2014,h,1.22987,0.310166,1.22987,0.310166,13,3,2,...,16.961538,441,26,5.060606,167,33,VfB Stuttgart,0.310166,-0.77013,0.5502
4,Bundesliga,2014,h,1.22987,0.310166,1.22987,0.310166,13,3,2,...,6.969697,230,33,13.4,335,25,FC Cologne,0.386458,0.327136,0.3469


In [8]:
# Alvos: gols do mandante e visitante
y_home = matches["scored_home"]
y_away = matches["scored_away"]

# Remover colunas que não podem ser usadas como features
drop_cols = [
    "scored_home", "missed_home", "result_home", "team_home",
    "scored_away", "missed_away", "result_away", "team_away",
    "date", "league", "year", "h_a_home", "h_a_away"
]

X = matches.drop(columns=drop_cols)
X.head()


Unnamed: 0,xG_home,xGA_home,npxG_home,npxGA_home,deep_home,deep_allowed_home,xpts_home,wins_home,draws_home,loses_home,...,npxGD_away,ppda_coef_away,ppda_att_away,ppda_def_away,oppda_coef_away,oppda_att_away,oppda_def_away,xG_diff_away,xGA_diff_away,xpts_diff_away
0,2.57012,1.19842,2.57012,1.19842,5,4,2.3486,1,0,0,...,-1.3717,21.85,437,20,9.625,231,24,0.19842,0.57012,0.47
1,1.22987,0.310166,1.22987,0.310166,13,3,2.1588,1,0,0,...,1.407896,7.954545,175,22,4.243902,174,41,-0.63357,0.71631,-1.1342
2,1.22987,0.310166,1.22987,0.310166,13,3,2.1588,1,0,0,...,0.47708,12.2,183,15,6.307692,246,39,0.63553,0.15845,0.7179
3,1.22987,0.310166,1.22987,0.310166,13,3,2.1588,1,0,0,...,-0.919704,16.961538,441,26,5.060606,167,33,0.310166,-0.77013,0.5502
4,1.22987,0.310166,1.22987,0.310166,13,3,2.1588,1,0,0,...,0.059322,6.969697,230,33,13.4,335,25,0.386458,0.327136,0.3469


In [9]:
# Dividir em treino e teste
X_train, X_test, y_home_train, y_home_test = train_test_split(X, y_home, test_size=0.2, shuffle=False)
_, _, y_away_train, y_away_test = train_test_split(X, y_away, test_size=0.2, shuffle=False)

print("Treino:", X_train.shape, " | Teste:", X_test.shape)


Treino: (26209, 42)  | Teste: (6553, 42)


In [10]:
# Modelo para gols da casa
model_home = RandomForestRegressor(n_estimators=200, random_state=42)
model_home.fit(X_train, y_home_train)

# Modelo para gols do visitante
model_away = RandomForestRegressor(n_estimators=200, random_state=42)
model_away.fit(X_train, y_away_train)


0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Previsões (arredondadas para inteiros)
pred_home = model_home.predict(X_test).round().astype(int)
pred_away = model_away.predict(X_test).round().astype(int)

# Tabela organizada com times e placares
placares = pd.DataFrame({
    "Jogo": [f"{home} vs {away}" for home, away in zip(matches.loc[X_test.index, "team_home"], 
    matches.loc[X_test.index, "team_away"])],
    "Placar Real": [f"{h} x {a}" for h, a in zip(y_home_test.values, y_away_test.values)],
    "Placar Previsto": [f"{h} x {a}" for h, a in zip(pred_home, pred_away)]
})

placares.head(15)  # Mostra os 15 primeiros jogos


Unnamed: 0,Jogo,Placar Real,Placar Previsto
0,Arsenal Tula vs FK Akhmat,1 x 3,1 x 3
1,Arsenal Tula vs Zenit St. Petersburg,1 x 2,1 x 2
2,Arsenal Tula vs Krylya Sovetov Samara,2 x 4,2 x 4
3,Arsenal Tula vs FC Tambov,2 x 1,2 x 1
4,PFC Sochi vs Zenit St. Petersburg,0 x 2,0 x 2
5,PFC Sochi vs FC Ufa,0 x 0,0 x 0
6,PFC Sochi vs Lokomotiv Moscow,0 x 1,0 x 1
7,PFC Sochi vs FK Akhmat,2 x 0,2 x 0
8,PFC Sochi vs Krylya Sovetov Samara,0 x 2,0 x 2
9,PFC Sochi vs FC Tambov,1 x 2,1 x 2


In [12]:
# Erro médio absoluto
mae_home = mean_absolute_error(y_home_test, pred_home)
mae_away = mean_absolute_error(y_away_test, pred_away)

print(f"Erro médio - Gols Casa: {mae_home:.2f}")
print(f"Erro médio - Gols Fora: {mae_away:.2f}")

# Taxa de acerto do placar exato
exact_hits = ((pred_home == y_home_test.values) & (pred_away == y_away_test.values)).mean()
print(f"Taxa de acerto do placar exato: {exact_hits*100:.2f}%")


Erro médio - Gols Casa: 0.00
Erro médio - Gols Fora: 0.00
Taxa de acerto do placar exato: 99.89%
