In [1]:
import pandas as pd
import numpy as np
from scipy.stats import poisson
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [9]:
df = pd.read_csv("dataset.csv")
df['Ball_Possession_Home'] = df['Ball_Possession_Home'].str.replace("%", "").astype(float)
df['Ball_Possession_Host'] = df['Ball_Possession_Host'].str.replace("%", "").astype(float)

  df = pd.read_csv("dataset.csv")


In [10]:
features = [
    'Ball_Possession_Home',
    'Goal_Attempts_Home',
    'Shots_on_Goal_Home',
    'Shots_off_Goal_Home',
    'Blocked_Shots_Home',
    'Corner_Kicks_Home',
    'Free_Kicks_Home',
    'Dangerous_Attacks_Home',
    'Attacks_Home',
    'Goalkeeper_Saves_Home',
]

df['Shot_Accuracy_Home'] = df['Shots_on_Goal_Home'] / (df['Goal_Attempts_Home'] + 1e-3)
features.append('Shot_Accuracy_Home')
df['Shot_Accuracy_Host'] = df['Shots_on_Goal_Host'] / (df['Goal_Attempts_Host'] + 1e-3)

features_away = [f.replace('_Home', '_Host') for f in features]


In [11]:
target_home = 'expected_goals_xg_home'
target_away = 'expected_goals_xg_host'

df_clean = df.dropna(subset=[target_home, target_away])
df_clean[features + features_away] = df_clean[features + features_away].fillna(
    df_clean[features + features_away].mean()
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[features + features_away] = df_clean[features + features_away].fillna(


In [12]:
X_home = df_clean[features]
y_home = df_clean[target_home]

X_away = df_clean[features_away]
y_away = df_clean[target_away]

X_train_home, X_test_home, y_train_home, y_test_home = train_test_split(X_home, y_home, test_size=0.2, random_state=42)
X_train_away, X_test_away, y_train_away, y_test_away = train_test_split(X_away, y_away, test_size=0.2, random_state=42)


In [13]:
model_home = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_home.fit(X_train_home, y_train_home)

model_away = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_away.fit(X_train_away, y_train_away)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
def evaluate(model, X_test, y_test, label=""):
    preds = model.predict(X_test)
    print(f"{label} R²: {r2_score(y_test, preds):.3f}")
    print(f"{label} MAE: {mean_absolute_error(y_test, preds):.3f}")
    print(f"{label} RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.3f}")
    return preds

print("⚽ Hazai csapat predikció:")
preds_home = evaluate(model_home, X_test_home, y_test_home, "Home")

print("\n🛫 Vendég csapat predikció:")
preds_away = evaluate(model_away, X_test_away, y_test_away, "Away")


⚽ Hazai csapat predikció:
Home R²: 0.504
Home MAE: 0.475
Home RMSE: 0.618

🛫 Vendég csapat predikció:
Away R²: 0.496
Away MAE: 0.423
Away RMSE: 0.537


In [18]:
# Vizsgáljunk néhány meccset Poisson alapján
sample_matches = df_clean.iloc[:5].copy()
sample_matches['xg_home_pred'] = model_home.predict(sample_matches[features])
sample_matches['xg_away_pred'] = model_away.predict(sample_matches[features_away])

max_goals = 6

for idx, row in sample_matches.iterrows():
    xg_home = row['xg_home_pred']
    xg_away = row['xg_away_pred']
    actual_home = row['home_score']
    actual_away = row['away_score']

    prob_matrix = np.zeros((max_goals+1, max_goals+1))

    for i in range(max_goals + 1):
        for j in range(max_goals + 1):
            prob_matrix[i, j] = poisson.pmf(i, xg_home) * poisson.pmf(j, xg_away)

    home_win = np.sum(np.tril(prob_matrix, -1).T)
    draw = np.sum(np.diag(prob_matrix))
    away_win = np.sum(np.triu(prob_matrix, 1))

    predicted_result = 'Draw'
    if home_win > max(draw, away_win):
        predicted_result = 'Home Win'
    elif away_win > max(draw, home_win):
        predicted_result = 'Away Win'

    actual_result = (
        'Draw' if actual_home == actual_away else
        'Home Win' if actual_home > actual_away else
        'Away Win'
    )

    print(f"Match {idx}:")
    print(f"  Predicted xG: Home={xg_home:.2f}, Away={xg_away:.2f}")
    print(f"  Poisson probs => Home: {home_win:.2f}, Draw: {draw:.2f}, Away: {away_win:.2f}")
    print(f"  Predicted result: {predicted_result}, Actual result: {actual_result}\n")


Match 0:
  Predicted xG: Home=2.02, Away=1.38
  Poisson probs => Home: 0.52, Draw: 0.22, Away: 0.26
  Predicted result: Home Win, Actual result: Home Win

Match 1:
  Predicted xG: Home=0.80, Away=0.87
  Poisson probs => Home: 0.31, Draw: 0.34, Away: 0.35
  Predicted result: Away Win, Actual result: Draw

Match 2:
  Predicted xG: Home=2.01, Away=0.75
  Poisson probs => Home: 0.66, Draw: 0.20, Away: 0.13
  Predicted result: Home Win, Actual result: Home Win

Match 3:
  Predicted xG: Home=1.75, Away=0.61
  Poisson probs => Home: 0.65, Draw: 0.23, Away: 0.12
  Predicted result: Home Win, Actual result: Home Win

Match 4:
  Predicted xG: Home=2.05, Away=1.56
  Poisson probs => Home: 0.49, Draw: 0.21, Away: 0.29
  Predicted result: Home Win, Actual result: Home Win

