# Načtení knihoven a nastavení základních parametrů pro fbref.com

Importujeme potřebné knihovny a nastavíme základní parametry pro stahování dat z fbref.com, včetně lig, sezón a sloupců.



In [None]:
import pandas as pd
import time

# Print DataFrames in full instead of eliding rows and columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# League name -> numeric competition id that is interpolated into the
# fbref.com URL and table id. Insertion order is the download order.
leagues = {
    "Premier League": 9,
    "Bundesliga": 20,
    "Serie A": 11,
    "La Liga": 12,
    "Ligue 1": 13,
}

# Season labels in "YYYY-YYYY" form, 2022-2023 through 2024-2025.
seasons = [f"{start}-{start + 1}" for start in range(2022, 2025)]

# Columns kept from every downloaded table.
columns = ['Squad', 'W', 'D', 'L', 'Pts/MP', 'GD', 'xGD']

# Receives one DataFrame per successfully downloaded (league, season).
all_team_stats = []

# Stahování statistik týmů z fbref.com
Pro každou ligu a sezónu stahujeme statistiky týmů z fbref.com a ukládáme je do seznamu DataFrameů.



In [None]:
# Fetch one table per (league, season) pair and collect the selected columns.
for league_name, comp_id in leagues.items():
    for season in seasons:
        # The league name goes into the URL with dashes instead of spaces.
        url = f"https://fbref.com/en/comps/{comp_id}/{season}/{season}-{league_name.replace(' ', '-')}-Stats"
        # HTML id of the table to extract from the page.
        table_id = f"results{season}{comp_id}1_overall"

        print(f"Stahuji {league_name} pro {season} z {url} (Table ID: {table_id})...")
        try:
            # read_html returns a list of matching tables; only the first is used.
            df = pd.read_html(url, attrs={"id": table_id})[0]
            # Select the wanted columns and tag every row with its origin.
            team_stats = df[columns].assign(Season=season, League=league_name)
            all_team_stats.append(team_stats)
            print(f"Staženo {len(team_stats)} týmů pro {league_name} {season}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next pair.
            print(f"Chyba při stahování {league_name} {season}: {e}")
        # Pause between requests, whether or not the download succeeded.
        time.sleep(3)

# Kombinace a uložení statistik týmů
Kombinujeme všechny DataFrame do jednoho a ukládáme výsledky do CSV souboru.



In [None]:
# Stack the per-league/per-season frames into one table with a fresh 0..n-1 index.
# NOTE: pd.concat raises ValueError when all_team_stats is empty, i.e. when
# every download failed.
combined_team_stats = pd.concat(all_team_stats, ignore_index=True)
# Write without the index column; this file is read back when merging with match data.
combined_team_stats.to_csv('all_leagues_2022_2025.csv', index=False)
print("\nKombinované statistiky pro všechny ligy (2022-2023 až 2024-2025):")
print(combined_team_stats)
print(f"\nData uložena do 'all_leagues_2022_2025.csv' s {len(combined_team_stats)} řádky.")

# Načtení knihoven a nastavení parametrů pro API football-data.org
Importujeme knihovny a nastavujeme parametry pro stahování zápasů z football-data.org pomocí API.



In [None]:
import csv
import os
import time

import requests

# football-data.org access settings.
#
# The token is taken from the FOOTBALL_DATA_API_KEY environment variable when
# it is set.
# NOTE(security): the literal fallback is a credential committed to source. It
# is kept only so existing runs keep working; rotate the token and remove the
# fallback.
API_KEY = os.environ.get("FOOTBALL_DATA_API_KEY", "43746d6bdd3b43de8fc5f083e002bb5f")
# Every request authenticates through this header.
HEADERS = {"X-Auth-Token": API_KEY}
BASE_URL = "https://api.football-data.org/v4"

# API competition code -> league name used in progress messages.
LEAGUES = {
    "PL": "Premier League",
    "BL1": "Bundesliga",
    "SA": "Serie A",
    "PD": "La Liga",
    "FL1": "Ligue 1",
}

# Values sent as the `season` query parameter.
SEASONS = [2023, 2024]

# Funkce pro stahování a ukládání zápasů
Definujeme funkce pro stahování zápasů z API a jejich uložení do CSV souboru.



In [None]:
def fetch_matches(competition_code, season, timeout=30):
    """Download the finished matches of one competition season.

    Args:
        competition_code: Competition code placed in the URL path (e.g. "PL").
        season: Value sent as the ``season`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole run.

    Returns:
        The list stored under ``"matches"`` in the JSON response, or an empty
        list when the request fails, the status is not 200, or the body is
        not valid JSON.
    """
    url = f"{BASE_URL}/competitions/{competition_code}/matches"
    params = {"season": season, "status": "FINISHED"}
    try:
        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like HTTP errors.
        print(f"Chyba při stahování {competition_code} {season}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Chyba při stahování {competition_code} {season}: {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        # A 200 response with a non-JSON body is treated as a failed download.
        print(f"Chyba při stahování {competition_code} {season}: {exc}")
        return []

    matches = payload.get("matches", [])
    print(f"Staženo {len(matches)} zápasů pro {competition_code} {season}")
    return matches

def save_to_csv(matches, filename="matches_2023_2024.csv"):
    """Write match records to a UTF-8 CSV file, replacing any existing file.

    Args:
        matches: Iterable of match dicts providing ``utcDate``,
            ``homeTeam.name``, ``awayTeam.name``, ``score.winner`` and
            ``competition.name``.
        filename: Path of the CSV file to create.

    The ``Result`` column holds "<home name> Win", "<away name> Win" or
    "Draw" depending on ``score.winner``; any other winner value is written
    as "Unknown".
    """
    header = ["Date", "League", "Home Team", "Away Team", "Result"]

    def to_row(match):
        # Turn one match dict into the five CSV fields.
        date = match["utcDate"]
        home = match["homeTeam"]["name"]
        away = match["awayTeam"]["name"]
        winner = match["score"]["winner"]
        league = match["competition"]["name"]
        labels = {
            "HOME_TEAM": f"{home} Win",
            "AWAY_TEAM": f"{away} Win",
            "DRAW": "Draw",
        }
        return [date, league, home, away, labels.get(winner, "Unknown")]

    with open(filename, mode="w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(to_row(match) for match in matches)

# Hlavní funkce pro stahování zápasů
Spouštíme proces stahování zápasů pro všechny ligy a sezóny a ukládáme je.



In [None]:
def main():
    """Download the finished matches for every season and league, then save them.

    Results from all requests are gathered into a single list and written
    with ``save_to_csv`` using its default filename.
    """
    collected = []
    for season in SEASONS:
        for code, name in LEAGUES.items():
            print(f"Stahuji {name} pro {season}...")
            collected += fetch_matches(code, season)
            # Wait 6 s between requests, presumably to respect the API's
            # rate limit; confirm against the plan in use.
            time.sleep(6)

    save_to_csv(collected)
    print(f"Uloženo {len(collected)} zápasů do matches_2023_2024.csv")


# Run the download only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()

# Načtení a příprava dat pro sloučení
Načítáme oba datové soubory a připravujeme je pro sloučení, včetně přidání sezóny k zápasům.



In [None]:
import pandas as pd
import re
from datetime import datetime

# Match list written by save_to_csv (columns: Date, League, Home Team, Away Team, Result).
matches_df = pd.read_csv('matches_2023_2024.csv')
# Per-team season statistics saved from the fbref.com download.
team_stats_df = pd.read_csv('all_leagues_2022_2025.csv')

def get_season_from_date(date_str):
    """Return the "YYYY-YYYY" season label for a timestamp string.

    Dates in August through December belong to the season that starts in the
    same year; dates in January through July belong to the season that
    started the year before.

    Args:
        date_str: Timestamp formatted as ``%Y-%m-%dT%H:%M:%SZ``.

    Raises:
        ValueError: If ``date_str`` does not match that format.
    """
    parsed = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
    start_year = parsed.year if parsed.month >= 8 else parsed.year - 1
    return f"{start_year}-{start_year + 1}"

# Derive each match's season label from its Date value so matches can be
# joined with the per-season team statistics.
matches_df['Season'] = matches_df['Date'].apply(get_season_from_date)

# Standardizace názvů týmů
Definujeme funkci pro porovnání názvů týmů a standardizujeme je pro sloučení.


In [None]:
def find_team_match(match_team, stats_teams):
    """Find the stats-table team name that corresponds to a match team name.

    Two names correspond when, ignoring case, either one is contained in the
    other. Candidates are tried in the order given and the first hit wins.

    Args:
        match_team: Team name from the match data; a missing value yields None.
        stats_teams: Iterable of candidate team names from the stats table.

    Returns:
        The matching entry of ``stats_teams`` in its original spelling, or
        None when nothing matches.
    """
    if pd.isna(match_team):
        return None
    needle = match_team.lower()
    return next(
        (
            candidate
            for candidate in stats_teams
            if candidate.lower() in needle or needle in candidate.lower()
        ),
        None,
    )

# Rewrite the team names in matches_df to the spelling used in team_stats_df,
# one (league, season) slice at a time.
for league in matches_df['League'].unique():
    for season in matches_df['Season'].unique():
        mask = (matches_df['League'] == league) & (matches_df['Season'] == season)
        stats_mask = (team_stats_df['League'] == league) & (team_stats_df['Season'] == season)

        # Distinct team names on either side of this slice's fixtures.
        match_teams = matches_df.loc[mask, ['Home Team', 'Away Team']].stack().unique()
        stats_teams = team_stats_df.loc[stats_mask, 'Squad'].unique()

        temp_mapping = {}
        unmapped = []
        for match_team in match_teams:
            matched_team = find_team_match(match_team, stats_teams)
            if not matched_team:
                unmapped.append(match_team)
                continue
            temp_mapping[match_team] = matched_team

        # Names missing from the mapping become NaN in the mapped column.
        for side in ('Home Team', 'Away Team'):
            matches_df.loc[mask, side] = matches_df.loc[mask, side].map(temp_mapping)

        if unmapped:
            print(f"Nenamapované týmy v {league} {season}: {unmapped}")

# Sloučení dat a přejmenování sloupců
Sloučíme statistiky týmů s daty o zápasech a přejmenujeme sloupce pro přehlednost.



In [None]:
# Matches whose home or away team could not be mapped hold NaN in that column;
# they cannot be joined with the stats table, so drop them.
matches_df = matches_df.dropna(subset=['Home Team', 'Away Team'])

# Attach the home team's season statistics. No non-key column name collides in
# this first merge, so the stat columns keep their bare names (W, D, L, ...).
merged_home = matches_df.merge(
    team_stats_df,
    how='left',
    left_on=['Home Team', 'League', 'Season'],
    right_on=['Squad', 'League', 'Season'],
    suffixes=('', '_home')
).drop(columns=['Squad'])

# Attach the away team's statistics. The stat columns now collide, so the ones
# already present get "_home" and the newly joined ones get "_away".
merged_final = merged_home.merge(
    team_stats_df,
    how='left',
    left_on=['Away Team', 'League', 'Season'],
    right_on=['Squad', 'League', 'Season'],
    suffixes=('_home', '_away')
).drop(columns=['Squad'])


# Give the suffixed stat columns descriptive names.
merged_final = merged_final.rename(columns={
    'W_home': 'Home_Wins',
    'D_home': 'Home_Draws',
    'L_home': 'Home_Losses',
    'Pts/MP_home': 'Home_PtsPerMatch',
    'GD_home': 'Home_GoalDiff',
    'xGD_home': 'Home_xGD',
    'W_away': 'Away_Wins',
    'D_away': 'Away_Draws',
    'L_away': 'Away_Losses',
    'Pts/MP_away': 'Away_PtsPerMatch',
    'GD_away': 'Away_GoalDiff',
    'xGD_away': 'Away_xGD'
})


# Column order of the training table.
final_columns = [
    'Date', 'League', 'Season', 'Home Team', 'Away Team', 'Result',
    'Home_Wins', 'Home_Draws', 'Home_Losses', 'Home_PtsPerMatch', 'Home_GoalDiff', 'Home_xGD',
    'Away_Wins', 'Away_Draws', 'Away_Losses', 'Away_PtsPerMatch', 'Away_GoalDiff', 'Away_xGD'
]
# Take an explicit copy: the Result column is assigned to later on, and writing
# into a column slice of merged_final triggers SettingWithCopyWarning.
training_df = merged_final[final_columns].copy()

# Standardizace výsledků a uložení trénovacích dat
Standardizujeme hodnoty výsledků a ukládáme finální trénovací data do CSV.



In [None]:
def standardize_result(row):
    """Reduce a row's free-text result to "Draw", "Home Win" or "Away Win".

    The ``Result`` text is compared with the row's ``Home Team`` and
    ``Away Team`` names by substring test, home team first. If both names
    occur in the result text the row is therefore labelled "Home Win".
    "Borussia Mönchengladbach" results are matched against the short name
    "Gladbach" explicitly.

    Args:
        row: Mapping with ``Result``, ``Home Team`` and ``Away Team`` entries.

    Returns:
        One of the three labels, or the original result text unchanged when
        neither team name is found (kept as a debugging fallback).
    """
    outcome = row['Result']
    home = row['Home Team']
    away = row['Away Team']

    if outcome != "Draw":
        if "Borussia Mönchengladbach" in outcome:
            # The full club name never contains the short name "Gladbach".
            if home == "Gladbach":
                return "Home Win"
            if away == "Gladbach":
                return "Away Win"
        if home in outcome:
            return "Home Win"
        if away in outcome:
            return "Away Win"
    # Either a draw or an unrecognised value passed through for inspection.
    return outcome

# Replace the free-text result with a standard label. assign() builds a new
# frame, so this does not write into a slice of another DataFrame.
training_df = training_df.assign(Result=training_df.apply(standardize_result, axis=1))

# Report any result value that could not be reduced to one of the three labels.
unmapped_results = training_df[~training_df['Result'].isin(['Draw', 'Home Win', 'Away Win'])]['Result']
if not unmapped_results.empty:
    print("Nenamapované hodnoty Result nalezeny:")
    print(unmapped_results.unique())

# Save the final training table without the index column.
training_df.to_csv('training_data_2023_2024.csv', index=False)
print("Unikátní hodnoty Result po standardizaci:")
print(training_df['Result'].unique())

print("Náhled sloučeného datasetu:")
print(training_df.head())
print(f"\nUloženo {len(training_df)} řádků do 'training_data_2023_2024.csv'")

# Načtení knihoven a dat pro trénování modelů
Importujeme knihovny a načítáme trénovací data pro machine learning.



In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Load the training table saved by the merge step.
df = pd.read_csv('training_data_2023_2024.csv')

# Show which outcome labels are present before they are encoded.
print("Unikátní hodnoty Result:")
print(df['Result'].unique())

# Příprava dat pro trénování
Kódujeme cílovou proměnnou, vybíráme příznaky (features) a rozdělujeme data na trénovací a testovací sadu.



In [None]:
# Integer class ids for the outcome labels: 0 = Draw, 1 = Home Win, 2 = Away Win.
result_mapping = {"Draw": 0, "Home Win": 1, "Away Win": 2}
df['Result_Label'] = df['Result'].map(result_mapping)

print("Unikátní hodnoty Result_Label:")
print(df['Result_Label'].unique())

# Any result outside the mapping became NaN; stop before training on it.
if df['Result_Label'].isna().any():
    raise ValueError("Result_Label obsahuje NaN hodnoty. Zkontrolujte mapování sloupce Result.")

# Six season statistics for the home team followed by the same six for the away team.
feature_cols = [
    f"{side}_{stat}"
    for side in ('Home', 'Away')
    for stat in ('Wins', 'Draws', 'Losses', 'PtsPerMatch', 'GoalDiff', 'xGD')
]
# Missing statistics are replaced with 0.
X = df[feature_cols].fillna(0)
y = df['Result_Label']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Trénování a vyhodnocení modelů
Trénujeme a vyhodnocujeme pět různých modelů: logistickou regresi, Random Forest, neuronovou síť, XGBoost a SVM.



In [None]:
# Model 1: logistic regression, trained on the standardized features.
print("Trénuji Logistickou regresi...")
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Přesnost Logistické regrese: {lr_accuracy:.4f}")
print("Klasifikační zpráva:")
# target_names follow the label ids 0, 1, 2.
print(classification_report(
    y_test,
    lr_pred,
    target_names=['Draw', 'Home Win', 'Away Win'],
))

# Model 2: random forest with 1000 trees, trained on the unscaled features.
print("\nTrénuji Random Forest Classifier...")
rf_model = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Přesnost Random Forest: {rf_accuracy:.4f}")
print("Klasifikační zpráva:")
# target_names follow the label ids 0, 1, 2.
print(classification_report(
    y_test,
    rf_pred,
    target_names=['Draw', 'Home Win', 'Away Win'],
))

# Model 3: neural network, trained on the standardized features.
print("\nTrénuji Neuronovou síť...")
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling are repeatable, in line with the
# random_state=42 used for the other models.
tf.keras.utils.set_random_seed(42)
nn_model = Sequential([
    # Explicit input layer instead of passing input_shape to the first Dense
    # layer, which newer Keras versions warn about.
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    # One output unit per outcome class (Draw, Home Win, Away Win).
    Dense(3, activation='softmax')
])
# Sparse loss because y holds integer class ids rather than one-hot vectors.
nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# 20 % of the training data is held out for validation during fitting.
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
nn_loss, nn_accuracy = nn_model.evaluate(X_test_scaled, y_test, verbose=0)
# Pick the class with the highest predicted probability for each sample.
nn_pred = np.argmax(nn_model.predict(X_test_scaled, verbose=0), axis=1)
print(f"Přesnost Neuronové sítě: {nn_accuracy:.4f}")
print("Klasifikační zpráva:")
print(classification_report(y_test, nn_pred, target_names=['Draw', 'Home Win', 'Away Win']))

# Model 4: XGBoost, trained on the unscaled features.
print("\nTrénuji XGBoost Classifier...")
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
).fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
print(f"Přesnost XGBoost: {xgb_accuracy:.4f}")
print("Klasifikační zpráva:")
# target_names follow the label ids 0, 1, 2.
print(classification_report(
    y_test,
    xgb_pred,
    target_names=['Draw', 'Home Win', 'Away Win'],
))

# Model 5: support vector machine with RBF kernel, trained on the standardized
# features; probability=True enables class probability estimates.
print("\nTrénuji Support Vector Machine...")
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    probability=True,
    random_state=42,
).fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"Přesnost SVM: {svm_accuracy:.4f}")
print("Klasifikační zpráva:")
# target_names follow the label ids 0, 1, 2.
print(classification_report(
    y_test,
    svm_pred,
    target_names=['Draw', 'Home Win', 'Away Win'],
))

# Shrnutí a uložení modelu
Shrnutí přesnosti všech modelů a uložení SVM modelu.



In [None]:
# Print the test-set accuracy of all five models side by side.
print("\nPorovnání modelů:")
print(f"Přesnost Logistické regrese: {lr_accuracy:.4f}")
print(f"Přesnost Random Forest: {rf_accuracy:.4f}")
print(f"Přesnost Neuronové sítě: {nn_accuracy:.4f}")
print(f"Přesnost XGBoost: {xgb_accuracy:.4f}")
print(f"Přesnost SVM: {svm_accuracy:.4f}")

# Persist the fitted scaler together with the SVM: the SVM was trained on
# scaled features, so new inputs need the same transformation.
scaler_filename = 'scaler.pkl'
joblib.dump(scaler, scaler_filename)
print(f"Scaler saved to {scaler_filename}")
# The SVM is the model that gets saved, regardless of the accuracies above.
svm_filename = 'svm_model.pkl'
joblib.dump(svm_model, svm_filename)
print(f"SVM model saved to {svm_filename}")