In [2]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler

warnings.filterwarnings("ignore")

# 1. Load the data and do the initial clean-up
# Header names are stripped of surrounding whitespace and then of surrounding
# commas; rows are put in ascending 'Date' order.
df = (
    pd.read_csv('DATASET_V4.csv', parse_dates=['Date'])
    .rename(columns=lambda name: name.strip().strip(','))
    .sort_values('Date')
)

# 2. Feature Engineering – podstawowe cechy na bazie historycznych wyników

# Team form: mean points per match over the team's most recent matches
def compute_team_form(data, team, current_date, window=5):
    """Return the team's average points over its last ``window`` matches.

    Only matches dated strictly before ``current_date`` are considered, so the
    match being featurised never contributes to its own form value. Points are
    3 for a win, 1 for a draw and 0 otherwise, read from the ``FTR`` column
    ('H' counts as a win for the home side, 'A' for the away side, 'D' is a
    draw).

    Args:
        data: DataFrame with 'Date', 'HomeTeam', 'AwayTeam' and 'FTR' columns.
            It does not need to be sorted.
        team: Team name as it appears in 'HomeTeam' / 'AwayTeam'.
        current_date: Cut-off; matches on or after this date are ignored.
        window: Maximum number of most recent matches to average over.

    Returns:
        Mean points per match as a float, or 0.0 when no earlier match is
        available (or ``window`` selects no rows).
    """
    involved = (data['HomeTeam'] == team) | (data['AwayTeam'] == team)
    past_matches = data[involved & (data['Date'] < current_date)]

    # ``tail`` only yields the most recent matches when rows are in date
    # order. Sort here rather than trusting the caller; the stable sort keeps
    # the original row order among same-day matches, and already-sorted input
    # skips the sort entirely.
    if not past_matches['Date'].is_monotonic_increasing:
        past_matches = past_matches.sort_values('Date', kind='mergesort')

    recent = past_matches.tail(window)
    if recent.empty:
        return 0.0

    ftr = recent['FTR']
    drew = ftr == 'D'
    won = (
        ((ftr == 'H') & (recent['HomeTeam'] == team))
        | ((ftr == 'A') & (recent['AwayTeam'] == team))
    )
    # A row is never both a draw and a win, so the two terms do not overlap.
    points = drew.astype(int) + 3 * won.astype(int)
    return float(points.mean())

# Head-to-head record: wins minus losses of the current home side in earlier
# direct meetings between the two teams
def compute_h2h(data, home_team, away_team, current_date):
    """Return ``home_team``'s net wins against ``away_team`` before a date.

    Every earlier meeting of the two teams is counted regardless of which of
    them was playing at home. Draws contribute nothing.

    Args:
        data: DataFrame with 'Date', 'HomeTeam', 'AwayTeam' and 'FTR' columns.
        home_team: Team whose perspective the result is expressed from.
        away_team: The opponent.
        current_date: Cut-off; matches on or after this date are ignored.

    Returns:
        Number of wins minus number of losses for ``home_team`` as an int,
        or 0 when the teams have no earlier meeting.
    """
    same_venue = (data['HomeTeam'] == home_team) & (data['AwayTeam'] == away_team)
    swapped_venue = (data['HomeTeam'] == away_team) & (data['AwayTeam'] == home_team)
    earlier = data['Date'] < current_date
    meetings = data[(same_venue | swapped_venue) & earlier]
    if len(meetings) == 0:
        return 0

    hosted = meetings['HomeTeam'] == home_team
    visited = meetings['AwayTeam'] == home_team
    host_won = meetings['FTR'] == 'H'
    guest_won = meetings['FTR'] == 'A'

    wins = int(((hosted & host_won) | (visited & guest_won)).sum())
    losses = int(((hosted & guest_won) | (visited & host_won)).sum())
    return wins - losses

# Initialise Elo ratings – every team starts at 1500 points
teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()
elo_ratings = dict.fromkeys(teams, 1500)

# Per-match feature values, collected in the same row order as df
home_elo_list = []
away_elo_list = []
home_form_list = []
away_form_list = []
h2h_list = []

# Elo update step size
k = 20
# Home-side score used for the Elo update: 1 for 'H', 0.5 for 'D';
# any other FTR value is scored as 0.
home_score_by_ftr = {'H': 1, 'D': 0.5}

# Walk the matches in df's row order; each feature is recorded before the
# match's own result is applied to the ratings.
match_columns = zip(df['Date'], df['HomeTeam'], df['AwayTeam'], df['FTR'])
for current_date, home_team, away_team, ftr in match_columns:
    # Form of both sides, from matches strictly before this one's date
    home_form_list.append(compute_team_form(df, home_team, current_date, window=5))
    away_form_list.append(compute_team_form(df, away_team, current_date, window=5))

    # Head-to-head record from the home side's point of view
    h2h_list.append(compute_h2h(df, home_team, away_team, current_date))

    # Ratings as they stand before this match
    home_rating = elo_ratings[home_team]
    away_rating = elo_ratings[away_team]
    home_elo_list.append(home_rating)
    away_elo_list.append(away_rating)

    # Elo update: move each rating by k times (actual score - expected score)
    result = home_score_by_ftr.get(ftr, 0)
    expected_home = 1 / (1 + 10 ** ((away_rating - home_rating) / 400))
    elo_ratings[home_team] += k * (result - expected_home)
    elo_ratings[away_team] += k * ((1 - result) - (1 - expected_home))

# Attach the collected values as new columns (positional assignment)
df['HomeTeam_Form'] = home_form_list
df['AwayTeam_Form'] = away_form_list
df['H2H_net'] = h2h_list
df['Home_ELO'] = home_elo_list
df['Away_ELO'] = away_elo_list

# Encode the match result (FTR) as an integer label
le = LabelEncoder()
df['FTR_encoded'] = le.fit_transform(df['FTR'])

# 2.1. Additional features – home-minus-away differences
# (the last two are odds differences for Bet365 and Bet&Win)
difference_features = (
    ('ELO_diff', 'Home_ELO', 'Away_ELO'),
    ('Form_diff', 'HomeTeam_Form', 'AwayTeam_Form'),
    ('B365_diff', 'B365H', 'B365A'),
    ('BWH_diff', 'BWH', 'BWA'),
)
for diff_col, home_col, away_col in difference_features:
    df[diff_col] = df[home_col] - df[away_col]

# 3. Prepare the modelling dataset
# Drop columns that could cause data leakage (e.g. match statistics)
cols_to_drop = ['FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF']
df_model = df.drop(columns=cols_to_drop)

# Pre-match features, including the engineered ones
features_model = ['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA',
                  'HomeTeam_Form', 'AwayTeam_Form', 'H2H_net',
                  'Home_ELO', 'Away_ELO', 'ELO_diff', 'Form_diff', 'B365_diff', 'BWH_diff']

# Chronological split: train on matches before 2018-01-01, validate on the rest
cutoff_date = pd.to_datetime('2018-01-01')
train_data = df_model[df_model['Date'] < cutoff_date].copy()
val_data = df_model[df_model['Date'] >= cutoff_date].copy()

X_train = train_data[features_model]
y_train = train_data['FTR_encoded']
X_val = val_data[features_model]
y_val = val_data['FTR_encoded']

# 4. Feature scaling – statistics come from the training rows only
# NOTE: the scaler is fitted once on the whole training set, before
# cross-validation, so all CV folds share the same scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 5. Model – logistic regression, with C tuned by GridSearchCV
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial model
# is what gets fitted for a multi-class target anyway.
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
# The training rows are in date order (df is sorted by 'Date'), so use
# forward-chaining splits: every fold is scored on matches that come after
# the ones it was trained on, mirroring the chronological hold-out above.
cv_splitter = TimeSeriesSplit(n_splits=5)
grid = GridSearchCV(lr, param_grid, cv=cv_splitter, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_scaled, y_train)

print("Najlepsze parametry:", grid.best_params_)
best_lr = grid.best_estimator_
y_pred = best_lr.predict(X_val_scaled)
acc = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", acc)


Najlepsze parametry: {'C': 0.01}
Validation Accuracy: 0.5349087003222341
