In [32]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
# Wczytanie danych
df = pd.read_csv('final_dataset_undersampled_equal.csv')

In [34]:
# Konwersja kolumny Date z obsługą błędów oraz sortowanie według daty
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce', dayfirst=True)
df = df.dropna(subset=['Date'])  # Usuwamy wiersze z niepoprawnymi datami
df = df.sort_values('Date')

In [35]:
# Przygotowanie danych:
# - FTR: wynik meczu (zmienna docelowa)
# - Usuwamy kolumny, które nie są dostępne przed rozpoczęciem meczu lub powodują problemy:
#    "FTHG", "FTAG", "MW", "HTFormPtsStr", "ATFormPtsStr", "Unnamed: 0" i "Date"
df_model = df.drop(columns=['FTHG', 'FTAG', 'MW', 'HTFormPtsStr', 'ATFormPtsStr'])
y = df_model['FTR']
X = df_model.drop(columns=['FTR', 'Date'])

In [36]:
# Podział danych na zbiór treningowy i testowy oparty na czasie (80% najwcześniejszych, 20% najpóźniejszych)
train_size = int(0.8 * len(X))
X_train, X_test = X.iloc[:train_size].copy(), X.iloc[train_size:].copy()
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [37]:
# Definicja cech kategorycznych i numerycznych
categorical_features = ['HomeTeam', 'AwayTeam']
numerical_features = [col for col in X_train.columns if col not in categorical_features]

In [38]:
# Konwersja kolumn numerycznych do typu liczbowego (wartości niekonwertowalne stają się NaN)
for col in numerical_features:
    X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

In [39]:
# Pipeline przetwarzania danych:
# Wykorzystujemy SimpleImputer, aby uzupełnić brakujące wartości strategią mediany,
# a następnie standaryzujemy cechy numeryczne.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [40]:
# Budowa pipeline: preprocessing + regresja logistyczna
# Budowa pipeline: preprocessing + regresja logistyczna
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        penalty='l1',
        solver='liblinear',
        C=0.01,
        max_iter=1000
    ))
])

#DONE

In [41]:
# Trenowanie modelu
model.fit(X_train, y_train)

In [42]:
# Predykcja na zbiorze testowym
y_pred = model.predict(X_test)

In [43]:
# Ocena modelu
accuracy = accuracy_score(y_test, y_pred)
print("Dokładność modelu:", accuracy)
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred))

Dokładność modelu: 0.4756756756756757
Raport klasyfikacji:
               precision    recall  f1-score   support

           A       0.57      0.53      0.55       198
           D       0.38      0.32      0.35       167
           H       0.46      0.56      0.50       190

    accuracy                           0.48       555
   macro avg       0.47      0.47      0.47       555
weighted avg       0.47      0.48      0.47       555



In [44]:
#import pickle

# Zapisz model w trybie protokołu 4 (dla kompatybilności)
#with open("model.pkl", "wb") as file:
    #pickle.dump(model, file, protocol=4)


In [45]:
from sklearn.model_selection import ParameterGrid
from sklearn.base import clone
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from copy import deepcopy

# 1. Definicja siatki parametrów
param_grid = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],
    'classifier__l1_ratio': [None, 0.1, 0.5, 0.9],
    'classifier__max_iter': [1000]
}

# 2. Filtrowanie nieprawidłowych kombinacji
valid_param_combinations = []
for params in ParameterGrid(param_grid):
    solver = params['classifier__solver']
    penalty = params['classifier__penalty']
    l1_ratio = params['classifier__l1_ratio']

    # Reguły kompatybilności parametrów
    if penalty == 'elasticnet' and solver != 'saga':
        continue
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        continue
    if penalty == 'none' and solver not in ['lbfgs', 'saga', 'newton-cg']:
        continue
    if penalty == 'l2' and solver not in ['lbfgs', 'liblinear', 'newton-cg', 'saga']:
        continue
    if penalty != 'elasticnet' and l1_ratio is not None:
        continue

    valid_param_combinations.append(params)

print(f"✔ Dozwolonych kombinacji: {len(valid_param_combinations)}")

# 3. Manualny GridSearch
results = []

for i, p in enumerate(valid_param_combinations):
    model_copy = clone(model)
    safe_params = deepcopy(p)

    # Usuń l1_ratio jeśli nie dotyczy elasticnet
    if safe_params.get('classifier__penalty') != 'elasticnet':
        safe_params.pop('classifier__l1_ratio', None)

    try:
        model_copy.set_params(**safe_params)
        model_copy.fit(X_train, y_train)
        y_pred = model_copy.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.append((safe_params, acc))
        print(f"[{i+1}/{len(valid_param_combinations)}] ACC: {acc:.4f} | {safe_params}")
    except Exception as e:
        print(f"[{i+1}/{len(valid_param_combinations)}] ❌ Błąd dla: {safe_params} \n{e}")

# 4. Posortuj wyniki
results.sort(key=lambda x: x[1], reverse=True)

# 5. Najlepszy wynik
best_params, best_acc = results[0]
print("\n🔝 Najlepsze parametry:")
print(best_params)
print(f"Dokładność: {best_acc:.4f}")

# 6. Pełny raport klasyfikacji
best_model = clone(model)
best_model.set_params(**best_params)
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

print("\n📊 Raport klasyfikacji dla najlepszego modelu:")
print(classification_report(y_test, y_pred_best))



✔ Dozwolonych kombinacji: 78
[1/78] ACC: 0.3568 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
[2/78] ACC: 0.3423 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[3/78] ACC: 0.4577 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
[4/78] ACC: 0.4577 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
[5/78] ACC: 0.4577 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
[6/78] ACC: 0.4577 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
[7/78] ❌ Błąd dla: {'classifier__C': 0.001, 'classifier__l1_ratio': None, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'cla



[8/78] ACC: 0.4559 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[9/78] ACC: 0.4559 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}
[10/78] ACC: 0.4559 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}




[11/78] ACC: 0.4505 | {'classifier__C': 0.001, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[12/78] ACC: 0.3009 | {'classifier__C': 0.001, 'classifier__l1_ratio': 0.5, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[13/78] ACC: 0.3009 | {'classifier__C': 0.001, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[14/78] ACC: 0.4757 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
[15/78] ACC: 0.4577 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[16/78] ACC: 0.4649 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
[17/78] ACC: 0.4613 | {'classifier__C': 0.01, 'classifier__max_ite



[21/78] ACC: 0.4559 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[22/78] ACC: 0.4559 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}
[23/78] ACC: 0.4559 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}




[24/78] ACC: 0.4631 | {'classifier__C': 0.01, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[25/78] ACC: 0.4667 | {'classifier__C': 0.01, 'classifier__l1_ratio': 0.5, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[26/78] ACC: 0.4523 | {'classifier__C': 0.01, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[27/78] ACC: 0.4703 | {'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
[28/78] ACC: 0.4505 | {'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[29/78] ACC: 0.4631 | {'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
[30/78] ACC: 0.4613 | {'classifier__C': 0.1, 'classifier__max_iter': 100



[34/78] ACC: 0.4559 | {'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[35/78] ACC: 0.4559 | {'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}
[36/78] ACC: 0.4559 | {'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}




[37/78] ACC: 0.4595 | {'classifier__C': 0.1, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[38/78] ACC: 0.4613 | {'classifier__C': 0.1, 'classifier__l1_ratio': 0.5, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[39/78] ACC: 0.4486 | {'classifier__C': 0.1, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[40/78] ACC: 0.4523 | {'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
[41/78] ACC: 0.4541 | {'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[42/78] ACC: 0.4541 | {'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
[43/78] ACC: 0.4595 | {'classifier__C': 1, 'classifier__max_iter': 1000, 'classif



[47/78] ACC: 0.4559 | {'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[48/78] ACC: 0.4559 | {'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}




[49/78] ACC: 0.4559 | {'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}
[50/78] ACC: 0.4577 | {'classifier__C': 1, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[51/78] ACC: 0.4523 | {'classifier__C': 1, 'classifier__l1_ratio': 0.5, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[52/78] ACC: 0.4523 | {'classifier__C': 1, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[53/78] ACC: 0.4559 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}




[54/78] ACC: 0.4559 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[55/78] ACC: 0.4577 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
[56/78] ACC: 0.4577 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
[57/78] ACC: 0.4577 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
[58/78] ACC: 0.4577 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
[59/78] ❌ Błąd dla: {'classifier__C': 10, 'classifier__l1_ratio': None, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'} 
unsupported operand type(s) for -: 'int' and 'NoneType'




[60/78] ACC: 0.4559 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[61/78] ACC: 0.4559 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}
[62/78] ACC: 0.4559 | {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}




[63/78] ACC: 0.4577 | {'classifier__C': 10, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[64/78] ACC: 0.4577 | {'classifier__C': 10, 'classifier__l1_ratio': 0.5, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}




[65/78] ACC: 0.4559 | {'classifier__C': 10, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[66/78] ACC: 0.4523 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
[67/78] ACC: 0.4577 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[68/78] ACC: 0.4523 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
[69/78] ACC: 0.4577 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
[70/78] ACC: 0.4577 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
[71/78] ACC: 0.4577 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
[72/78] ❌ 



[73/78] ACC: 0.4559 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[74/78] ACC: 0.4559 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}




[75/78] ACC: 0.4559 | {'classifier__C': 100, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}
[76/78] ACC: 0.4577 | {'classifier__C': 100, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[77/78] ACC: 0.4577 | {'classifier__C': 100, 'classifier__l1_ratio': 0.5, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[78/78] ACC: 0.4577 | {'classifier__C': 100, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}

🔝 Najlepsze parametry:
{'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Dokładność: 0.4757

📊 Raport klasyfikacji dla najlepszego modelu:
              precision    recall  f1-score   support

           A       0.57      0.53      0.55       198
           D       0.38      