In [43]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [44]:
# Wczytanie danych
df = pd.read_csv('final_dataset_SMOTE.csv')

In [45]:
# Konwersja kolumny Date z obsługą błędów oraz sortowanie według daty
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce', dayfirst=True)
df = df.dropna(subset=['Date'])  # Usuwamy wiersze z niepoprawnymi datami
df = df.sort_values('Date')

In [46]:
# Przygotowanie danych:
# - FTR: wynik meczu (zmienna docelowa)
# - Usuwamy kolumny, które nie są dostępne przed rozpoczęciem meczu lub powodują problemy:
#    "FTHG", "FTAG", "MW", "HTFormPtsStr", "ATFormPtsStr", "Unnamed: 0" i "Date"
df_model = df.drop(columns=['Unnamed: 0', 'FTHG', 'FTAG', 'MW', 'HTFormPtsStr', 'ATFormPtsStr'])
y = df_model['FTR']
X = df_model.drop(columns=['FTR', 'Date'])

In [47]:
# Podział danych na zbiór treningowy i testowy oparty na czasie (80% najwcześniejszych, 20% najpóźniejszych)
train_size = int(0.8 * len(X))
X_train, X_test = X.iloc[:train_size].copy(), X.iloc[train_size:].copy()
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [48]:
# Definicja cech kategorycznych i numerycznych
categorical_features = ['HomeTeam', 'AwayTeam']
numerical_features = [col for col in X_train.columns if col not in categorical_features]

In [49]:
# Konwersja kolumn numerycznych do typu liczbowego (wartości niekonwertowalne stają się NaN)
for col in numerical_features:
    X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

In [50]:
# Pipeline przetwarzania danych:
# Wykorzystujemy SimpleImputer, aby uzupełnić brakujące wartości strategią mediany,
# a następnie standaryzujemy cechy numeryczne.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [51]:
# Budowa pipeline: preprocessing + regresja logistyczna
# Budowa pipeline: preprocessing + regresja logistyczna
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        C=0.01,
        l1_ratio=0.5,
        max_iter=1000
    ))
])
#DONE

In [52]:
# Trenowanie modelu
model.fit(X_train, y_train)

In [53]:
# Predykcja na zbiorze testowym
y_pred = model.predict(X_test)

In [54]:
# Ocena modelu
accuracy = accuracy_score(y_test, y_pred)
print("Dokładność modelu:", accuracy)
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred))

Dokładność modelu: 0.4612868047982552
Raport klasyfikacji:
               precision    recall  f1-score   support

           D       0.39      0.55      0.45       319
           H       0.53      0.58      0.55       353
          NH       0.59      0.18      0.27       245

    accuracy                           0.46       917
   macro avg       0.50      0.44      0.43       917
weighted avg       0.49      0.46      0.44       917



In [55]:
#import pickle

# Zapisz model w trybie protokołu 4 (dla kompatybilności)
#with open("model.pkl", "wb") as file:
    #pickle.dump(model, file, protocol=4)


In [56]:
from sklearn.model_selection import ParameterGrid
from sklearn.base import clone
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from copy import deepcopy

# 1. Definicja siatki parametrów
param_grid = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],
    'classifier__l1_ratio': [None, 0.1, 0.5, 0.9],
    'classifier__max_iter': [1000]
}

# 2. Filtrowanie nieprawidłowych kombinacji
valid_param_combinations = []
for params in ParameterGrid(param_grid):
    solver = params['classifier__solver']
    penalty = params['classifier__penalty']
    l1_ratio = params['classifier__l1_ratio']

    # Reguły kompatybilności parametrów
    if penalty == 'elasticnet' and solver != 'saga':
        continue
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        continue
    if penalty == 'none' and solver not in ['lbfgs', 'saga', 'newton-cg']:
        continue
    if penalty == 'l2' and solver not in ['lbfgs', 'liblinear', 'newton-cg', 'saga']:
        continue
    if penalty != 'elasticnet' and l1_ratio is not None:
        continue

    valid_param_combinations.append(params)

print(f"✔ Dozwolonych kombinacji: {len(valid_param_combinations)}")

# 3. Manualny GridSearch
results = []

for i, p in enumerate(valid_param_combinations):
    model_copy = clone(model)
    safe_params = deepcopy(p)

    # Usuń l1_ratio jeśli nie dotyczy elasticnet
    if safe_params.get('classifier__penalty') != 'elasticnet':
        safe_params.pop('classifier__l1_ratio', None)

    try:
        model_copy.set_params(**safe_params)
        model_copy.fit(X_train, y_train)
        y_pred = model_copy.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.append((safe_params, acc))
        print(f"[{i+1}/{len(valid_param_combinations)}] ACC: {acc:.4f} | {safe_params}")
    except Exception as e:
        print(f"[{i+1}/{len(valid_param_combinations)}] ❌ Błąd dla: {safe_params} \n{e}")

# 4. Posortuj wyniki
results.sort(key=lambda x: x[1], reverse=True)

# 5. Najlepszy wynik
best_params, best_acc = results[0]
print("\n🔝 Najlepsze parametry:")
print(best_params)
print(f"Dokładność: {best_acc:.4f}")

# 6. Pełny raport klasyfikacji
best_model = clone(model)
best_model.set_params(**best_params)
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

print("\n📊 Raport klasyfikacji dla najlepszego modelu:")
print(classification_report(y_test, y_pred_best))



✔ Dozwolonych kombinacji: 78
[1/78] ACC: 0.3479 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
[2/78] ACC: 0.3479 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[3/78] ACC: 0.4591 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}




[4/78] ACC: 0.4460 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
[5/78] ACC: 0.4406 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
[6/78] ACC: 0.4406 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
[7/78] ❌ Błąd dla: {'classifier__C': 0.001, 'classifier__l1_ratio': None, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'} 
unsupported operand type(s) for -: 'int' and 'NoneType'




[8/78] ACC: 0.4373 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[9/78] ACC: 0.4384 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}




[10/78] ACC: 0.4384 | {'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}
[11/78] ACC: 0.4329 | {'classifier__C': 0.001, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[12/78] ACC: 0.3479 | {'classifier__C': 0.001, 'classifier__l1_ratio': 0.5, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[13/78] ACC: 0.3479 | {'classifier__C': 0.001, 'classifier__l1_ratio': 0.9, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
[14/78] ACC: 0.4493 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}




[15/78] ACC: 0.4493 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
[16/78] ACC: 0.4438 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}




[17/78] ACC: 0.4482 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
[18/78] ACC: 0.4493 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
[19/78] ACC: 0.4493 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
[20/78] ❌ Błąd dla: {'classifier__C': 0.01, 'classifier__l1_ratio': None, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'} 
unsupported operand type(s) for -: 'int' and 'NoneType'




[21/78] ACC: 0.4373 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'saga'}
[22/78] ACC: 0.4384 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'lbfgs'}




[23/78] ACC: 0.4384 | {'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'none', 'classifier__solver': 'newton-cg'}
[24/78] ACC: 0.4526 | {'classifier__C': 0.01, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}


KeyboardInterrupt: 