In [185]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV



In [186]:
df = pd.read_csv('data/result.csv')

df.columns

Index(['is_home_win', 'home_form_5', 'away_form_5', 'home_form_10',
       'away_form_10', 'home_form_15', 'away_form_15',
       'last_home_matches_count', 'last_away_matches_count', 'days_off_home',
       'days_off_away', 'team_abbreviation_home', 'team_abbreviation_away',
       'is_last_season_champion_home', 'is_last_season_champion_away',
       'is_regular_season', 'is_playoffs', 'match_number_season_home',
       'match_number_season_away', 'home_over_away_wins_diff_5',
       'home_over_away_wins_diff_10'],
      dtype='object')

In [187]:
df = df[['is_home_win', 'home_form_5', 'away_form_5', 'home_form_15', 'away_form_15', 'last_home_matches_count', 'last_away_matches_count', 'days_off_home', 'days_off_away', 'is_last_season_champion_home', 'is_last_season_champion_away', 'is_playoffs', 'home_over_away_wins_diff_10']]

In [188]:
df.columns

Index(['is_home_win', 'home_form_5', 'away_form_5', 'home_form_15',
       'away_form_15', 'last_home_matches_count', 'last_away_matches_count',
       'days_off_home', 'days_off_away', 'is_last_season_champion_home',
       'is_last_season_champion_away', 'is_playoffs',
       'home_over_away_wins_diff_10'],
      dtype='object')

In [189]:
df.describe()

Unnamed: 0,is_home_win,home_form_5,away_form_5,home_form_15,away_form_15,last_home_matches_count,last_away_matches_count,days_off_home,days_off_away,is_last_season_champion_home,is_last_season_champion_away,is_playoffs,home_over_away_wins_diff_10
count,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0,13720.0
mean,0.583382,2.552405,2.556633,7.445773,7.441181,1.024052,1.00379,2.517784,2.4957,0.03863,0.037391,0.072668,-0.020554
std,0.493016,1.138564,1.129749,2.178988,2.156237,1.47556,1.412558,1.465189,1.428617,0.192718,0.189724,0.2596,2.583149
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-10.0
25%,0.0,2.0,2.0,6.0,6.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,-2.0
50%,1.0,3.0,3.0,8.0,8.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
75%,1.0,3.0,3.0,9.0,9.0,2.0,2.0,3.0,3.0,0.0,0.0,0.0,2.0
max,1.0,5.0,5.0,14.0,14.0,12.0,11.0,16.0,15.0,1.0,1.0,1.0,10.0


In [190]:
df['form_5_diff'] = df['home_form_5'] - df['away_form_5']
df['form_15_diff'] = df['home_form_15'] - df['away_form_15']
df['last_home_matches_count_diff'] = df['last_home_matches_count'] - df['last_away_matches_count']
df['days_off_diff'] = df['days_off_home'] - df['days_off_away']
df['is_last_season_champion_diff'] = df['is_last_season_champion_home'] - df['is_last_season_champion_away']
df = df[df['is_playoffs'] == 0].reset_index(drop=True)

df.drop(columns=['home_form_5', 'away_form_5', 'home_form_15', 'away_form_15', 'last_home_matches_count', 'last_away_matches_count', 'days_off_home', 'days_off_away', 'is_last_season_champion_home', 'is_last_season_champion_away', 'is_playoffs'], inplace=True)

In [191]:
print(df.columns)
display(df.describe())

Index(['is_home_win', 'home_over_away_wins_diff_10', 'form_5_diff',
       'form_15_diff', 'last_home_matches_count_diff', 'days_off_diff',
       'is_last_season_champion_diff'],
      dtype='object')


Unnamed: 0,is_home_win,home_over_away_wins_diff_10,form_5_diff,form_15_diff,last_home_matches_count_diff,days_off_diff,is_last_season_champion_diff
count,12723.0,12723.0,12723.0,12723.0,12723.0,12723.0,12723.0
mean,0.580838,-0.021221,-0.007938,0.003851,0.018706,0.02413,-0.000236
std,0.493441,2.578225,1.610232,2.796072,2.019469,1.785626,0.260452
min,0.0,-10.0,-5.0,-11.0,-11.0,-13.0,-1.0
25%,0.0,-2.0,-1.0,-2.0,-1.0,-1.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,2.0,1.0,2.0,1.0,1.0,0.0
max,1.0,10.0,5.0,11.0,11.0,10.0,1.0


Что сделали к этому этапу? Оставили поменьше признаков, чтобы модель не переобучалась + сделали существующие признаки разностными, исследования показывают, что в таком виде моделям удобнее их воспринимать (имеется в виду, вместо двух колонок home_form и away_form мы делаем одну form_diff)

In [192]:
df = df.tail(10000).reset_index(drop=True)


In [193]:
X = df.drop(columns=['is_home_win'])
y = df['is_home_win']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(8000, 6) (2000, 6) (8000,) (2000,)


In [194]:
# Посчитаем количество дубликатов в данных
num_duplicates = df.duplicated().sum()
print(f"Количество дубликатов в данных: {num_duplicates}")


Количество дубликатов в данных: 2243


In [None]:
grouped = df.groupby(list(X.columns))['is_home_win'].value_counts().unstack(fill_value=0)
grouped = grouped.rename(columns={0: 'away_win_count', 1: 'home_win_count'})
grouped = grouped.reset_index()
grouped.shape
# Выведем топ строк по количеству домашних побед (home_win_count) в grouped
top_home_win = grouped.sort_values(by='home_win_count', ascending=False).head(10)
display(top_home_win)



is_home_win,home_over_away_wins_diff_10,form_5_diff,form_15_diff,last_home_matches_count_diff,days_off_diff,is_last_season_champion_diff,away_win_count,home_win_count
3013,0.0,-1.0,-1.0,0.0,0.0,0.0,10,12
3838,0.0,1.0,2.0,0.0,0.0,0.0,4,12
1847,-2.0,0.0,2.0,0.0,0.0,0.0,2,11
3340,0.0,0.0,-1.0,-1.0,0.0,0.0,5,9
2972,0.0,-1.0,-2.0,0.0,0.0,0.0,6,9
3060,0.0,-1.0,0.0,0.0,0.0,0.0,7,9
5987,4.0,0.0,1.0,0.0,0.0,0.0,3,9
1541,-2.0,-1.0,-1.0,0.0,0.0,0.0,1,9
3154,0.0,-1.0,2.0,0.0,0.0,0.0,2,8
3409,0.0,0.0,0.0,1.0,0.0,0.0,2,8


In [196]:
# Попробуем обычную логистическую регрессию

lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

# Получим веса (коэффициенты) модели после обучения
print("Коэффициенты (weights) модели:")
for feature, weight in zip(X_train.columns, lr.coef_[0]):
    print(f"{feature}: {weight:.4f}")

print("Смещение (intercept):", lr.intercept_[0])


0.5895
0.5
0.7417426863793646
Коэффициенты (weights) модели:
home_over_away_wins_diff_10: 0.0014
form_5_diff: 0.0119
form_15_diff: 0.0050
last_home_matches_count_diff: 0.0093
days_off_diff: -0.0058
is_last_season_champion_diff: 0.0605
Смещение (intercept): 0.2949038144783455


In [197]:
# Попробуем сделать классификацию с помощью логистической регрессии, сразу найдем лучший гиперпараметр С (используя метрику roc_auc)

params = {'C' : [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]} # диапазон подобрал экспериментально, от 0.001 до 10000 это лучше

gs = GridSearchCV(LogisticRegression(), params, cv=3, scoring='roc_auc')
gs.fit(X, y)

print(gs.best_score_)
print(gs.best_params_)

# Найдем значение accuracy, полученное от такого С

lr = LogisticRegression(C=gs.best_params_['C'])
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print('accuracy', accuracy_score(y_test, y_pred)) # Просто для информации, чтобы знать, насколько хорошо работает модель

print('-' * 50)

# Теперь то же самое для f1

params = {'C' : [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5]} # диапазон подобрал экспериментально

gs = GridSearchCV(LogisticRegression(), params, cv=3, scoring='f1')
gs.fit(X, y)

print(gs.best_score_)
print(gs.best_params_)

# Найдем значение accuracy, полученное от такого С

lr = LogisticRegression(C=gs.best_params_['C'])
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print('accuracy', accuracy_score(y_test, y_pred)) # Просто для информации, чтобы знать, насколько хорошо работает модель

print('-' * 50)

# Теперь попробуем SVM

svm = SVC()
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
print('accuracy', accuracy_score(y_test, y_pred))
print('roc_auc', roc_auc_score(y_test, y_pred))
print('f1', f1_score(y_test, y_pred))



0.4975252622522164
{'C': 70}
accuracy 0.5895
--------------------------------------------------
0.7312864668643176
{'C': 1e-11}
accuracy 0.5895
--------------------------------------------------
accuracy 0.584
roc_auc 0.4971842815656448
f1 0.735705209656925


In [198]:
# Теперь попробуем SVM с разными ядрами

kernels = ['linear', 'rbf', 'poly', 'sigmoid']

for kernel in kernels:
    svm = SVC(kernel=kernel)
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)
    print(f'Ядро: {kernel}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')
    print(f'F1: {f1_score(y_test, y_pred)}')
    print('-' * 50)

# rbf лучше всего работает, хоть и все еще недостаточно хорошо, попробуем подобрать гиперпараметры

# params = {'C' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# gs = GridSearchCV(SVC(kernel='rbf'), params, cv=3, scoring='roc_auc')
# gs.fit(X, y)

# print(gs.best_score_)
# print(gs.best_params_)

Ядро: linear
Accuracy: 0.5895
ROC AUC: 0.5
F1: 0.7417426863793646
--------------------------------------------------
Ядро: rbf
Accuracy: 0.584
ROC AUC: 0.4971842815656448
F1: 0.735705209656925
--------------------------------------------------
Ядро: poly
Accuracy: 0.5895
ROC AUC: 0.5
F1: 0.7417426863793646
--------------------------------------------------
Ядро: sigmoid
Accuracy: 0.5175
ROC AUC: 0.5045797394311122
F1: 0.5849462365591399
--------------------------------------------------
