# Imports

In [None]:
!pip install kickscore
!pip install trueskill

Installing collected packages: trueskill
Successfully installed trueskill-0.4.5


In [None]:
import pandas as pd
import kickscore as ks
from datetime import datetime
import warnings
import copy
import numpy as np
import pickle
import os
import itertools
import matplotlib.pyplot as plt
import abc
from math import log
from datetime import datetime
from sklearn.metrics import log_loss, accuracy_score
import collections
import trueskill as ts
from math import exp, erfc, sqrt
from google.colab import files

%matplotlib inline

# Input

In [None]:
# Load the raw match list once. (The file used to be read a second time into a
# variable called `input`, which was never used and shadowed the builtin.)
df = pd.read_csv("EPL data with seasons.csv")
df = df.dropna()

# Turn the match date into a numeric timestamp: the datetime column is viewed
# as int64 ticks and divided by 1e9 (seconds since the epoch when the column
# has nanosecond resolution), then the matches are put in chronological order.
df['timestamp'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['timestamp'] = df['timestamp'].astype('int64') / 10**9
df = df.sort_values(['timestamp'])
df = df.reset_index(drop=True)

# Split "<home> - <away>" into two goal columns. `str.split` yields strings,
# so cast to int: otherwise every later `score1 > score2` is a lexicographic
# string comparison and e.g. '10' > '9' evaluates to False.
df[['score1', 'score2']] = df['score'].str.split(' - ', expand=True).astype(int)

# Every team that appears as either side of a match.
team_names = pd.concat([df['team1'], df['team2']]).unique()

df = df.drop('score', axis=1)

# Set these columns aside; they are re-attached later for the season features.
columns_to_drop = ['season', 'date']
saved_columns = df[columns_to_drop].copy()
df = df.drop(columns_to_drop, axis=1)

# The rating models expect the time column to be called `t`.
df.rename(columns={'timestamp': 't'}, inplace=True)

# Constant placeholders required by the model's observation signature.
df['neutral'] = False
df['bonus'] = 0

# Building dataset

## Kickscore

### Classes

In [None]:
class PredictiveModel(abc.ABC):
    """Abstract interface for the predictive models in this notebook."""

    @abc.abstractmethod
    def fit(self, *, cutoff=None):
        """Train the model.

        `cutoff`, when given, is a datetime object: only observations strictly
        earlier than `cutoff` are used for training.
        """

    @abc.abstractmethod
    def evaluate(self, *, begin, end):
        """Score the model on a held-out window.

        Observations with `begin` <= time < `end` are evaluated. The result
        is a triplet:

        1. number of observations
        2. sum of log-loss
        3. sum of accuracy
        """

    @property
    @abc.abstractmethod
    def log_likelihood(self):
        """Marginal log-likelihood of the model."""

    @classmethod
    @abc.abstractmethod
    def get_dates(cls, begin=None):
        """Dates of the observations contained in the dataset."""

In [None]:
class KickScoreModel(PredictiveModel, metaclass=abc.ABCMeta):
    """Adapter that exposes a kickscore model through `PredictiveModel`.

    Subclasses implement `observe` / `evaluate_obs` and must define a
    class-level `dataset` attribute, which is passed to `iterate_dataset`.
    """

    def __init__(self, ks_model, fit_params):
        """Store the kickscore model and the keyword arguments for its `fit`."""
        self.ks_model = ks_model
        self.fit_params = fit_params

    @abc.abstractmethod
    def observe(self, **kwargs):
        """Add observation to the model."""

    @abc.abstractmethod
    def evaluate_obs(self, **kwargs):
        """Evaluate observation with a fitted model."""

    def fit(self, *, cutoff=None):
        """Fit the wrapped kickscore model and return what its `fit` returns.

        NOTE: `cutoff` is accepted for interface compatibility only and is
        ignored here. Observations are not loaded by this method; the caller
        is expected to have added them through `observe` beforehand.
        """
        return self.ks_model.fit(**self.fit_params)

    def evaluate(self, *, begin, end):
        """Evaluate every observation whose time lies in [`begin`, `end`).

        `begin` and `end` are compared directly with each observation's `t`
        value, so they must be in the same unit as `t` (not datetimes).

        Returns a triplet (number of observations, summed log-loss, summed
        accuracy).
        """
        total_log_loss = 0
        total_accuracy = 0
        n_obs = 0
        for obs in iterate_dataset(self.dataset):
            if begin <= obs["t"] < end:
                ll, acc = self.evaluate_obs(**obs)
                total_log_loss += ll
                total_accuracy += acc
                n_obs += 1
        return n_obs, total_log_loss, total_accuracy

    @property
    def log_likelihood(self):
        """Log-likelihood reported by the wrapped kickscore model."""
        return self.ks_model.log_likelihood

    @classmethod
    def get_dates(cls, begin=None):
        """Return the sorted, distinct UTC dates of the observations.

        `begin` is an optional datetime; observations earlier than it are
        skipped. With `begin=None` every observation is included.
        """
        # Imported here because the notebook only imports `datetime` from the
        # `datetime` module; without it `timezone.utc` raised a NameError.
        from datetime import timezone

        if begin is None:
            cutoff = float("-inf")
        else:
            cutoff = int(begin.timestamp())
        dates = set()
        for obs in iterate_dataset(cls.dataset):
            if obs["t"] >= cutoff:
                dates.add(datetime.fromtimestamp(
                        obs["t"], timezone.utc).date())
        return sorted(dates)


def iterate_dataset(fname):
    """Yield the rows of a match table one by one, as pandas Series.

    `fname` is normally the DataFrame to walk over (for example a model's
    `dataset` attribute). The argument used to be ignored in favour of the
    module-level `train` frame; for backward compatibility any value that is
    not a DataFrame still falls back to `train`.
    """
    frame = fname if isinstance(fname, pd.DataFrame) else train
    for _, row in frame.iterrows():
        yield row

In [None]:
class TernaryModel(KickScoreModel, metaclass=abc.ABCMeta):
    """Three-outcome (first side wins / tie / second side wins) model.

    Wraps `ks.TernaryModel`; subclasses decide how teams map to feature
    lists by implementing `make_features`.
    """

    # Table passed to `iterate_dataset` by the inherited methods.
    dataset = df

    def __init__(self, margin, obs_type="probit",
            method="ep", max_iter=500, lr=1.0):
        """Create the kickscore model and remember the fitting options."""
        super().__init__(
            ks.TernaryModel(margin=margin, obs_type=obs_type),
            dict(method=method, max_iter=max_iter, lr=lr),
        )

    @abc.abstractmethod
    def make_features(self, team1, team2, neutral, bonus):
        """Return the feature lists of `team1` and `team2`, in that order."""

    def observe(self, *, t, team1, team2, score1, score2, neutral, bonus):
        """Register one match result at time `t` with the kickscore model."""
        first, second = self.make_features(team1, team2, neutral, bonus)
        # kickscore takes the winning side first; a tie keeps the given order.
        if score1 > score2:
            self.ks_model.observe(first, second, t=t)
        elif score1 < score2:
            self.ks_model.observe(second, first, t=t)
        else:
            self.ks_model.observe(first, second, t=t, tie=True)

    def evaluate_obs(self, *, t, team1, team2, score1, score2, neutral, bonus):
        """Return (log-loss, accuracy) of the model's prediction for one match.

        Index 0 / 1 / 2 of the predicted probabilities is used for
        `score1 > score2` / equal scores / `score1 < score2` respectively.
        Accuracy is 1.0 when the most probable index is the actual outcome,
        else 0.0.
        """
        first, second = self.make_features(team1, team2, neutral, bonus)
        probs = self.ks_model.probabilities(first, second, t=t)
        outcome = 0 if score1 > score2 else (1 if score1 == score2 else 2)
        loss = -log(probs[outcome])
        hit = 1.0 if outcome == np.argmax(probs) else 0.0
        return loss, hit

In [None]:
class HomeAdvantageModel(TernaryModel):
    """Ternary model with one shared "home-advantage" item.

    Each team's skill uses a constant kernel plus an exponential kernel;
    `lscale` is multiplied by the module-level `YEAR` constant.
    """

    def __init__(self, *, margin, cvar, dvar, lscale, havar):
        """Build the per-team kernel and register the home-advantage item."""
        super().__init__(margin, obs_type="probit", method="ep")
        static_part = ks.kernel.Constant(var=cvar)
        dynamic_part = ks.kernel.Exponential(var=dvar, lscale=(lscale * YEAR))
        self._kern = static_part + dynamic_part
        home_kernel = ks.kernel.Constant(var=havar)
        self.ks_model.add_item("home-advantage", kernel=home_kernel)

    def make_features(self, team1, team2, neutral, bonus):
        """Return the feature lists for `team1` and `team2`.

        Teams not yet known to the kickscore model are registered on the fly.
        Unless `neutral` is truthy, `team1` also gets the "home-advantage"
        item. `bonus` is not used.
        """
        for team in (team1, team2):
            if team not in self.ks_model.item:
                self.ks_model.add_item(team, kernel=self._kern)
        first = [team1] if neutral else [team1, "home-advantage"]
        return first, [team2]



### Learning

In [None]:
train = df
# Number of seconds in a (Julian) year; scales the kernel length-scale.
YEAR = 365.25 * 24 * 60 * 60

model = HomeAdvantageModel(
    margin=0.40781,
    cvar=0.89479,
    dvar=0.21987,
    lscale=44.47171,
    havar=0.16199
)

# Feed every match to the model in chronological order. The row yielded by
# `iterrows` is used directly instead of re-fetching it five times with
# `train.iloc[index]`. Scores are cast to int so the model compares goal
# counts numerically even if the columns hold strings.
for _, match in train.iterrows():
    model.observe(
        t=match['t'],
        team1=match['team1'], team2=match['team2'],
        score1=int(match['score1']), score2=int(match['score2']),
        neutral=False, bonus=0,
    )

In [None]:
# Run inference on the observations added above. As the last expression of
# the cell, the value returned by `fit` is displayed as the cell output.
model.fit()

True

### Test

In [None]:
# In-sample evaluation over the whole time span of `train`. `evaluate` treats
# `end` as exclusive, so +inf is passed instead of the last timestamp;
# otherwise the matches played on the final date are silently left out.
# The totals get their own names so that the `log_loss` function imported
# from sklearn is not overwritten.
n_obs, total_log_loss, total_accuracy = model.evaluate(
    begin=train.iloc[0]['t'], end=float("inf"))
print(n_obs, total_log_loss / n_obs, total_accuracy / n_obs)

31407 0.9849641069671178 0.5272391505078485


### Results

In [None]:
items = model.ks_model.item

# Maps (team name, time) -> (predicted mean, predicted standard deviation),
# evaluated at the times stored in each item's `scores`.
ks_data = {}

for name in items:
    # The shared home-advantage item is not a team.
    if name == 'home-advantage':
        continue
    team_item = items[name]
    # Deliberately not called `ts`: that name is the `trueskill` module alias
    # and rebinding it at module level would break the TrueSkill code.
    times, _, _ = team_item.scores
    means, variances = team_item.predict(times)
    for t, m, s in zip(times, means, np.sqrt(variances)):
        ks_data[(name, t)] = (m, s)

In [None]:
def kickscore_search(team, ts):
    """Look up the (mean, std) pair stored in `ks_data` for `team` at time `ts`.

    Returns `(None, None)` when there is no entry for that pair.
    """
    return ks_data.get((team, ts), (None, None))

In [None]:
columns = ['KICKSCORE_MS_T1', 'KICKSCORE_STD_T1', 'KICKSCORE_MS_T2', 'KICKSCORE_STD_T2']

# Start with empty columns; rows without a stored rating keep None.
for column in columns:
    df[column] = None

for index, row in df.iterrows():
    # Named `match_time` rather than `ts` so the module-level `trueskill`
    # alias (`ts`) is not overwritten by a float.
    match_time = row['t']

    ks_ms1, ks_std1 = kickscore_search(row['team1'], match_time)
    ks_ms2, ks_std2 = kickscore_search(row['team2'], match_time)

    df.at[index, 'KICKSCORE_MS_T1'] = ks_ms1
    df.at[index, 'KICKSCORE_STD_T1'] = ks_std1

    df.at[index, 'KICKSCORE_MS_T2'] = ks_ms2
    df.at[index, 'KICKSCORE_STD_T2'] = ks_std2

In [None]:
# Replace each row's KickScore values with the ones the team carried out of
# its previous match, so a row only holds ratings known before kick-off.
# Teams without an earlier match get (None, None).
team_last_values = dict.fromkeys(team_names, (None, None))

for index, row in df.iterrows():
    home, away = row['team1'], row['team2']

    # Values carried over from each team's previous appearance.
    carried_home = team_last_values[home]
    carried_away = team_last_values[away]

    # Remember this row's values for the teams' next appearance.
    team_last_values[home] = (row['KICKSCORE_MS_T1'], row['KICKSCORE_STD_T1'])
    team_last_values[away] = (row['KICKSCORE_MS_T2'], row['KICKSCORE_STD_T2'])

    df.at[index, 'KICKSCORE_MS_T1'] = carried_home[0]
    df.at[index, 'KICKSCORE_STD_T1'] = carried_home[1]
    df.at[index, 'KICKSCORE_MS_T2'] = carried_away[0]
    df.at[index, 'KICKSCORE_STD_T2'] = carried_away[1]

## HSCS

In [None]:
# Re-attach the season/date columns that were set aside earlier.
df[columns_to_drop] = saved_columns

# Number the seasons 1, 2, 3, ... by their starting year, taken as the part
# of the season label before the '/'. The minimum is computed once on the
# whole column; it used to be recomputed inside a per-row lambda, which made
# this step quadratic in the number of matches.
start_year = df['season'].str.split('/').str[0].astype(int)
df['season_num'] = start_year - start_year.min() + 1

In [None]:
# Running per-team, per-season tallies of games, wins and draws, split by
# venue. A match is added to the tallies only AFTER its own feature values
# have been written, so every percentage reflects results before that match.
_TALLY_KEYS = ('home_wins', 'home_games', 'away_wins', 'away_games', 'home_draws', 'away_draws')
team_performances = {
    team: {season: dict.fromkeys(_TALLY_KEYS, 0) for season in df['season_num'].unique()}
    for team in pd.concat([df['team1'], df['team2']]).unique()
}
last_match_date = {}


# Percentage columns start empty (None) and stay empty where a team has no
# history for the venue in question.
columns = ['H_WIN_PCT_T1', 'A_WIN_PCT_T1', 'H_DRAW_PCT_T1', 'A_DRAW_PCT_T1', 'H_WIN_PCT_T2', 'A_WIN_PCT_T2', 'H_DRAW_PCT_T2', 'A_DRAW_PCT_T2']
for column in columns:
    df[column] = None

extra_columns = ['CUR_WIN_PCT_T1', 'CUR_DRAW_PCT_T1', 'CUR_WIN_PCT_T2', 'CUR_DRAW_PCT_T2']
for column in extra_columns:
    df[column] = None

for index, row in df.iterrows():
    current_season = row['season_num']
    home_team = row['team1']
    away_team = row['team2']

    # Long-run form: the current season plus the two seasons before it.
    relevant_seasons = [current_season, current_season - 1, current_season - 2]

    stats = {team: dict.fromkeys(_TALLY_KEYS, 0) for team in (home_team, away_team)}
    for team in (home_team, away_team):
        for season in relevant_seasons:
            s_stats = team_performances[team].get(season)
            if s_stats is None:
                # Season number falls before the first season in the data.
                continue
            for key in _TALLY_KEYS:
                stats[team][key] += s_stats[key]

    home_stats = stats[home_team]
    away_stats = stats[away_team]

    # Calculate and assign percentages based on stats before this match
    if home_stats['home_games'] > 0:
        df.at[index, 'H_WIN_PCT_T1'] = home_stats['home_wins'] / home_stats['home_games']
        df.at[index, 'H_DRAW_PCT_T1'] = home_stats['home_draws'] / home_stats['home_games']
    if away_stats['away_games'] > 0:
        df.at[index, 'A_WIN_PCT_T2'] = away_stats['away_wins'] / away_stats['away_games']
        df.at[index, 'A_DRAW_PCT_T2'] = away_stats['away_draws'] / away_stats['away_games']

    if home_stats['away_games'] > 0:
        df.at[index, 'A_WIN_PCT_T1'] = home_stats['away_wins'] / home_stats['away_games']
        df.at[index, 'A_DRAW_PCT_T1'] = home_stats['away_draws'] / home_stats['away_games']
    if away_stats['home_games'] > 0:
        df.at[index, 'H_WIN_PCT_T2'] = away_stats['home_wins'] / away_stats['home_games']
        df.at[index, 'H_DRAW_PCT_T2'] = away_stats['home_draws'] / away_stats['home_games']

    # Current strength: this season only, and only once the team has played
    # at least five matches in it.
    for suffix, team in (('T1', home_team), ('T2', away_team)):
        season_tally = team_performances[team][current_season]
        games = season_tally['home_games'] + season_tally['away_games']
        if games >= 5:
            wins = season_tally['home_wins'] + season_tally['away_wins']
            draws = season_tally['home_draws'] + season_tally['away_draws']
            df.at[index, 'CUR_WIN_PCT_' + suffix] = wins / games
            df.at[index, 'CUR_DRAW_PCT_' + suffix] = draws / games

    # Update the stats after assigning the percentages. Scores are cast to
    # int (as the goal-statistics cell does) so the result is decided
    # numerically; compared as strings, '10' would rank below '9'.
    home_goals = int(row['score1'])
    away_goals = int(row['score2'])
    home_tally = team_performances[home_team][current_season]
    away_tally = team_performances[away_team][current_season]
    home_tally['home_games'] += 1
    away_tally['away_games'] += 1
    if home_goals > away_goals:
        home_tally['home_wins'] += 1
    elif home_goals == away_goals:
        home_tally['home_draws'] += 1
        away_tally['away_draws'] += 1
    else:
        away_tally['away_wins'] += 1


In [None]:
# Per-team, per-season lists of goals scored / conceded, split by venue.
# The dict maps each list name to the column prefix derived from it.
_GOAL_PREFIXES = {
    'home_goals_scored': 'H_GS',
    'away_goals_scored': 'A_GS',
    'home_goals_conceded': 'H_GC',
    'away_goals_conceded': 'A_GC',
}
team_goals = {
    team: {
        season: {key: [] for key in _GOAL_PREFIXES}
        for season in df['season_num'].unique()
    }
    for team in pd.concat([df['team1'], df['team2']]).unique()
}
last_match_date = {}

# Add one column per statistic for the home (_T1) and away (_T2) team.
stats_columns = ['H_GS_AVG', 'A_GS_AVG', 'H_GC_AVG', 'A_GC_AVG', 'H_GS_STD', 'A_GS_STD', 'H_GC_STD', 'A_GC_STD']
for stat in stats_columns:
    df[stat + '_T1'] = None  # home team
    df[stat + '_T2'] = None  # away team

e_columns = ['CUR_GS_AVG', 'CUR_GC_AVG', 'CUR_GS_STD', 'CUR_GC_STD', 'REST']
for stat in e_columns:
    df[stat + '_T1'] = None  # home team
    df[stat + '_T2'] = None  # away team

for index, row in df.iterrows():
    current_season = row['season_num']
    home_team = row['team1']
    away_team = row['team2']
    current_date = pd.to_datetime(row['date'])

    # Long-run statistics span the current season and the two before it.
    relevant_seasons = [current_season, current_season - 1, current_season - 2]

    stats = {team: {key: [] for key in _GOAL_PREFIXES} for team in [home_team, away_team]}
    for team in [home_team, away_team]:
        for season in relevant_seasons:
            if season in team_goals[team]:
                for key in _GOAL_PREFIXES:
                    stats[team][key].extend(team_goals[team][season][key])

    for suffix, team in (('_T1', home_team), ('_T2', away_team)):
        # Mean / standard deviation of each goal list; 0 when it is empty.
        for key, prefix in _GOAL_PREFIXES.items():
            history = stats[team][key]
            if history:
                average, spread = np.mean(history), np.std(history)
            else:
                average, spread = 0, 0
            df.at[index, prefix + '_AVG' + suffix] = average
            df.at[index, prefix + '_STD' + suffix] = spread

        # Current form
        # Days since the team's previous match, if it has played before.
        if team in last_match_date:
            df.at[index, 'REST' + suffix] = (current_date - last_match_date[team]).days

        # Up to five latest home plus up to five latest away games of the
        # current season, for goals scored and goals conceded.
        season_log = team_goals[team][current_season]
        recent_scored = season_log['home_goals_scored'][-5:] + season_log['away_goals_scored'][-5:]
        recent_conceded = season_log['home_goals_conceded'][-5:] + season_log['away_goals_conceded'][-5:]

        if len(recent_scored) >= 5:
            df.at[index, 'CUR_GS_AVG' + suffix] = np.mean(recent_scored)
            df.at[index, 'CUR_GS_STD' + suffix] = np.std(recent_scored)
            df.at[index, 'CUR_GC_AVG' + suffix] = np.mean(recent_conceded)
            df.at[index, 'CUR_GC_STD' + suffix] = np.std(recent_conceded)

    # Update: record this match only after its features have been written.
    home_log = team_goals[home_team][current_season]
    away_log = team_goals[away_team][current_season]
    home_log['home_goals_scored'].append(int(row['score1']))
    home_log['home_goals_conceded'].append(int(row['score2']))
    away_log['away_goals_scored'].append(int(row['score2']))
    away_log['away_goals_conceded'].append(int(row['score1']))

    last_match_date[home_team] = current_date
    last_match_date[away_team] = current_date

In [None]:
df

Unnamed: 0,team1,team2,t,score1,score2,neutral,bonus,KICKSCORE_MS_T1,KICKSCORE_STD_T1,KICKSCORE_MS_T2,...,CUR_GS_AVG_T1,CUR_GS_AVG_T2,CUR_GC_AVG_T1,CUR_GC_AVG_T2,CUR_GS_STD_T1,CUR_GS_STD_T2,CUR_GC_STD_T1,CUR_GC_STD_T2,REST_T1,REST_T2
0,Wolves,Liverpool,-6.112800e+08,2,0,False,0,,,,...,,,,,,,,,,
1,Aston Villa,West Brom,-6.112800e+08,2,0,False,0,,,,...,,,,,,,,,,
2,Sunderland,Derby County,-6.112800e+08,1,0,False,0,,,,...,,,,,,,,,,
3,Stoke,Newcastle,-6.112800e+08,1,2,False,0,,,,...,,,,,,,,,,
4,Chelsea,Wednesday,-6.112800e+08,4,0,False,0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31404,Everton,West Ham,1.709338e+09,1,3,False,0,-0.147124,0.132294,0.01322,...,0.8,1.1,1.4,2.1,0.748331,1.3,1.113553,1.972308,7,5
31405,Notts Forest,Liverpool,1.709338e+09,0,1,False,0,-0.320499,0.155937,0.823358,...,1.8,2.9,1.9,1.1,0.6,1.220656,1.220656,0.830662,7,10
31406,Luton Town,Aston Villa,1.709338e+09,2,3,False,0,-0.512274,0.216563,0.194171,...,1.9,2.1,2.2,1.5,1.220656,1.445683,1.4,1.024695,10,7
31407,Burnley,Bournemouth,1.709424e+09,0,2,False,0,-0.481883,0.149945,-0.219858,...,0.9,1.4,2.4,1.9,0.830662,1.019804,1.280625,1.135782,8,8


In [None]:
df

Unnamed: 0,team1,team2,t,score1,score2,neutral,bonus,KICKSCORE_MS_T1,KICKSCORE_STD_T1,KICKSCORE_MS_T2,...,CUR_GS_AVG_T1,CUR_GS_AVG_T2,CUR_GC_AVG_T1,CUR_GC_AVG_T2,CUR_GS_STD_T1,CUR_GS_STD_T2,CUR_GC_STD_T1,CUR_GC_STD_T2,REST_T1,REST_T2
0,Wolves,Liverpool,-6.112800e+08,2,0,False,0,,,,...,,,,,,,,,,
1,Aston Villa,West Brom,-6.112800e+08,2,0,False,0,,,,...,,,,,,,,,,
2,Sunderland,Derby County,-6.112800e+08,1,0,False,0,,,,...,,,,,,,,,,
3,Stoke,Newcastle,-6.112800e+08,1,2,False,0,,,,...,,,,,,,,,,
4,Chelsea,Wednesday,-6.112800e+08,4,0,False,0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31404,Everton,West Ham,1.709338e+09,1,3,False,0,-0.147124,0.132294,0.01322,...,0.8,1.1,1.4,2.1,0.748331,1.3,1.113553,1.972308,7,5
31405,Notts Forest,Liverpool,1.709338e+09,0,1,False,0,-0.320499,0.155937,0.823358,...,1.8,2.9,1.9,1.1,0.6,1.220656,1.220656,0.830662,7,10
31406,Luton Town,Aston Villa,1.709338e+09,2,3,False,0,-0.512274,0.216563,0.194171,...,1.9,2.1,2.2,1.5,1.220656,1.445683,1.4,1.024695,10,7
31407,Burnley,Bournemouth,1.709424e+09,0,2,False,0,-0.481883,0.149945,-0.219858,...,0.9,1.4,2.4,1.9,0.830662,1.019804,1.280625,1.135782,8,8


## TrueSkill

### Classes

In [None]:
import trueskill as ts

class TrueSkillModel:
    """TrueSkill ratings with a draw margin, built on the `trueskill` package."""

    def __init__(self, *, margin, sigma, tau):
        """Configure the TrueSkill environment.

        `sigma` is the initial rating std. dev., `tau` the std. dev. of the
        brownian dynamics, and `margin` sets the draw probability at zero
        skill difference. Ratings start at mean 0.
        """
        draw_at_parity = 2 * ndtr(margin) - 1.0
        self.env = ts.TrueSkill(
            mu=0.0,
            sigma=sigma,
            # Scales the sigmoid function (denominator is sqrt(2) * beta).
            beta=1/sqrt(2.0),
            tau=tau,
            draw_probability=draw_at_parity,
        )
        self.margin = margin
        # Teams seen for the first time get a fresh default rating.
        self.rating = collections.defaultdict(self.env.create_rating)

    def observe(self, winner, loser, tie=False):
        """Update both ratings after a match; `tie=True` marks a draw."""
        updated = ts.rate_1vs1(
                self.rating[winner], self.rating[loser], drawn=tie,
                env=self.env)
        self.rating[winner], self.rating[loser] = updated

    def predict(self, a, b):
        """Return outcome probabilities and the current ratings of `a` and `b`.

        The result is a dict with 'probs' = (P(a wins), P(draw), P(b wins))
        and 't1' / 't2' = (mean, std. dev.) of `a` / `b`.
        """
        ma, sa = self.rating[a]
        mb, sb = self.rating[b]
        denom = sqrt(1.0 + sa*sa + sb*sb)
        a_wins = ndtr((ma - mb - self.margin) / denom)
        b_wins = ndtr((mb - ma - self.margin) / denom)
        draw = 1.0 - a_wins - b_wins
        return {
            'probs': (a_wins, draw, b_wins),
            't1': (ma, sa),
            't2': (mb, sb)
        }


def ndtr(x):
    """Standard normal cumulative distribution function.

    For X ~ N(0, 1), returns P(X < x).
    """
    scaled = -x / sqrt(2.0)
    return erfc(scaled) / 2.0

### Learning

In [None]:
# Initialise the TrueSkill model.
true_skill_model = TrueSkillModel(margin=0.1, sigma=25, tau=0.5)  # TODO: set real (tuned) values

# Maps (team, match time) -> (rating mean, rating std. dev.) as they stood
# BEFORE the match at that time was observed.
true_skill_data = {}

for index, row in df.iterrows():
    home, away = row['team1'], row['team2']

    # Record the pre-match ratings first, then update with the result.
    prediction = true_skill_model.predict(home, away)
    true_skill_data[(home, row['t'])] = prediction['t1'][0], prediction['t1'][1]
    true_skill_data[(away, row['t'])] = prediction['t2'][0], prediction['t2'][1]

    # Compare goal counts numerically (string scores compare lexicographically).
    home_goals = int(row['score1'])
    away_goals = int(row['score2'])

    # The first branch used `>=`, which recorded every draw as a home win and
    # made the tie branch unreachable.
    if home_goals > away_goals:
        true_skill_model.observe(home, away, False)
    elif home_goals < away_goals:
        true_skill_model.observe(away, home, False)
    else:
        true_skill_model.observe(home, away, True)

### Result

In [None]:
def trueskill_search(team, ts):
    """Look up the (mean, std) pair stored in `true_skill_data` for `team` at `ts`.

    Returns `(None, None)` when there is no entry for that pair.
    """
    return true_skill_data.get((team, ts), (None, None))

In [None]:
columns = ['TRUESKILL_MS_T1', 'TRUESKILL_STD_T1', 'TRUESKILL_MS_T2', 'TRUESKILL_STD_T2']

# Start with empty columns; rows without a stored rating keep None.
for column in columns:
    df[column] = None

for index, row in df.iterrows():
    # Named `match_time` rather than `ts`: `TrueSkillModel` resolves the
    # module-level `ts` (the trueskill alias) at call time, so overwriting it
    # with a float would break any later use of the model.
    match_time = row['t']

    ts_ms1, ts_std1 = trueskill_search(row['team1'], match_time)
    ts_ms2, ts_std2 = trueskill_search(row['team2'], match_time)

    df.at[index, 'TRUESKILL_MS_T1'] = ts_ms1
    df.at[index, 'TRUESKILL_STD_T1'] = ts_std1

    df.at[index, 'TRUESKILL_MS_T2'] = ts_ms2
    df.at[index, 'TRUESKILL_STD_T2'] = ts_std2


## Elo

### Classes

In [None]:
class EloModel:
    """Elo-style rating with a draw margin and a fixed learning rate."""

    def __init__(self, *, margin, lr):
        """Store the draw `margin` and learning rate `lr`; scores start at 0."""
        self.margin = margin
        self.lr = lr
        self.score = collections.defaultdict(int)

    def observe(self, winner, loser, tie=False):
        """Shift both scores by the same amount in opposite directions.

        For a decisive result the step is lr * (1 - P(winner wins)); for a
        tie it is lr * (P(loser wins) - P(winner wins)).
        """
        p_first, _, p_second = self.predict(winner, loser)['preds']
        if tie:
            step = self.lr * (p_second - p_first)
        else:
            step = self.lr * (1.0 - p_first)
        self.score[winner] += step
        self.score[loser] -= step

    def predict(self, a, b):
        """Return outcome probabilities and the current scores of `a` and `b`.

        The result is a dict with 'preds' = (P(a wins), P(draw), P(b wins))
        and 't1' / 't2' = current score of `a` / `b`.
        """
        rating_a = self.score[a]
        rating_b = self.score[b]
        pa = 1.0 / (1.0 + exp(-(rating_a - rating_b - self.margin)))
        pb = 1.0 / (1.0 + exp(-(rating_b - rating_a - self.margin)))
        return {'preds': (pa, 1.0 - pa - pb, pb), 't1': rating_a, 't2': rating_b}

### Learning

In [None]:
# Maps (team, match time) -> Elo score as it stood BEFORE that match.
elo_data = {}

elo_model = EloModel(margin=0.1, lr=1)

for index, row in df.iterrows():
    home, away = row['team1'], row['team2']

    # Record the pre-match scores first, then update with the result.
    prediction = elo_model.predict(home, away)
    elo_data[(home, row['t'])] = prediction['t1']
    elo_data[(away, row['t'])] = prediction['t2']

    # Compare goal counts numerically; compared as strings, '10' would rank
    # below '9' and the wrong side would be credited with the win.
    home_goals = int(row['score1'])
    away_goals = int(row['score2'])

    if home_goals > away_goals:
        elo_model.observe(home, away, False)
    elif home_goals < away_goals:
        elo_model.observe(away, home, False)
    else:
        elo_model.observe(home, away, True)

### Result

In [None]:
def elo_search(team, ts):
    """Look up the score stored in `elo_data` for `team` at time `ts`.

    Returns `None` when there is no entry for that pair.
    """
    return elo_data.get((team, ts))

In [None]:
columns = ['ELO_T1', 'ELO_T2']

# Start with empty columns; rows without a stored score keep None.
for column in columns:
    df[column] = None

for index, row in df.iterrows():
    # Named `match_time` rather than `ts` so the module-level `trueskill`
    # alias (`ts`) is not overwritten by a float.
    match_time = row['t']

    df.at[index, 'ELO_T1'] = elo_search(row['team1'], match_time)
    df.at[index, 'ELO_T2'] = elo_search(row['team2'], match_time)

# Output

In [None]:
excel_path = "FullAPLDataset_v1.xlsx"

# Save the DataFrame to an Excel file (without the index column)
df.to_excel(excel_path, index=False)

# Download the file through the Colab `files` helper
files.download(excel_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# TODO