In [1]:
import pandas as pd
import numpy as np
import pickle
import os

from tqdm.auto import tqdm

## Пункт 1

Вспомогательный функци загрузки и очистки данных

In [2]:
def load_data():
    players = pd.read_pickle("players.pkl")
    results = pd.read_pickle("results.pkl")
    tournaments = pd.read_pickle("tournaments.pkl")

    players = pd.DataFrame(players).T
    tournaments = pd.DataFrame(tournaments).T

    return players, results, tournaments

def filter_tournaments_by_date(tournaments, start_date, end_date=None):
    if end_date:
        return tournaments[(tournaments['dateStart'] >= start_date) & (tournaments['dateStart'] < end_date)].index
    else:
        return tournaments[tournaments['dateStart'] >= start_date].index

def clean_indices(idx_list, results):
    cleaned_idx_list = []
    for i in tqdm(idx_list):
        try:
            if results[i][0]['team'] and results[i][0]['mask']:
                cleaned_idx_list.append(i)
        except:
            continue
    return cleaned_idx_list

In [3]:
# Загружаем данные
players, results, tournaments = load_data()

# Определение индексов турниров для обучающего и тестового наборов данных
idx_list_train = filter_tournaments_by_date(tournaments, '2019', '2020')
idx_list_test = filter_tournaments_by_date(tournaments, '2020')

print('Data train length: ', len(idx_list_train))
print('Data test length: ', len(idx_list_test))

# Очистка списков индексов турниров
idx_list_train_cleaned = clean_indices(idx_list_train, results)
idx_list_test_cleaned = clean_indices(idx_list_test, results)

print('Clean data train length: ', len(idx_list_train_cleaned))
print('Clean data test length: ', len(idx_list_test_cleaned))

Data train length:  687
Data test length:  422


  0%|          | 0/687 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

Clean data train length:  674
Clean data test length:  173


Функция create_df предназначена для создания датафрейма, содержащего данные об игроках, их командах и ответах на вопросы для заданных турниров.  
Функция проходит по каждому индексу турнира в idx_list и извлекает информацию о команде и её ответах на вопросы из словаря results.      

Таким образом, созданный датафрейм содержит все необходимые данные об игроках, командах и их ответах на вопросы, которые можно использовать для дальнейшего анализа и обучения модели. Загрузка данных в этой функции происходит в структурированном и последовательном порядке, что обеспечивает простоту и понимание процесса.

In [4]:
def create_df(idx_list):
    df_results = []
    for idx in tqdm(idx_list):
        for team in results[idx]:
            if 'mask' in team and team['mask']:
                mask = team['mask'].translate(str.maketrans('X?', '00'))
                players = team['teamMembers']
                team_id = team['team']['id']
                for player in players:  
                    player_id = player['player']['id']
                    for no_q, answer in enumerate(mask): 
                        df_results.append([idx, team_id, player_id, no_q, answer])
    df = pd.DataFrame(df_results, columns=['tournament_id', 'team_id', 'player_id', 'question', 'answer'])
    return df

In [5]:
df_train = create_df(idx_list_train)

  0%|          | 0/687 [00:00<?, ?it/s]

In [6]:
df_test = create_df(idx_list_test)

  0%|          | 0/422 [00:00<?, ?it/s]

In [7]:
df_train.head()

Unnamed: 0,tournament_id,team_id,player_id,question,answer
0,4772,45556,6212,0,1
1,4772,45556,6212,1,1
2,4772,45556,6212,2,1
3,4772,45556,6212,3,1
4,4772,45556,6212,4,1


## Пункт 2

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_features = ['tournament_id', 'player_id']
X, y = df_train[categorical_features], df_train['answer']
X_test, y_test = df_test[categorical_features], df_test['answer']

preprocessor = ColumnTransformer(
    transformers=[
        ("Dummies", categorical_transformer, categorical_features),
    ]
)
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(solver='liblinear'))]
)
clf.fit(X, y);

In [12]:
# predict probabiliti of right answer
y_pred = clf.predict_proba(X_test)[:, 1]

# mode estimation with ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC-AUC score: {roc_auc:.4f}")

ROC-AUC score: 0.6176


In [14]:
n_players = df_train.player_id.nunique()
n_tourn = df_train.tournament_id.nunique()
powers = clf['classifier'].coef_[0][n_tourn:]
players_ids = sorted(df_train.player_id.unique())
players_rating = dict(zip(players_ids, powers))

## Пункт 3

Функция validation оценивает качество предсказаний модели на тестовых данных, вычисляя корреляции Спирмена и Кендалла между реальными и предсказанными позициями команд в турнирах. Она формирует словарь рейтинга игроков, агрегирует рейтинги по командам и турнирам, и выводит средние значения корреляций.

In [28]:
from scipy import stats

def validation(clf, df, players_ids, results, idx_list, n_tourn):
    # Create a dictionary with player strength
    powers = clf['classifier'].coef_[0][n_tourn:]
    players_rating = dict(zip(players_ids, powers))

    # Add a column with player ratings to the DataFrame
    df['player_rating'] = df['player_id'].map(players_rating)

    # Create the dict_labels from the given results and idx_list_test
    dict_labels = {}
    for idx in tqdm(idx_list):
        dict_labels[idx] = {}
        for team in results[idx]:
            try:
                team_id = team['team']['id']
                team_pos = team['position']
            except:
                continue
            dict_labels[idx][team_id] = team_pos

    # Group data by tournaments and teams, and sum the team ratings
    team_rating = df.groupby(['tournament_id', 'team_id'])['player_rating'].sum().reset_index()
    team_rating.columns = ['tournament_id', 'team_id', 'rating']
    
    team_rating['labels_pred'] = team_rating.groupby('tournament_id')['rating'].rank(ascending=False, method='min')
    team_rating['labels'] = team_rating.apply(lambda row: dict_labels[row['tournament_id']][row['team_id']], axis=1)

    # Calculate correlations separately for each tournament, then find the mean
    spearman_correlations = []
    kendall_correlations = []

    for tournament_id, tournament_data in team_rating.groupby('tournament_id'):
        if len(tournament_data) > 1:
            spearman_corr = stats.spearmanr(tournament_data['labels_pred'], tournament_data['labels']).correlation
            kendall_corr = stats.kendalltau(tournament_data['labels_pred'], tournament_data['labels']).correlation
            
            spearman_correlations.append(spearman_corr)
            kendall_correlations.append(kendall_corr)

    print(f"Spearman correlation: {np.mean(spearman_correlations):.2f}")
    print(f"Kendall correlation: {np.mean(kendall_correlations):.2f}")


In [32]:
players_ids = sorted(df_train.player_id.unique())
df = df_test.groupby(['tournament_id','team_id','player_id']).question.count().reset_index()
n_tourn = df_train.tournament_id.nunique()

In [33]:
validation(
    clf, df,
    players_ids,  
    results,
    idx_list_test, n_tourn
)

  0%|          | 0/422 [00:00<?, ?it/s]

Spearman correlation =  0.78
Kendall correlation =  0.62


## Пункт 4

In [39]:
def train_model(X, y):
    categorical_transformer = OneHotEncoder()
    categorical_features = ['tournament_id', 'player_id']

    preprocessor = ColumnTransformer(
        transformers=[
            ("Dummies", categorical_transformer, categorical_features),
        ]
    )
    clf = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(solver='liblinear'))]
    )
    clf.fit(X, y)
    return clf

def e_step(df, y_pred):
    df = df.copy()
    df['predict'] = y_pred
    df.loc[df['answer'] == 0, 'predict'] = 0
    y_new = df.groupby(['tournament_id', 'team_id', 'question'])['predict'].transform('max')
    return y_new

def m_step(X, y):
    clf = train_model(X, y)
    return clf, clf.predict(X)

def EM_scheme(X, y):
    n_epoch = 5
    clf = train_model(X, y)

    for epoch in range(n_epoch):
        print(f'Epoch {epoch + 1}')
        y_pred = clf.predict(X)
        y = e_step(df_train, y_pred)
        clf, y_pred = m_step(X, y)
        validation(
            clf, df,
            players_ids,  
            results,
            idx_list_test, n_tourn
        )
    return clf

In [None]:
clf_em = EM_scheme(X, y)

Epoch 1


  0%|          | 0/422 [00:00<?, ?it/s]

Spearman correlation =  0.72
Kendall correlation =  0.56
Epoch 2
