In [1]:
import pickle
import pandas as pd
import random
import numpy as np

In [2]:
with open('players.pkl', 'rb') as f:
    players = pickle.load(f)
with open('results.pkl', 'rb') as f:
    results = pickle.load(f)
with open('tournaments.pkl', 'rb') as f:
    tournaments = pickle.load(f)

In [3]:
keys = list(players.keys())
players[keys[0]]

{'id': 1, 'name': 'Алексей', 'patronymic': None, 'surname': 'Абабилов'}

In [9]:
keys = list(results.keys())
results[keys[2000]]

[{'team': {'id': 3586,
   'name': 'Noname',
   'town': {'id': 63, 'name': 'Волгоград'}},
  'mask': '111111000101110111010111111110100010',
  'current': {'name': 'Noname', 'town': {'id': 63, 'name': 'Волгоград'}},
  'questionsTotal': 24,
  'synchRequest': None,
  'position': 1,
  'controversials': [],
  'flags': [],
  'teamMembers': [{'flag': 'Б',
    'usedRating': 4200,
    'rating': 4208,
    'player': {'id': 41255,
     'name': 'Александр',
     'patronymic': 'Сергеевич',
     'surname': 'Акулиничев'}},
   {'flag': 'Б',
    'usedRating': 3484,
    'rating': 4185,
    'player': {'id': 3276,
     'name': 'Александр',
     'patronymic': 'Сергеевич',
     'surname': 'Беседин'}},
   {'flag': 'К',
    'usedRating': 2763,
    'rating': 4153,
    'player': {'id': 42091,
     'name': 'Юрий',
     'patronymic': 'Александрович',
     'surname': 'Воропаев'}},
   {'flag': 'Б',
    'usedRating': 2066,
    'rating': 4133,
    'player': {'id': 29731,
     'name': 'Константин',
     'patronymic': 'Ал

In [6]:
keys = list(tournaments.keys())
tournaments[keys[2]]

{'id': 3,
 'name': 'Турнир в Ижевске',
 'dateStart': '2003-11-22T00:00:00+03:00',
 'dateEnd': '2003-11-24T00:00:00+03:00',
 'type': {'id': 2, 'name': 'Обычный'},
 'season': '/seasons/2',
 'orgcommittee': [],
 'synchData': None,
 'questionQty': None}

### Подготовка данных
Разбиваем на train и test, в train берем 2019 год, в test - 2020. Выкидываем из рассмотрения данные, где нет поля mask или где оно заполнено некорректно, например в рамках одного турнира для разных команд значение в поле mask имеет разную длину.

In [3]:
train = {}
train_players = {}
test = {}
test_players = {}
for tournament_id in tournaments.keys():
    tournament_data = tournaments[tournament_id]
    year = tournament_data['dateStart'][:4]
    if year not in ['2019', '2020']:
        continue
    filtered_results = []
    skip_tournament = False
    mask_len = None
    for team_data in results[tournament_id]:
        if 'mask' not in team_data:
            continue
        if team_data['mask'] is None:
            continue
        if mask_len is None:
            mask_len = len(team_data['mask'])
        if sum([e not in ['0','1'] for e in team_data['mask']]) > 0:
            skip_tournament = True
            break
        if mask_len != len(team_data['mask']):
            skip_tournament = True
            break
        if len(team_data['teamMembers']) == 0:
            continue
        filtered_results.append(team_data)
    if skip_tournament:
        continue
    if year == '2019':
        for team_data in filtered_results:
            ids = [e['player']['id'] for e in team_data['teamMembers']]
            for id in ids:
                if id in train_players:
                    train_players[id]['games'] += 1
                else:
                    train_players[id] = {'games': 1}
        train[tournament_id] = filtered_results
    if year == '2020':
        test[tournament_id] = filtered_results
        for team_data in filtered_results:
            ids = [e['player']['id'] for e in team_data['teamMembers']]
            for id in ids:
                if id in test_players:
                    test_players[id]['games'] += 1
                else:
                    test_players[id] = {'games': 1}

Отбираем игроков, которые сыграли в более, чем 10 играх, для таких игроков на основе имеющихся данных можно более достоверно предсказывать рейтинг

In [4]:
train_players_ids = [e for e in train_players if train_players[e]['games'] > 10]
len(train_players_ids)

8013

In [5]:
train_players_ids_set = set(train_players_ids)
filtered_train = {}
for tournament_id in train:
    teams = train[tournament_id]
    filtered_teams = []
    for team_data in teams:
        ids = [e['player']['id'] for e in team_data['teamMembers']]
        curr_players = list(set(ids) & train_players_ids_set)
        if len(curr_players) == 0:
            continue
        filtered_teams.append({'mask': team_data['mask'], 'ids': curr_players})
    filtered_train[tournament_id] = filtered_teams

In [28]:
df_players = pd.DataFrame(players).transpose()
df_players

Unnamed: 0,id,name,patronymic,surname
1,1,Алексей,,Абабилов
10,10,Игорь,,Абалов
11,11,Наталья,Юрьевна,Абалымова
12,12,Артур,Евгеньевич,Абальян
13,13,Эрик,Евгеньевич,Абальян
...,...,...,...,...
224700,224700,Артём,Евгеньевич,Садов
224701,224701,Даниил,Олегович,Трефилов
224702,224702,Владимир,Араратович,Басенцян
224703,224703,Руслан,Ринатович,Дауранов


In [78]:
res = {e: {'id': e, 'name': players[e]['name'], 'patronymic': players[e]['patronymic'], 'surname': players[e]['surname'], 'questions': 0, 'right_answers': 0} for e in train_players_ids}
for tournament_id in train:
    teams = train[tournament_id]
    for team in teams:
        right_answers = team['mask'].count('1')
        questions = len(team['mask'])
        for member in team['teamMembers']:
            id = member['player']['id']
            res[id]['questions'] += questions
            res[id]['right_answers'] += right_answers
res

{131077: {'id': 131077,
  'name': 'Паруйр',
  'patronymic': 'Паруйрович',
  'surname': 'Аланакян',
  'questions': 1537,
  'right_answers': 783},
 131080: {'id': 131080,
  'name': 'Иван',
  'patronymic': 'Владимирович',
  'surname': 'Мешков',
  'questions': 598,
  'right_answers': 214},
 131082: {'id': 131082,
  'name': 'Валерия',
  'patronymic': 'Андреевна',
  'surname': 'Кан',
  'questions': 4788,
  'right_answers': 2690},
 131083: {'id': 131083,
  'name': 'Валерий',
  'patronymic': 'Рафаилович',
  'surname': 'Аввакумов',
  'questions': 1445,
  'right_answers': 839},
 15: {'id': 15,
  'name': 'Олег',
  'patronymic': 'Игоревич',
  'surname': 'Абарников',
  'questions': 562,
  'right_answers': 202},
 16: {'id': 16,
  'name': 'Азер',
  'patronymic': 'Абасали оглы',
  'surname': 'Абасалиев',
  'questions': 234,
  'right_answers': 129},
 23: {'id': 23,
  'name': 'Андрей',
  'patronymic': 'Николаевич',
  'surname': 'Абащенко',
  'questions': 36,
  'right_answers': 18},
 131098: {'id': 13109

In [79]:
df_players_train = pd.DataFrame(res).transpose()
df_players_train

Unnamed: 0,id,name,patronymic,surname,questions,right_answers
131077,131077,Паруйр,Паруйрович,Аланакян,1537,783
131080,131080,Иван,Владимирович,Мешков,598,214
131082,131082,Валерия,Андреевна,Кан,4788,2690
131083,131083,Валерий,Рафаилович,Аввакумов,1445,839
15,15,Олег,Игоревич,Абарников,562,202
...,...,...,...,...,...,...
208409,208409,Дмитрий,Александрович,Ашуров,288,63
131055,131055,Полина,Игоревна,Трощенко,90,11
208410,208410,Артём,Дмитриевич,Ворошилов,288,63
131061,131061,Владимир,Михайлович,Воронов,561,209


In [6]:
def compare_masks(mask1, mask2):
    first_better = 0
    equals = 0
    count = min(len(mask1), len(mask2))
    for i in range(count):
        first_better += mask1[i] > mask2[i]
        equals += mask1[i] == mask2[i]
    return (count, first_better, equals)

Введем $\gamma_k$ - рейтинги игроков, такие что:
p(i>j) = $\frac{\gamma_i}{\gamma_i + \theta\gamma_j}$ - вероятность того, что i-ый игрок ответит правильно, а j-ый нет, тогда p(i=j) = 1 - $\frac{\gamma_i}{\gamma_i + \theta\gamma_j} - \frac{\gamma_j}{\gamma_j + \theta\gamma_i}$ - вероятность, что либо оба ответят правильно, либо оба ответят неправильно

In [30]:
res = {}
for tournament_id in filtered_train:
    teams = filtered_train[tournament_id]
    n_teams = len(teams)
    for idx1 in range(n_teams):
        for idx2 in range(idx1 + 1, n_teams):
            questions, first_better, equals = compare_masks(teams[idx1]['mask'], teams[idx2]['mask'])
            for id1 in teams[idx1]['ids']:
                for id2 in teams[idx2]['ids']:
                    key1 = (id1, id2)
                    key2 = (id2, id1)
                    if key1 in res:
                        res[key1]['count'] += questions
                        res[key1]['i>j'] += first_better
                        res[key1]['i=j'] += equals
                    elif key2 in res:
                        res[key2]['count'] += questions
                        res[key2]['i>j'] += questions - first_better - equals
                        res[key2]['i=j'] += equals
                    else:
                        res[key1] = {'id1': id1, 'id2': id2, 'count': questions, 'i>j': first_better, 'i=j': equals}

res

{(15456, 40840): {'id1': 15456,
  'id2': 40840,
  'count': 390,
  'i>j': 103,
  'i=j': 262},
 (15456, 16206): {'id1': 15456,
  'id2': 16206,
  'count': 246,
  'i>j': 65,
  'i=j': 166},
 (15456, 1584): {'id1': 15456,
  'id2': 1584,
  'count': 327,
  'i>j': 75,
  'i=j': 229},
 (15456, 1585): {'id1': 15456,
  'id2': 1585,
  'count': 387,
  'i>j': 50,
  'i=j': 277},
 (15456, 10998): {'id1': 15456,
  'id2': 10998,
  'count': 318,
  'i>j': 77,
  'i=j': 221},
 (6212, 40840): {'id1': 6212,
  'id2': 40840,
  'count': 687,
  'i>j': 181,
  'i=j': 456},
 (6212, 16206): {'id1': 6212,
  'id2': 16206,
  'count': 430,
  'i>j': 110,
  'i=j': 286},
 (6212, 1584): {'id1': 6212,
  'id2': 1584,
  'count': 471,
  'i>j': 124,
  'i=j': 314},
 (6212, 1585): {'id1': 6212, 'id2': 1585, 'count': 635, 'i>j': 84, 'i=j': 465},
 (6212, 10998): {'id1': 6212,
  'id2': 10998,
  'count': 543,
  'i>j': 143,
  'i=j': 361},
 (26089, 40840): {'id1': 26089,
  'id2': 40840,
  'count': 184,
  'i>j': 35,
  'i=j': 135},
 (26089, 

Зафиксируем $\theta$ и будем итеративно подбирать $\gamma_k$ Команды в данном случае не рассматриваются, все игроки рассматриваются независимо

In [8]:
teta = 1.1
gamma = {id: 1./len(train_players_ids) for id in train_players_ids}
for _ in range(1):
    gamma_new = {id: {'wins': 0, 'sum1': 0, 'sum2': 0, 'count': 0} for id in train_players_ids}
    for r in res:
        row = res[r]
        id1 = row['id1']
        id2 = row['id2']
        gamma_new[id1]['count'] += row['count']
        gamma_new[id2]['count'] += row['count']
        gamma_new[id1]['wins'] += row['i>j']
        gamma_new[id2]['wins'] += row['count'] - row['i>j'] - row['i=j']
        gamma_new[id1]['sum1'] += row['i>j'] / (gamma[id1] + teta * gamma[id2])
        gamma_new[id2]['sum1'] += (row['count'] - row['i>j'] - row['i=j']) / (gamma[id2] + teta * gamma[id1])
        gamma_new[id1]['sum2'] += (row['count'] - row['i>j'] - row['i=j']) * teta / (gamma[id2] + teta * gamma[id1])
        gamma_new[id2]['sum2'] += row['i>j'] * teta / (gamma[id1] + teta * gamma[id2])
    s = 0.0
    for k in gamma:
        gamma[k] = gamma_new[k]['wins'] / (gamma_new[id1]['sum1'] + gamma_new[id1]['sum2'])
        s += gamma[k]
    for k in gamma:
        gamma[k] /= s

In [9]:
from scipy import stats

In [31]:
n_tournaments = 0
correlation_spearmanr = 0.0
correlation_kendall = 0.0
for tournament_id in list(test.keys()):
    tournament = test[tournament_id]
    if len(tournament) == 0:
        continue
    real_order = []
    predicted_order = []
    unknown = True
    for team_data in tournament:
        real_order.append(team_data['position'])
        score = 0.0
        count = 0
        for member in team_data['teamMembers']:
            id = member['player']['id']
            if id in gamma:
                score += gamma[id]
                count += 1
        if count == 0:
            predicted_order.append(0)
        else:
            unknown = False
            predicted_order.append(score / count)
    if unknown or len(predicted_order) <= 1:
        continue
    predicted_order.reverse()
    correlation_spearmanr += stats.spearmanr(predicted_order, real_order).correlation
    tau, _ = stats.kendalltau(predicted_order, real_order)
    correlation_kendall += tau
    n_tournaments += 1
print(f'Spearmanr correlation: {correlation_spearmanr / n_tournaments}')
print(f'Kendall correlation: {correlation_kendall / n_tournaments}')

Spearmanr correlation: 0.6222815643575041
Kendall correlation: 0.4772904777018144


Попробуем учесть команды и сложности вопросов

In [11]:
from scipy.optimize import minimize

Для удобства соберем dict с информацией по игрокам в каких играх и командах он играл

In [12]:
players_games = {}
for idx in filtered_train:
    teams = filtered_train[idx]
    for team_idx in range(len(teams)):
        team = teams[team_idx]
        for player_id in team['ids']:
            if player_id in players_games:
                players_games[player_id].append({'game_id': idx, 'team_idx': team_idx})
            else:
                players_games[player_id] = [{'game_id': idx, 'team_idx': team_idx}]

В данных есть случаи, когда какие-то игроки всегда играли только в одних и тех же командах, в таком случае у нас нет данных, для того, чтобы отличить таких игроков по рейтингу, поэтому будем считать их "одним" игроком

In [13]:
similar_players = {}
for player_id in players_games:
    games = players_games[player_id]
    game_ids = tuple(sorted([e['game_id'] for e in games]))
    if game_ids in similar_players:
        similar_players[game_ids].append(player_id)
    else:
        similar_players[game_ids] = [player_id]
        
for k in similar_players:
    similar_players[k] = sorted(similar_players[k])

In [14]:
train_players_ids = [sorted(similar_players[k])[0] for k in similar_players]

Инициализируем сложности вопросов и рейтинги игроков какими-то начальными значениями, например можно инициализировать сложности вопросов по тому какой процент команд в данном турнире на него ответил. Далее будем поэтапно подбирать параметры сложности вопросов и значения рейтингов. Считаем рейтинг заданной команды как средний по рейтингу игроков, ошибку считаем на основе разности между рейтингом и сложностью вопроса и тем правильный был ответ или нет. На каждом шаге пытаемся минимизировать ошибку

In [15]:
players_scores = {k: random.random() for k in train_players_ids}

In [16]:
def get_player_score(player_id, players_data):
    if player_id not in train_players_ids:
        return None
    return players_data[player_id]

def get_team_score(team, players_data):
    s = 0.0
    ii = 0
    for curr_id in team['ids']:
        score = get_player_score(curr_id, players_data)
        if score is not None:
            s += score
            ii += 1
    if ii == 0:
        return (None, None)
    return (s, ii)

for idx in filtered_train:
    teams = filtered_train[idx]
    for team in teams:
        team['score'] = get_team_score(team, players_scores)

In [17]:
question_scores = {}
for idx in filtered_train:
    games = filtered_train[idx]
    if len(games) == 0:
        continue
    l = len(games[0]['mask'])
    #question_scores[idx] = [random.random() for k in range(l)]
    question_scores[idx] = []
    for k in range(l):
        s = ''
        for i in range(len(games)):
            s += games[i]['mask'][k]
        question_scores[idx].append(float(s.count('1')) / l)

In [18]:
def get_optim_game_score(game_id, question_idx):
    teams = filtered_train[game_id]
    teams_scores = []
    for team in teams:
        s, ii = team['score']
        if s is None:
            teams_scores.append(None)
            continue
        teams_scores.append(s / ii)
    def count_error(score):
        error = 0.0
        for idx in range(len(teams)):
            team = teams[idx]
            team_score = teams_scores[idx]
            if team_score is None:
                continue
            if team['mask'][question_idx] == '0':
                if team_score > score:
                    error += team_score - score
            else:
                if team_score < score:
                    error += score - team_score
        return error
    res = minimize(count_error, question_scores[game_id][question_idx], method='BFGS', options={'maxiter': 1})
    return res.x[0]

def get_optim_player_score(player_id):
    games = players_games[player_id]
    def count_error(score):
        error = 0.0
        for game in games:
            game_id = game['game_id']
            team_idx = game['team_idx']
            teams = filtered_train[game_id]
            team = teams[team_idx]
            team_score = team['score']
            if team_score is None:
                continue
            s, ii = team_score
            s -= players_scores[player_id]
            team_score = (s + score) / ii
            for idx in range(len(team['mask'])):
                question_res = team['mask'][idx]
                question_score = question_scores[game_id][idx]
                if question_res == '0':
                    if team_score > question_score:
                        error += team_score - question_score
                else:
                    if team_score < question_score:
                        error += question_score - team_score
        return error
    res = minimize(count_error, players_scores[player_id], method='BFGS', options={'maxiter': 1})
    return res.x[0]

In [32]:
for _ in range(1):
    for game_id in question_scores:
        questions = question_scores[game_id]
        for idx in range(len(questions)):
            question = questions[idx]
            score = get_optim_game_score(game_id, idx)
            questions[idx] = score
    for idx in range(len(train_players_ids)):
        games = players_games[player_id]
        game_ids = tuple(sorted([e['game_id'] for e in games]))
        player_id = train_players_ids[idx]
        old_score = players_scores[player_id]
        players_scores[player_id] = get_optim_player_score(player_id)
        for game in players_games[train_players_ids[idx]]:
            team = filtered_train[game['game_id']][game['team_idx']]
            s, ii = team['score']
            if s is None:
                continue
            team['score'] = (s - old_score + players_scores[player_id], ii)

Расчет корреляций для тестовой выборки

In [34]:
n_tournaments = 0
correlation_spearmanr = 0.0
correlation_kendall = 0.0
for tournament_id in list(test.keys()):
    tournament = test[tournament_id]
    if len(tournament) == 0:
        continue
    real_order = []
    predicted_order = []
    unknown = True
    for team_data in tournament:
        real_order.append(team_data['position'])
        score = 0.0
        count = 0
        for member in team_data['teamMembers']:
            id = member['player']['id']
            if id in train_players_ids:
                score += players_scores[id]
                count += 1
        if count == 0:
            predicted_order.append(0)
        else:
            unknown = False
            predicted_order.append(score / count)
    if unknown or len(predicted_order) <= 1:
        continue
    predicted_order.reverse()
    correlation_spearmanr += stats.spearmanr(predicted_order, real_order).correlation
    tau, _ = stats.kendalltau(predicted_order, real_order)
    correlation_kendall += tau
    n_tournaments += 1
print(f'Spearmanr correlation: {correlation_spearmanr / n_tournaments}')
print(f'Kendall correlation: {correlation_kendall / n_tournaments}')

Spearmanr correlation: 0.6325773318514789
Kendall correlation: 0.4902036866538402
