In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
#Calling pandas() on a tqdm instance is presumably what makes the
#DataFrame.progress_apply used further down available; creating the instance
#also renders a progress widget under this cell
tqdm().pandas()

#File names for the puzzle-pair tables built below (different-rating and
#same-rating variants)
DIFF_RATING_OUTPUT_NAME = 'diff_rating_pred_data.csv'
SAME_RATING_OUTPUT_NAME = 'same_rating_pred_data.csv'

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
#Load the puzzle metadata (a tab-separated table) into `problems`
problems = pd.read_csv(
    "../../ChessPuzzleEmbeddings/optimal_meta.tsv",
    sep='\t',
)

In [3]:
#Read the attempts dataset, then keep only the rows whose userGamesPlayed is at
#least 1000 (i.e. drop users with fewer than 1k games)
#NOTE(review): the path is absolute and machine-specific
attempts = pd.read_csv("/w/225/1/chess/tactics/glicko_user_tactics_problem.csv_00")
played_enough = attempts['userGamesPlayed'] >= 1000
attempts = attempts.loc[played_enough]

In [4]:
#Assign rating labels to problems in metadata so that problems
#with ratings within the same range have the same label
start = 375
end = 3025
interval = 25
#Bin edges start, start+interval, ...; np.arange excludes `end` itself
bin_edges = np.arange(start,end,interval)
#One integer label (1, 2, ...) per bin. The number of labels is derived from the
#edges so it always equals the number of bins, even if (end-start) is not a
#multiple of interval. Ratings outside the bins get a missing label.
problems['rating_labels'] = pd.cut(problems['rating'],bin_edges,labels=range(1,len(bin_edges)))
#Merge problem metadata to join cluster and rating labels to the attempts by puzzle ID.
#Columns left behind by an earlier run of this cell are dropped first; otherwise the
#merge would suffix them (cluster_x / cluster_y) and later cells would not find 'cluster'.
attempts = pd.merge(
    attempts.drop(columns=['cluster','rating_labels'],errors='ignore'),
    problems[['tactics_problem_id','cluster','rating_labels']],
    on='tactics_problem_id',
    how='inner',
)
#Get list of individual users
users = list(set(attempts['user_hash'].values))

In [6]:
#Check if the two puzzles, given by their ID belong to the same cluster or not
def test_tuple(tup):
    prob_1 = problems.loc[problems['tactics_problem_id'] == tup[0]]['cluster'].values[0]
    prob_2 = problems.loc[problems['tactics_problem_id'] == tup[1]]['cluster'].values[0]
    return prob_1 != prob_2
#Check if the two puzzles, given by their ID belong to the same cluster or not
def test_rating_labels(tup):
    prob_1 = problems.loc[problems['tactics_problem_id'] == tup[0]]['rating_labels'].values[0]
    prob_2 = problems.loc[problems['tactics_problem_id'] == tup[1]]['rating_labels'].values[0]
    return prob_1 != prob_2

In [7]:
#Keep only the attempts that belong to a (user, cluster) pair with strictly more
#than cluster_games_played attempts
cluster_games_played = 100
#Number of attempts per (user, cluster) pair, in a one-column frame 'cluster_count'
user_cluster_count = (
    attempts.groupby(['user_hash','cluster'])['tactics_problem_id']
    .count()
    .to_frame()
    .rename(columns={'tactics_problem_id':'cluster_count'})
)
#A 'cluster_count' column left by an earlier run of this cell is dropped before
#merging; otherwise the merge would produce cluster_count_x / cluster_count_y and
#the filter below would fail. On a first run the drop is a no-op.
attempts = pd.merge(
    attempts.drop(columns='cluster_count',errors='ignore'),
    user_cluster_count,
    on=['user_hash','cluster'],
    how='inner',
)
attempts = attempts.loc[attempts.cluster_count > cluster_games_played]
attempts.shape

(67259, 18)

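<h2>Pass/Fail Sampling &amp; Dataset Creation</h2>
<p>Run the first cell below to create the dataset of puzzle pairs with different ratings, or the second cell to create the dataset of pairs with the same rating. Both cells store their result in <code>id_table</code>, so the cell that was run last determines which dataset the following steps work on.</p>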

In [None]:
#Creation of different rating dataset
#For every user, sample (passed puzzle, failed puzzle) pairs and keep those whose
#two puzzles carry different rating labels, up to a fixed number per user
from itertools import product
import random
max_pairs_per_user = 200   #upper bound on the pairs kept for one user
checkpoint_every = 10      #the CSV is rewritten after every this many users
#(user, passed puzzle, failed puzzle) rows. They are collected in a list and turned
#into a DataFrame in one go, instead of growing a DataFrame row by row.
pair_rows = []
#NOTE: `users` was collected before the per-cluster filter, so some users may have
#no attempts left; they simply contribute no pairs
for i, user in enumerate(tqdm(users), start=1):
    print('User: ' + user)
    user_list = attempts.loc[attempts["user_hash"] == user]
    success = list(user_list.loc[user_list["is_passed"] == 1]['tactics_problem_id'])
    failed = list(user_list.loc[user_list["is_passed"] == 0]['tactics_problem_id'])
    #Unique (passed, failed) pairs as a sorted list: random.sample() needs a sequence
    pairs = sorted(set(product(success,failed)))
    #Repeated attempts of one puzzle make the lists longer than the number of unique
    #pairs, so the sample size is additionally capped by len(pairs)
    combs = random.sample(pairs,min(len(success),len(failed),len(pairs)))
    #The sample is drawn without replacement from unique pairs, so no duplicate
    #check is needed while collecting
    kept = 0
    for comb in combs:
        if kept >= max_pairs_per_user:
            break
        if test_rating_labels(comb):
            pair_rows.append((user,comb[0],comb[1]))
            kept += 1
    if i % checkpoint_every == 0:
        pd.DataFrame(pair_rows,columns=[0,1,2]).to_csv(DIFF_RATING_OUTPUT_NAME, mode='w', header=False,index=False)

#Columns are named 0, 1, 2 (user, passed puzzle, failed puzzle)
id_table = pd.DataFrame(pair_rows,columns=[0,1,2])
#Final write, so that the users processed after the last checkpoint are stored too
id_table.to_csv(DIFF_RATING_OUTPUT_NAME, mode='w', header=False,index=False)

In [None]:
#Creation of the same rating dataset
#For every user and every rating label, sample (passed puzzle, failed puzzle) pairs
#among the puzzles with that label and keep those that lie in different clusters
from itertools import product
import random
#(user, passed puzzle, failed puzzle) rows. They are collected in a list and turned
#into a DataFrame in one go, instead of growing a DataFrame row by row.
pair_rows = []
#NOTE: `users` was collected before the per-cluster filter, so some users may have
#no attempts left; they simply contribute no pairs
for user in tqdm(users):
    print('User: ' + user)
    seen = set()   #pairs already stored for this user
    user_list = attempts.loc[attempts["user_hash"] == user]
    #Rating range labels occurring in this user's attempts
    labels = set(user_list['rating_labels'].dropna())
    for val in labels:
        #The former per-label proportion (`porp`, read from a `porps` table that is
        #not defined in this notebook) was never used and is no longer computed
        #Get all problems completed by the user with this rating label
        probs = user_list.loc[user_list['rating_labels'] == val]
        success = list(probs.loc[probs["is_passed"] == 1]['tactics_problem_id'])
        failed = list(probs.loc[probs["is_passed"] == 0]['tactics_problem_id'])
        #Unique (passed, failed) pairs as a sorted list: random.sample() needs a sequence
        pairs = sorted(set(product(success,failed)))
        #Repeated attempts of one puzzle make the lists longer than the number of
        #unique pairs, so the sample size is additionally capped by len(pairs)
        combs = random.sample(pairs,min(len(success),len(failed),len(pairs)))
        for comb in combs:
            if comb not in seen and test_tuple(comb):
                pair_rows.append((user,comb[0],comb[1]))
                seen.add(comb)

#Columns are named 0, 1, 2 (user, passed puzzle, failed puzzle)
#NOTE: unlike the different-rating cell, this cell keeps its result in memory only
id_table = pd.DataFrame(pair_rows,columns=[0,1,2])

<h2>Clean Data and Prepare for the Prediction Task</h2>

In [9]:
#Give the collected puzzle pairs named columns and store both puzzle IDs as int32
id_table = (
    id_table
    .rename(columns={0:'user_hash',1:'puzzle_1',2:'puzzle_2'})
    .astype({'puzzle_1': 'int32','puzzle_2':'int32'})
)

In [10]:
#Attach cluster, rating and tag from the problem metadata to both puzzles of every
#pair; the joined columns get the suffix _1 (for puzzle_1) or _2 (for puzzle_2)
puzzle_meta = problems[['tactics_problem_id','cluster','rating','tag']]
for side in ('1', '2'):
    id_table = (
        pd.merge(id_table,puzzle_meta,left_on='puzzle_' + side, right_on='tactics_problem_id',how='inner')
        .rename(columns={'cluster':'cluster_' + side,'rating':'rating_' + side,'tag':'tag_' + side})
        .drop('tactics_problem_id',axis=1)
    )

In [11]:
#Randomly swap the two puzzles of each row. passed_puzzle records which side holds
#the passed puzzle afterwards: 0 -> puzzle_1 (and the other *_1 fields),
#1 -> puzzle_2 (and the other *_2 fields)
def flip(row,field):
    """Swap row[field + '_1'] and row[field + '_2'] in place and return `row`."""
    first, second = field + '_1', field + '_2'
    row[first], row[second] = row[second], row[first]
    return row
    

def flip_fields(row):
    """Swap the _1/_2 values of cluster, rating, tag and puzzle when passed_puzzle is 1.

    Rows whose 'passed_puzzle' is not 1 are returned untouched.
    """
    if row['passed_puzzle'] != 1:
        return row
    for field in ('cluster', 'rating', 'tag', 'puzzle'):
        row = flip(row,field)
    return row

#Every pair starts with passed_puzzle = 0.
#NOTE: running this cell again resets the column to 0 even for rows whose puzzles
#an earlier run has already swapped
id_table['passed_puzzle'] = np.zeros(len(id_table), dtype=int)

def switch_win(row):
    """Swap the row's two puzzles when a uniform draw from [0, 1] exceeds 0.5.

    A swapped row gets 'passed_puzzle' set to 1 and its _1/_2 fields exchanged by
    flip_fields; otherwise the row is returned unchanged.
    """
    keep_order = random.uniform(0,1) <= 0.5
    if keep_order:
        return row
    row['passed_puzzle'] = 1
    return flip_fields(row)
        
#Apply the random swap to every row, with a progress bar
id_table = id_table.progress_apply(switch_win, axis=1)

HBox(children=(IntProgress(value=0, max=7701), HTML(value='')))




In [12]:
#Preview the first five rows of the prepared pair table
id_table.head(5)

Unnamed: 0,user_hash,puzzle_1,puzzle_2,cluster_1,rating_1,tag_1,cluster_2,rating_2,tag_2,passed_puzzle
0,ee0e44518695a3481a3ed59b545c4719d54e894fb3d5e0...,35635,28567,4,1151,Overloading,5,1175,Fork / Double Attack,0
1,01098357c866093611fa8a576bb5bade948458bdd8dd39...,28567,36085,5,1175,Fork / Double Attack,2,1168,Mate in 3+,1
2,01098357c866093611fa8a576bb5bade948458bdd8dd39...,28567,27327,5,1175,Fork / Double Attack,2,1167,Pin,1
3,9adee4e89a43a5f412bb4227433937655e99a11426c109...,28567,32975,5,1175,Fork / Double Attack,2,1161,Pin,1
4,9adee4e89a43a5f412bb4227433937655e99a11426c109...,34782,28567,2,1164,Overloading,5,1175,Fork / Double Attack,0
