In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

# <h2>Exact Duplicates Investigation</h2>

In [None]:
# Load the tactics problem table that every later cell works on.
# NOTE(review): relative file name without an extension — confirm it resolves from the notebook's working directory.
tactics_data = pd.read_csv("modified tactics data")
tactics_data

In [None]:
#tactics_data = pd.read_csv('/w/225/1/chess/tactics/tactics_problem.csv')
#tactics_data

In [None]:
#a function to extract ONLY the field indicated by parameter if it exists. Otherwise, "N/A" is returned
def extract(st, parameter):
    """Return the cleaned value of the tag line for `parameter`, or "N/A".

    `st` is split on the two-character sequence backslash + "n". A line whose
    tag name (the text right after "[") is exactly `parameter` is preferred;
    if there is none, the first line that merely contains `parameter` is used,
    which is what the previous implementation always did.

    The tag name, "[", "]" and double quotes are removed from the chosen line.
    Whitespace that separated the tag name from the value is kept, so
    '[Date "2008.01.01"]' yields ' 2008.01.01' (leading space included).
    """
    def _is_tag_line(line):
        # True when the line looks like '[<parameter> ...' with the exact tag name.
        head = line.lstrip()
        if not head.startswith('['):
            return False
        rest = head[1:].lstrip()
        if not rest.startswith(parameter):
            return False
        following = rest[len(parameter):len(parameter) + 1]
        return following == '' or following.isspace() or following == '"'

    lines = st.split("\\n")
    target_st = next((line for line in lines if _is_tag_line(line)), None)
    if target_st is None:
        # Fall back to the historical substring match.
        target_st = next((line for line in lines if parameter in line), None)
    if target_st is None:
        return "N/A"

    # Remove only the first occurrence (the tag name); the value may legitimately
    # contain the same word, e.g. [White "White, J"].
    target_st = target_st.replace(parameter, '', 1)
    for decoration in ('[', ']', '"'):
        target_st = target_st.replace(decoration, '')
    return target_st

In [None]:
#Create a FULL column, append it to the tactics_data. Repeat this same process for FEN, Event, and so on.
#Every new column starts out as 'N/A' and is then filled with the value extract() finds in the row's PGN text.

# Tag names to pull out of the PGN text; one new column per tag, appended in this order.
pgn_tags = ['FULL', 'FEN', 'Event', 'Site', 'Date', 'Round', 'White', 'Black',
            'Result', 'FirstMove', 'PlyCount', 'SetUp']
for tag in pgn_tags:
    tactics_data[tag] = 'N/A'

row, col = tactics_data.shape
puzz_id = list(tactics_data['tactics_problem_id'])

# Row position of the first occurrence of each problem id. This gives the same
# answer as puzz_id.index(...) but with a constant-time lookup instead of a scan per row.
first_row_of_id = {}
for position, problem in enumerate(puzz_id):
    first_row_of_id.setdefault(problem, position)

# Read the two input columns once instead of one scalar .iloc call per cell.
# NOTE(review): column 0 is used as the problem id and column 7 as the PGN text,
# exactly as before — confirm against the CSV layout.
id_column = tactics_data.iloc[:, 0].to_numpy()
pgn_column = tactics_data.iloc[:, 7].to_numpy()

extracted = {tag: [] for tag in pgn_tags}
for i in tqdm(range(row)):
    pos = first_row_of_id[id_column[i]]
    pgn = pgn_column[pos]
    for tag in pgn_tags:
        extracted[tag].append(extract(pgn, tag))

# Write by column name rather than by positions 8..19; identical when the CSV
# has exactly eight columns, and not silently misplaced otherwise.
for tag in pgn_tags:
    tactics_data[tag] = extracted[tag]

In [None]:
#Collect every (FEN, FULL) tuple whose two fields were both found, i.e. neither one is the N/A marker.

full_list_ori = np.array(tactics_data['FULL'])
fen_list_ori = np.array(tactics_data['FEN'])
fen_full_list_cleaned = [
    (fen_value, full_value)
    for fen_value, full_value in zip(fen_list_ori, full_list_ori)
    if not (full_value == 'N/A' or fen_value == 'N/A')
]

In [None]:
#fen_full_to_ids maps each (FEN, FULL) tuple to the list of ALL problem IDs that carry exactly that pair.

# Start every known pair with an empty id list.
fen_full_to_ids = {fen_full: [] for fen_full in fen_full_list_cleaned}

r, c = tactics_data.shape
for i in tqdm(range(r)):
    prob_id = tactics_data.iloc[i, 0]
    full = tactics_data.iloc[i, 8]
    fen = tactics_data.iloc[i, 9]
    # Rows missing either field are not part of any pair.
    if full == "N/A" or fen == "N/A":
        continue
    fen_full_to_ids[(fen, full)].append(prob_id)

In [None]:
#dup_fen_full_to_ids is the subset of fen_full_to_ids whose id list holds two or more problems,
#i.e. the pairs that have at least one exact duplicate.
dup_fen_full_to_ids = {
    fen_full: id_list
    for fen_full, id_list in fen_full_to_ids.items()
    if len(id_list) > 1
}

In [None]:
dup_fen_full_to_ids

In [None]:
#duplicate_fen_full_id_to_id: list of (ID, other ID) tuples for problems sharing the same (FEN, FULL).
#When more than two problems are duplicates of each other, every ordered 2-way pairing is listed.
duplicate_fen_full_id_to_id = []
#target_problems: every problem that has at least one duplicate; these are the ones whose
#(reciprocal) rank is assessed later.
target_problems = []
#dup_fen_full_list: the (FEN, FULL) tuple of each entry of target_problems, in the same order.
dup_fen_full_list = []
#group: the group number of each entry of target_problems. Problems with the same number are exact
#duplicates of each other, e.g. 752 -> group 1 and 765 -> group 1 means 752 and 765 share FEN and FULL.
group = []
#ind is the running group number (1 for the first duplicated pair, 2 for the next, ...).
ind = 0

for fen_full, prob_id_list in dup_fen_full_to_ids.items():
    ind += 1
    member_count = len(prob_id_list)
    target_problems.extend(prob_id_list)
    group.extend([ind] * member_count)
    dup_fen_full_list.extend([fen_full] * member_count)
    # All ordered pairs of different IDs inside this group.
    duplicate_fen_full_id_to_id.extend(
        (left_id, right_id)
        for left_id in prob_id_list
        for right_id in prob_id_list
        if left_id != right_id
    )

In [None]:
dup_fen_full_list

In [None]:
# Rows of the duplicated problems only, in target_problems order, indexed by problem id.
new_table_fen_full_pair = tactics_data.set_index("tactics_problem_id").loc[target_problems]
# Each insert goes to position 0, so "Group" ends up first and "(FEN,FULL)" second.
new_table_fen_full_pair.insert(0, "(FEN,FULL)", dup_fen_full_list)
new_table_fen_full_pair.insert(0, "Group", group)

In [None]:
# Persist the duplicates-only table; the tactics_problem_id index is written as the first CSV column.
new_table_fen_full_pair.to_csv("new_tactics_problem.csv")

In [None]:
# Read the saved duplicates table back. Later cells address its columns by position
# (column 0 is used as the problem id, column 1 as the group number).
temp = pd.read_csv("new_tactics_problem.csv")
temp

# <h2> Summary Statistics of Dates for Each Group </h2>

In [None]:
# Keep only the columns needed for the per-group date statistics.
# .copy() makes group_date an independent frame: the following cells modify it in place,
# and without the copy those writes would target a slice of temp (SettingWithCopyWarning).
group_date = temp[['Group','Date','tactics_problem_id']].copy()
group_date

In [None]:
#Drop the rows which have incomplete dates because we can't use those kind of information anyway.
#For the rest, convert the PGN text form YYYY.MM.DD into datetime.date objects (shown as YYYY-MM-DD).

from datetime import date as dt

def _parse_pgn_date(raw):
    """Return a datetime.date for a complete 'YYYY.MM.DD' value, otherwise None.

    Surrounding whitespace is ignored. Placeholders such as '????' or '??', a value
    that does not have three dot-separated parts (e.g. 'N/A' or NaN) and impossible
    calendar dates all yield None instead of raising.
    """
    parts = [part.strip() for part in str(raw).split('.')]
    if len(parts) != 3:
        return None
    try:
        return dt(int(parts[0]), int(parts[1]), int(parts[2]))
    except ValueError:
        return None

# Work on an independent frame so the assignment below never writes into a slice of temp.
group_date = group_date.copy()
# dtype=object keeps the values as plain datetime.date objects.
group_date['Date'] = pd.Series(
    [_parse_pgn_date(raw) for raw in group_date['Date']],
    index=group_date.index,
    dtype=object,
)
# Rows whose date could not be parsed are removed.
group_date = group_date[group_date['Date'].notna()]
group_date

In [None]:
#Group the group_date dataframe by the Group and find the mean, standard deviation, min, max, and range of the dates.
#Date_int is the date as an integer: nanoseconds since the Unix epoch (the integer view of datetime64[ns]).

# assign() returns a new frame, so this never writes into a slice of another frame.
# 'int64' is explicit because plain int is not 64-bit on every platform.
group_date = group_date.assign(Date_int=pd.to_datetime(group_date['Date']).astype('int64'))

# Aggregate only Date_int; the other columns (date objects, problem ids) have no meaningful mean/std.
date_int_stats = group_date.groupby('Group')['Date_int'].agg(['mean', 'std', 'min', 'max'])

# Convert the integer statistics back: points in time for mean/min/max, a duration for std.
res = pd.DataFrame({
    'Date_mean': pd.to_datetime(date_int_stats['mean']),
    'Date_std': pd.to_timedelta(date_int_stats['std']),
    'Date_min': pd.to_datetime(date_int_stats['min']),
    'Date_max': pd.to_datetime(date_int_stats['max']),
})
# Whole days between the earliest and the latest date of each group.
res['Date_range'] = (res['Date_max'] - res['Date_min']).dt.days

In [None]:
res

In [None]:
#Remember that earlier we eliminated rows which have invalid/incomplete dates.
#This procedure results in some groups having only 1 date. We denote these groups as invalid_groups below
#Why invalid? Because when we execute res[res['Date_range'] == 0], we want to find groups which actually
#have DUPLICATE (AND still VALID) dates. Having only 1 date obviously results in range = 0 too, so we want
#to eliminate these groups. 
res[res['Date_range'] == 0]

In [None]:
#invalid_groups are the groups left with fewer than two usable dates after the incomplete dates were
#dropped: their Date_range of 0 does not come from genuinely identical dates.
#They are derived from group_date instead of being typed in by hand (the list used to be hard-coded from
#one particular run), so the cell stays correct when the input data changes.
dates_per_group = group_date.groupby('Group').size()
invalid_groups = list(dates_per_group[dates_per_group < 2].index)
#all_groups is a list of all groups WITHOUT REPETITION
all_groups = list(np.unique(np.array(group_date['Group'])))
#valid_groups are the desired groups, i.e. all_groups without the ones listed in invalid_groups.
#(A comprehension variable is used so the `group` list built earlier is not overwritten.)
valid_groups = [group_number for group_number in all_groups if group_number not in invalid_groups]
#Remove the rows of the invalid groups from group_date as well.
group_date = group_date[~group_date['Group'].isin(invalid_groups)]

res = pd.DataFrame(res, index = valid_groups)
res.to_csv("statistics summary date creation.csv")
group_date.to_csv("group_date.csv")

In [None]:
res

In [None]:
res[res['Date_range'] == 0] #We want to try to investigate these groups later on

In [None]:
# Spread Date_int into one column per group (index=None keeps group_date's own row index),
# so that each valid group gets its own box in the plot.
new = group_date.pivot(values = 'Date_int', index = None, columns = 'Group')
#new
boxplot = new.boxplot(column = valid_groups, figsize = (30,10))
boxplot

In [None]:
# Scatter plot of every remaining date against its group number.
ax1 = group_date.plot.scatter(x='Group', y='Date',c='Blue', figsize = (20,10))
ax1

# <h2> Further Investigation for Groups with Member Size > Threshold </h2>

In [None]:
#Build (size, group) tuples, where size is the number of problems in a duplicate group and group is its
#group number. Only groups larger than threshold that are also in valid_groups are kept.

threshold = 1

member_sorted = []
for fen_full_key, id_list in dup_fen_full_to_ids.items():
    member_count = len(id_list)
    if member_count <= threshold:
        continue
    # Row of the first problem with this (FEN, FULL); column 1 of temp is read as its group number.
    first_row = dup_fen_full_list.index(fen_full_key)
    group_number = temp.iloc[first_row, 1]
    if group_number in valid_groups:
        member_sorted.append((member_count, group_number))

# Largest groups first.
member_sorted = sorted(member_sorted, reverse=True)
member_sorted

In [None]:
#Split member_sorted into two parallel lists: the group numbers and the matching group sizes.
group_large_member_sorted = [group_number for _, group_number in member_sorted]
size_large_member_sorted = [member_count for member_count, _ in member_sorted]

In [None]:
# Same box plot as before, restricted to the large groups and ordered by group size.
boxplot_large_member = new.boxplot(column = group_large_member_sorted, figsize = (30,10))
boxplot_large_member

In [None]:
#Two lookups keyed by group number: the standard deviation of the group's dates in seconds,
#and the same standard deviation in whole days.
group_list = list(res.index)
# Column 1 of res is the per-group standard deviation.
std_per_group = res.iloc[:, 1]
group_to_std_seconds = {}
group_to_std_days = {}
for position, group_number in enumerate(group_list):
    date_std = std_per_group.iloc[position]
    group_to_std_seconds[group_number] = date_std.total_seconds()
    group_to_std_days[group_number] = date_std.days

In [None]:
# One row per large group: its member count plus the date standard deviation in seconds and in days.
data_large_member = pd.DataFrame({'Group': group_large_member_sorted, 'Member Size': size_large_member_sorted})
# dict.get is applied element-wise to look up each group's precomputed standard deviation.
data_large_member['Std in Seconds'] = np.vectorize(group_to_std_seconds.get)(list(data_large_member['Group']))
data_large_member['Std in Days'] = np.vectorize(group_to_std_days.get)(list(data_large_member['Group']))
data_large_member.to_csv("data_large_member.csv", index = False)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Bar chart of the member count per group; bars follow the order of group_large_member_sorted.
plt.figure(figsize=(20,10))
sns.set(style="whitegrid")
ax = sns.barplot(x="Group", y="Member Size", order = group_large_member_sorted, data=data_large_member)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Bar chart of the date standard deviation in seconds per group, same bar order as the size chart.
plt.figure(figsize=(20,10))
sns.set(style="whitegrid")
ax = sns.barplot(x="Group", y="Std in Seconds", order = group_large_member_sorted, data=data_large_member)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Bar chart of the date standard deviation in days per group, same bar order as the size chart.
plt.figure(figsize=(20,10))
sns.set(style="whitegrid")
ax = sns.barplot(x="Group", y="Std in Days", order = group_large_member_sorted, data=data_large_member)

# <h2> Finding Pearson Correlation Between Standard Deviation and Cosine Distances (ONLY Successful Attempts) </h2>

In [None]:
# Rows of the duplicates table that belong to the large, valid groups, and their problem ids.
df = temp[temp['Group'].isin(group_large_member_sorted)]
problem_id_dup = df['tactics_problem_id']

In [None]:
# Load the puzzle embedding: space-separated text, first line skipped, column 0 holds the puzzle id.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
embedding = pd.read_csv("/h/224/stevenhl/chess_dim_skill/src/word2vecf/vectors/size_alpha/vecs_0.16_265.txt", sep=' ', header=None, skiprows=1)
# Drop the last column (presumably an empty column caused by a trailing separator — TODO confirm).
embedding = embedding.drop(len(embedding.columns)-1,axis=1)
# Puzzle ids in file order, kept as a plain list for later membership checks.
problem_id = list(embedding[0])
embedding.rename(columns={0:'puzzle_id'}, inplace=True)
embedding.set_index('puzzle_id',inplace=True)
# Scale every row to unit L2 norm.
embedding = embedding.divide(np.linalg.norm(embedding, axis=1), axis=0)

In [None]:
# Keep only the duplicated problems that actually have an embedding vector.
# A set gives constant-time membership tests; scanning the problem_id list for every
# problem made this step quadratic.
embedded_ids = set(problem_id)
problem_filtered = [problem for problem in problem_id_dup if problem in embedded_ids]
df = df[df['tactics_problem_id'].isin(problem_filtered)]
df

In [None]:
import sklearn.metrics 
def generate_matrix(sampled_problems):
    """Build the pairwise cosine-distance table for the given puzzle ids.

    The ids are looked up in the global `embedding` frame. The result is a square
    DataFrame whose rows and columns are labelled by `sampled_problems` and whose
    entries are 1 - cosine similarity, rounded to 3 decimals.
    """
    selected_vectors = embedding.loc[sampled_problems]
    similarity = sklearn.metrics.pairwise.cosine_similarity(selected_vectors)
    # Rounding flushes tiny floating-point residue to 0.
    distance = (1.0 - similarity).round(3)
    return pd.DataFrame(distance, index=sampled_problems, columns=sampled_problems)

table = generate_matrix(problem_filtered)
table

In [None]:
# Map every group number present in df to the list of its problem ids (in df row order).
all_groups = list(np.unique(np.array(df['Group'])))
group_to_id_dup = {group_number: [] for group_number in all_groups}

r, c = df.shape
# Column 0 of df is read as the problem id and column 1 as the group number.
id_values = df.iloc[:, 0].to_numpy()
group_values = df.iloc[:, 1].to_numpy()
for problem, group_number in zip(id_values, group_values):
    group_to_id_dup[group_number].append(problem)

group_to_id_dup

In [None]:
# Groups that still have more than `threshold` members after the embedding filter,
# plus the list of their group numbers in the same order.
new_group_to_id_dup = {
    group_number: members
    for group_number, members in group_to_id_dup.items()
    if len(members) > threshold
}
new_all_groups = list(new_group_to_id_dup)

In [None]:
# Mean cosine distance over all unordered pairs of problems inside each group.
avg_cos_dist = []
for problem_list in new_group_to_id_dup.values():
    distance_total = 0
    pair_count = 0
    for first_pos, first_id in enumerate(problem_list):
        # Pair each problem only with the ones after it, so every pair is counted once.
        for second_id in problem_list[first_pos + 1:]:
            distance_total += table.loc[first_id, second_id]
            pair_count += 1
    avg_cos_dist.append(distance_total / pair_count)

In [None]:
new_group_to_id_dup

In [None]:
# Date standard deviation (whole days) of every remaining group, in new_all_groups order.
# Groups whose standard deviation is 0 days are printed for inspection.
std_list = []
for group in new_all_groups:
    spread_days = group_to_std_days[group]
    if spread_days == 0:
        print(group)
    std_list.append(spread_days)

In [None]:
new_all_groups

In [None]:
std_list

In [None]:
# Pearson correlation between each group's date standard deviation (days) and its mean pairwise cosine distance.
corr = np.corrcoef(std_list,avg_cos_dist)
corr[0,1]

# <h2> Finding Pearson Correlation Between Dates Difference and Cosine Distances (ONLY Successful Attempts) </h2>

In [None]:
# Lookup from problem id to its raw Date text.
id_date = df[['tactics_problem_id','Date']]

r, c = id_date.shape
id_to_date = dict(zip(id_date.iloc[:, 0].to_numpy(), id_date.iloc[:, 1].to_numpy()))

In [None]:
import datetime
def diff_days(date1,date2):
    """Return the absolute number of days between two 'YYYY.MM.DD' date strings.

    Leading and trailing whitespace is ignored, so both ' 2008.01.01' (the form
    with a leading space) and '2008.01.01' are accepted. Raises ValueError when
    either string is not a complete date.
    """
    first = datetime.datetime.strptime(date1.strip(), "%Y.%m.%d").date()
    second = datetime.datetime.strptime(date2.strip(), "%Y.%m.%d").date()
    return abs((first - second).days)

def check(date1,date2):
    """Return True only when both values are complete 'YYYY.MM.DD' dates.

    A value is complete when it has exactly three dot-separated parts and each part,
    after stripping whitespace, consists of digits only. Placeholders such as '????'
    or '??' (with or without a leading space) and non-date values such as 'N/A' or
    NaN therefore give False instead of slipping through or raising.
    """
    def _is_complete(raw):
        parts = [part.strip() for part in str(raw).split(".")]
        return len(parts) == 3 and all(part.isdigit() for part in parts)

    return _is_complete(date1) and _is_complete(date2)

In [None]:
# For every group: the mean date difference (days) and the mean cosine distance over all pairs of
# problems whose dates are both complete. The *_complete lists hold the same two quantities for
# every individual pair, across all groups.
diff_date_avg = []
cos_dist_avg = []
diff_date_complete = []
cos_dist_complete = []
for group in new_group_to_id_dup:
    problem_list = new_group_to_id_dup[group]
    diff = 0
    count = 0
    cos_dist = 0
    for i in range(len(problem_list)):
        for j in range(i+1, len(problem_list)):
            date_i = id_to_date[problem_list[i]]
            date_j = id_to_date[problem_list[j]]
            if not check(date_i, date_j):
                continue
            # Compute each quantity once and reuse it for the sum and the per-pair list.
            days_apart = diff_days(date_i, date_j)
            pair_distance = table.loc[problem_list[i], problem_list[j]]
            diff += days_apart
            cos_dist += pair_distance
            diff_date_complete.append(days_apart)
            cos_dist_complete.append(pair_distance)
            count += 1
    if count == 0:
        # No pair in this group has two complete dates: there is nothing to average.
        # Skipping keeps both lists aligned and avoids a division by zero.
        continue
    diff_date_avg.append(diff/count)
    cos_dist_avg.append(cos_dist/count)

In [None]:
# Pearson correlation of the per-group averages: date difference vs. cosine distance.
corr = np.corrcoef(diff_date_avg,cos_dist_avg)
corr[0,1]

In [None]:
# Pearson correlation over every individual pair: date difference vs. cosine distance.
corr = np.corrcoef(diff_date_complete,cos_dist_complete)
corr[0,1]

# <h2> Finding Pearson Correlation Between Rating and Cosine Distances (ONLY Successful Attempts) </h2>

In [None]:
# Rows of the large groups with just the id, group and rating columns.
in_large_group = temp['Group'].isin(group_large_member_sorted)
df = temp.loc[in_large_group, ['tactics_problem_id','Group','rating']]

# Lookup from problem id (column 0) to rating (column 2).
r, c = df.shape
id_to_rating = dict(zip(df.iloc[:, 0].to_numpy(), df.iloc[:, 2].to_numpy()))

In [None]:
# Per group: mean absolute rating difference and mean cosine distance over all unordered pairs.
# The *_complete lists collect the same two quantities for every individual pair.
diff_rating_avg = []
cos_dist_avg = []
diff_rating_complete = []
cos_dist_complete = []
for problem_list in new_group_to_id_dup.values():
    rating_gap_total = 0
    distance_total = 0
    pair_count = 0
    for first_pos, first_id in enumerate(problem_list):
        for second_id in problem_list[first_pos + 1:]:
            rating_gap = abs(id_to_rating[first_id] - id_to_rating[second_id])
            pair_distance = table.loc[first_id, second_id]
            rating_gap_total += rating_gap
            distance_total += pair_distance
            diff_rating_complete.append(rating_gap)
            cos_dist_complete.append(pair_distance)
            pair_count += 1
    diff_rating_avg.append(rating_gap_total / pair_count)
    cos_dist_avg.append(distance_total / pair_count)

In [None]:
# Pearson correlation of the per-group averages: rating difference vs. cosine distance.
corr = np.corrcoef(diff_rating_avg,cos_dist_avg)
corr[0,1]

In [None]:
# Pearson correlation over every individual pair: rating difference vs. cosine distance.
corr = np.corrcoef(diff_rating_complete,cos_dist_complete)
corr[0,1]

# <h2> Finding Pearson Correlation Between Rd and Cosine Distances (ONLY Successful Attempts) </h2>

In [None]:
# Rows of the large groups with just the id, group and rd columns.
in_large_group = temp['Group'].isin(group_large_member_sorted)
df = temp.loc[in_large_group, ['tactics_problem_id','Group','rd']]

# Lookup from problem id (column 0) to rd (column 2).
r, c = df.shape
id_to_rd = dict(zip(df.iloc[:, 0].to_numpy(), df.iloc[:, 2].to_numpy()))

In [None]:
# Per group: mean absolute rd difference and mean cosine distance over all unordered pairs.
# The *_complete lists collect the same two quantities for every individual pair.
diff_rd_avg = []
cos_dist_avg = []
diff_rd_complete = []
cos_dist_complete = []
for problem_list in new_group_to_id_dup.values():
    rd_gap_total = 0
    distance_total = 0
    pair_count = 0
    for first_pos, first_id in enumerate(problem_list):
        for second_id in problem_list[first_pos + 1:]:
            rd_gap = abs(id_to_rd[first_id] - id_to_rd[second_id])
            pair_distance = table.loc[first_id, second_id]
            rd_gap_total += rd_gap
            distance_total += pair_distance
            diff_rd_complete.append(rd_gap)
            cos_dist_complete.append(pair_distance)
            pair_count += 1
    diff_rd_avg.append(rd_gap_total / pair_count)
    cos_dist_avg.append(distance_total / pair_count)

In [None]:
# Pearson correlation of the per-group averages: rd difference vs. cosine distance.
corr = np.corrcoef(diff_rd_avg,cos_dist_avg)
corr[0,1]

In [None]:
# Pearson correlation over every individual pair: rd difference vs. cosine distance.
corr = np.corrcoef(diff_rd_complete,cos_dist_complete)
corr[0,1]

# <h2> Finding Pearson Correlation Between Attempt Count and Cosine Distances (ONLY Successful Attempts) </h2>

In [None]:
# Rows of the large groups with just the id, group and attempt_count columns.
in_large_group = temp['Group'].isin(group_large_member_sorted)
df = temp.loc[in_large_group, ['tactics_problem_id','Group','attempt_count']]

# Lookup from problem id (column 0) to attempt count (column 2).
r, c = df.shape
id_to_attempt = dict(zip(df.iloc[:, 0].to_numpy(), df.iloc[:, 2].to_numpy()))

In [None]:
# Per group: mean absolute attempt-count difference and mean cosine distance over all unordered pairs.
# The *_complete lists collect the same two quantities for every individual pair.
diff_attempt_avg = []
cos_dist_avg = []
diff_attempt_complete = []
cos_dist_complete = []
for problem_list in new_group_to_id_dup.values():
    attempt_gap_total = 0
    distance_total = 0
    pair_count = 0
    for first_pos, first_id in enumerate(problem_list):
        for second_id in problem_list[first_pos + 1:]:
            attempt_gap = abs(id_to_attempt[first_id] - id_to_attempt[second_id])
            pair_distance = table.loc[first_id, second_id]
            attempt_gap_total += attempt_gap
            distance_total += pair_distance
            diff_attempt_complete.append(attempt_gap)
            cos_dist_complete.append(pair_distance)
            pair_count += 1
    diff_attempt_avg.append(attempt_gap_total / pair_count)
    cos_dist_avg.append(distance_total / pair_count)

In [None]:
# Pearson correlation of the per-group averages: attempt-count difference vs. cosine distance.
corr = np.corrcoef(diff_attempt_avg,cos_dist_avg)
corr[0,1]

In [None]:
# Pearson correlation over every individual pair: attempt-count difference vs. cosine distance.
corr = np.corrcoef(diff_attempt_complete,cos_dist_complete)
corr[0,1]

# <h2> Finding Pearson Correlation Between Time and Cosine Distances (ONLY Successful Attempts) </h2>

In [None]:
# Rows of the large groups with just the id, group and average_seconds columns.
in_large_group = temp['Group'].isin(group_large_member_sorted)
df = temp.loc[in_large_group, ['tactics_problem_id','Group','average_seconds']]

# Lookup from problem id (column 0) to average solving time in seconds (column 2).
r, c = df.shape
id_to_seconds = dict(zip(df.iloc[:, 0].to_numpy(), df.iloc[:, 2].to_numpy()))

In [None]:
# Per group: mean absolute average_seconds difference and mean cosine distance over all unordered pairs.
# The *_complete lists collect the same two quantities for every individual pair.
diff_seconds_avg = []
cos_dist_avg = []
diff_seconds_complete = []
cos_dist_complete = []
for problem_list in new_group_to_id_dup.values():
    seconds_gap_total = 0
    distance_total = 0
    pair_count = 0
    for first_pos, first_id in enumerate(problem_list):
        for second_id in problem_list[first_pos + 1:]:
            seconds_gap = abs(id_to_seconds[first_id] - id_to_seconds[second_id])
            pair_distance = table.loc[first_id, second_id]
            seconds_gap_total += seconds_gap
            distance_total += pair_distance
            diff_seconds_complete.append(seconds_gap)
            cos_dist_complete.append(pair_distance)
            pair_count += 1
    diff_seconds_avg.append(seconds_gap_total / pair_count)
    cos_dist_avg.append(distance_total / pair_count)

In [None]:
# Pearson correlation of the per-group averages: average_seconds difference vs. cosine distance.
corr = np.corrcoef(diff_seconds_avg,cos_dist_avg)
corr[0,1]

In [None]:
# Pearson correlation over every individual pair: average_seconds difference vs. cosine distance.
corr = np.corrcoef(diff_seconds_complete,cos_dist_complete)
corr[0,1]

# <h2> Finding Pearson Correlation Between Standard Deviation and Cosine Distances (ALL attempts) </h2>

In [None]:
#Load Data
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
chess_data = pd.read_csv('/w/225/1/chess/tactics/glicko_user_tactics_problem.csv_00')

In [None]:
# Keep only the rows whose tactics_problem_id is one of the duplicated problems in problem_id_dup.
chess_data = chess_data[chess_data['tactics_problem_id'].isin(problem_id_dup)]

In [None]:
chess_data

In [None]:
def _write_space_separated(frame, path):
    """Write `frame` to `path` as UTF-8 text: space-separated, no header row, no index column."""
    frame.to_csv(path, sep=' ', index=False, header=False, encoding='utf-8')

#Vocabulary file for word2vecf: one row per puzzle with its number of distinct users.
#reset_index turns the puzzle id from the index into an ordinary column.
vocab_dat = (
    chess_data.groupby('tactics_problem_id')['user_hash']
    .nunique()
    .to_frame()
    .reset_index(level=0)
)

#Context file for word2vecf: one row per user with the number of distinct puzzles attempted.
context_dat = (
    chess_data.groupby('user_hash')['tactics_problem_id']
    .nunique()
    .to_frame()
    .reset_index(level=0)
)

_write_space_separated(vocab_dat, 'vocab_dat_new')
_write_space_separated(context_dat, 'context_dat_new')

#Training pairs: keep only the two needed columns, puzzle id first.
chess_data = chess_data[['tactics_problem_id','user_hash']]
_write_space_separated(chess_data, 'chess_data_new')

In [None]:
# The three files were written space-separated and WITHOUT a header row, so they must be read
# back the same way. With the read_csv defaults (comma separator, first row as header) each file
# collapses into a single column named after its first data row, and the later
# set_index('tactics_problem_id') fails.
# Column names mirror the frames that were written:
#   context_dat: user_hash, number of distinct puzzles (column tactics_problem_id)
#   vocab_dat:   tactics_problem_id, number of distinct users (column user_hash)
#   chess_data:  tactics_problem_id, user_hash
context_dat = pd.read_csv("context_dat_new", sep=' ', header=None,
                          names=['user_hash', 'tactics_problem_id'])
vocab_dat = pd.read_csv("vocab_dat_new", sep=' ', header=None,
                        names=['tactics_problem_id', 'user_hash'])
chess_data = pd.read_csv("chess_data_new", sep=' ', header=None,
                         names=['tactics_problem_id', 'user_hash'])

In [None]:
#Create a bunch of random pairs of puzzle ids drawn from the vocabulary index
pairs_to_sample = 1000000 # fewer rows remain: pairs of a puzzle with itself are removed below
# NOTE(review): no random seed is set, so the sampled pairs differ from run to run.
# NOTE(review): set_index replaces vocab_dat, so running this cell twice raises a KeyError.
vocab_dat = vocab_dat.set_index('tactics_problem_id')
pairs = np.random.choice(vocab_dat.index, (pairs_to_sample,2))
# Drop the rows where both entries are the same puzzle.
pairs = pairs[pairs[:, 0] != pairs[:, 1]]
print(pairs[:5])
print(pairs.shape)

In [None]:
#Join pairs with their user counts for each puzzle
pmi_table = pd.DataFrame(data=pairs,columns=["puz_a","puz_b"])
# First join: user count of puz_a; second join: user count of puz_b.
pmi_table = pd.merge(pmi_table,vocab_dat,how='left',left_on='puz_a',right_on='tactics_problem_id')
#mi_table.columns = ['puz_a','puz_b','users_a',]
pmi_table = pd.merge(pmi_table,vocab_dat,how='left',left_on='puz_b',right_on='tactics_problem_id')
# Renaming by position assumes the merged frame has exactly these four columns in this order.
pmi_table.columns = ['puz_a','puz_b','users_a','users_b']
pmi_table.head()

In [None]:
#Sort for speeding up finding counts of shared users later
#(get_shared_user binary-searches this column, so it must stay sorted by puzzle id)
chess_data.sort_values(by="tactics_problem_id",inplace=True)
chess_data.head()

In [None]:
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

#Calculate number of users that have completed both puzzles for each pair
import time
def get_shared_user(row):
    """Return how many users appear in the attempt rows of both puzzles of `row`.

    Relies on the global `chess_data` being sorted by tactics_problem_id: the first
    row of each puzzle is located by binary search, and the next users_a / users_b
    rows are taken as that puzzle's attempts. The two slices are inner-joined on
    user_hash and the number of joined rows is returned.

    Slower alternatives that were tried: filtering chess_data for both puzzles and
    counting duplicated user_hash values, or filtering each puzzle separately
    before merging.
    """
    # NOTE(review): the slice length is the number of DISTINCT users of the puzzle; this only
    # covers all of the puzzle's rows if each (puzzle, user) pair occurs once — confirm.
    puzzle_ids = chess_data['tactics_problem_id']
    first_a = np.searchsorted(puzzle_ids, row['puz_a'])
    first_b = np.searchsorted(puzzle_ids, row['puz_b'])
    attempts_a = chess_data.iloc[first_a:first_a + row['users_a']]
    attempts_b = chess_data.iloc[first_b:first_b + row['users_b']]
    shared = pd.merge(attempts_a, attempts_b, how='inner', on='user_hash')
    return len(shared)

# Count the shared users for every sampled pair and print the elapsed wall-clock seconds.
start = time.time()
pmi_table['users_a_b'] = pmi_table.progress_apply(lambda row: get_shared_user(row),axis=1)
end = time.time()
print(end-start)

In [None]:
#Calculate pointwise mutual information for each pair
def calc_pmi(row,vocab_len):
    """Return log(users_a_b) + vocab_len - log(users_a) - log(users_b) for one pair.

    `row` must provide 'users_a_b', 'users_a' and 'users_b'. `vocab_len` is added
    as-is, so the caller passes the value already in log space. A pair with no
    shared users yields 0.
    """
    shared_users = row['users_a_b']
    if shared_users == 0:
        return 0
    log_shared = np.log(shared_users)
    return log_shared + vocab_len - np.log(row['users_a']) - np.log(row['users_b'])

# vocab_len holds the natural log of the number of puzzles in the vocabulary, not the raw count.
vocab_len = np.log(len(vocab_dat.index))
pmi_table['pmi'] = pmi_table.apply(lambda row: calc_pmi(row,vocab_len),axis=1)
#Filter out negative values to get Positive PMI
#pmi_table = pmi_table[pmi_table['pmi'] > 0]
pmi_table.head()

In [None]:
# Print every PMI value, one per line.
# NOTE(review): this prints one line per sampled pair (up to pairs_to_sample lines) — a summary
# such as pmi_table['pmi'].describe() may be more practical.
pmi = np.array(pmi_table['pmi'])
for val in pmi:
    print(val)

In [None]:
#Save the PMI table. The positive-only filter above is commented out, so despite the file name
#'ppmi_new' the saved values are plain PMI and may be negative.
pmi_table.to_csv('ppmi_new', encoding='utf-8')

# <h2> Date, Event, Site </h2>

In [None]:
#These are the groups that we need to investigate further based on the previous result
# NOTE(review): group numbers are typed in by hand and are only valid for the run that produced them.
groups_to_investigate = [7,145,163,196,206] 

In [None]:
# Date, Event and Site of every problem in the groups under investigation.
new_data = temp[['Group','Date','Event','Site']]
new_data = new_data[new_data['Group'].isin(groups_to_investigate)]
new_data

In [None]:
# Problem id, group and tags for groups 7, 145 and 163 (a subset of groups_to_investigate).
x = temp[temp['Group'].isin([7,145,163])]
x = x[['tactics_problem_id', 'Group', 'tags']]
x

# <h2> MRR for Exact Duplicates </h2>

In [None]:
# Load the puzzle embedding again, this time into `temp`.
# NOTE(review): this overwrites the duplicates table previously stored in `temp`; earlier cells
# that read temp['Group'] etc. cannot be re-run after this point without reloading it.
#temp = pd.read_csv('embeddings/250_0.16_vectors.tsv', sep=' ', header=None, skiprows=1)
temp = pd.read_csv("/h/224/stevenhl/chess_dim_skill/src/word2vecf/vectors/size_alpha/vecs_0.16_265.txt", sep=' ', header=None, skiprows=1)
# Drop the last column (presumably an empty column caused by a trailing separator — TODO confirm).
temp = temp.drop(len(temp.columns)-1,axis=1)
problem_id = list(temp[0])
temp.rename(columns={0:'puzzle_id'}, inplace=True)
puzzle_id = list(temp['puzzle_id'])
temp.set_index('puzzle_id',inplace=True)
# Scale every row to unit L2 norm.
temp = temp.divide(np.linalg.norm(temp, axis=1), axis=0)

In [None]:
#Sample tags and generate distance matrix of those tags
import sklearn.metrics
def generate_matrix(sampled_problems):
    """Build the pairwise cosine-distance table for the given puzzle ids.

    The ids are looked up in the global `temp` frame (the normalised embedding at
    this point of the notebook). The result is a square DataFrame whose rows and
    columns are labelled by `sampled_problems` and whose entries are
    1 - cosine similarity, rounded to 3 decimals.
    """
    selected_vectors = temp.loc[sampled_problems]
    similarity = sklearn.metrics.pairwise.cosine_similarity(selected_vectors)
    # Rounding flushes tiny floating-point residue to 0.
    distance = (1.0 - similarity).round(3)
    return pd.DataFrame(distance, index=sampled_problems, columns=sampled_problems)

In [None]:
# Full distance table over every puzzle in the embedding (one row and one column per puzzle id).
table = generate_matrix(puzzle_id)

In [None]:
#Create a new table which is a column subset of table: one column per ID in target_problems (the problems
#that have at least one exact duplicate). The rows stay exactly the same as in table, since otherwise
#the ranking won't be accurate. A target problem that is not a column of table gets an all-NaN column.

new_table = pd.DataFrame(table, columns = target_problems)
new_table

In [None]:
r,c = new_table.shape

#temp is a matrix which is similar to new_table, except that each entry is a tuple of distance and its corresponding
#problem ID. The ordering for the tuple is (distance, problem_ID).
temp = np.empty([r,c], dtype = object)

# Pull the distances out once as a plain array: a scalar .iloc lookup for each of the
# r*c cells is far slower and yields the same values.
distances = new_table.to_numpy()

for i in tqdm(range(r)):
    row_problem = problem_id[i]
    for j in range(c):
        temp[i, j] = (distances[i, j], row_problem)

In [None]:
# Sort every column independently (axis 0). Tuples compare by distance first, then by problem id,
# so row j of a column holds the (j+1)-th nearest puzzle to that column's problem.
temp1 = np.sort(temp, axis = 0)

In [None]:
# For every ordered duplicate pair (current, target): find the rank of target among all puzzles
# sorted by cosine distance to current, and record the rank and its reciprocal.
r,c = new_table.shape
reciprocal_rank_fen_full = []
rank_fen_full = []

# Column position of each target problem (first occurrence, as list.index would return),
# built once instead of scanning target_problems for every pair.
column_of = {}
for position, problem in enumerate(target_problems):
    column_of.setdefault(problem, position)

for i in tqdm(range(len(duplicate_fen_full_id_to_id))):
    current_id, target_id = duplicate_fen_full_id_to_id[i] #left ID is the current, right ID is the target
    pos = column_of[current_id] #find the column position
    # A problem that is missing from the embedding has an all-NaN column and cannot be ranked.
    # (The previous test compared the float with the string 'nan', which is never equal,
    # so such columns were not skipped.)
    if pd.isna(temp1[0][pos][0]):
        continue
    # NOTE(review): the current problem itself is one of the rows (distance 0), so a duplicate
    # can normally reach rank 2 at best — confirm whether self should be excluded.
    for j in range(r):
        if target_id == temp1[j][pos][1]:
            rank_fen_full.append(j+1)
            reciprocal_rank_fen_full.append(1./(j+1))

In [None]:
# Mean reciprocal rank over all ranked duplicate pairs.
# NOTE(review): raises ZeroDivisionError if no pair could be ranked.
mean_reciprocal_rank_fen_full = sum(reciprocal_rank_fen_full) / len(reciprocal_rank_fen_full)
mean_reciprocal_rank_fen_full

In [None]:
import matplotlib.pyplot as plt
# Distribution of the reciprocal ranks of the duplicate pairs.
plt.hist(reciprocal_rank_fen_full, bins = 100)

In [None]:
# Distribution of the raw ranks of the duplicate pairs.
plt.hist(rank_fen_full, bins = 1000)