In [1]:
# Import libraries
import numpy as np
import heapq
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [2]:
# Constants
NUM_TOP_MAPS = 1024         # Number of popular maps to select
NUM_TOP_PLAYERS = 1024      # Number of prolific players to select
REPRESENTATION_SIZE = 11    # Size of skill/difficulty embedding
NUM_EPOCHS = 1000           # Number of epochs to train for
BATCH_SIZE = 10000          # Number of scores per batch

In [3]:
# Load top players and maps
scores = pd.read_csv('AllScoresExtracted.csv')
scores = scores.query("Modifiers.isnull()").copy(deep=True)
top_players = scores['PlayerId'].value_counts().iloc[:NUM_TOP_PLAYERS].index.tolist()
scores = scores.query('PlayerId in @top_players').copy(deep=True)
top_maps = scores['LeaderboardId'].value_counts().iloc[:NUM_TOP_MAPS].index.tolist()
scores = scores.query('LeaderboardId in @top_maps').copy(deep=True)
scores['LeaderboardId'], maps = pd.factorize(scores['LeaderboardId'])
scores['PlayerId'], users = pd.factorize(scores['PlayerId'])

In [4]:
# Select PlayerId, LeaderboardId as inputs, Accuracy as target
X_maps = []
X_users = []
y = []
for index, row in scores.iterrows():
    X_maps.append(row['LeaderboardId'])
    X_users.append(row['PlayerId'])
    y.append(row['Accuracy'])

In [5]:
# Convert to one-hot encoding
X_maps = tf.one_hot(X_maps, depth=NUM_TOP_MAPS)
X_users = tf.one_hot(X_users, depth=NUM_TOP_PLAYERS)
y = np.array(y)

In [6]:
# Define the model architecture
map_input = layers.Input(shape=(NUM_TOP_MAPS,))
map_embedding = layers.Dense(REPRESENTATION_SIZE)(map_input)
map_model = keras.Model(inputs=map_input, outputs=map_embedding)
map_model.compile(
    loss="mae",         # Use mse as the loss function
    optimizer="adam",   # Use adam as the optimizer
    metrics=["mae"]     # Use mae as the metric
)

map_input_2 = layers.Input(shape=(NUM_TOP_MAPS,))
map_embedding_2 = map_model(map_input_2)

user_input = layers.Input(shape=(NUM_TOP_PLAYERS,))
user_embedding = layers.Dense(REPRESENTATION_SIZE)(user_input)

output = layers.Dot(axes=1, normalize=False)([map_embedding_2, user_embedding])
output = layers.Dense(1, activation="linear")(output)
model = keras.Model(inputs=[map_input_2, user_input], outputs=output)

In [7]:
# Compile the model with loss and metrics
model.compile(
    loss="mae",         # Use mse as the loss function
    optimizer="adam",   # Use adam as the optimizer
    metrics=["mae"]     # Use mae as the metric
)

In [8]:
# # Train the model
# model.fit([X_maps, X_users], y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

In [9]:
# # Evaluate the model
# model.evaluate([X_maps, X_users], y, batch_size=BATCH_SIZE)[-1]

In [10]:
# # Save models
# model.save('saved_model')
# map_model.save('saved_map_model')

In [11]:
# Load models
model = tf.keras.models.load_model('saved_model')
map_model = tf.keras.models.load_model('saved_map_model')

In [12]:
# Evaluate map ranks
test_maps = tf.one_hot(range(NUM_TOP_MAPS), depth=NUM_TOP_MAPS)
test_ranks = map_model.predict(test_maps, batch_size=BATCH_SIZE)



In [13]:
# Normalize map ranks
test_ranks = [rank / np.linalg.norm(rank) for rank in test_ranks]

In [100]:
# Compute similarity
target = '184df71'
target_rank = test_ranks[list(maps).index(target)]

results = []
for i in range(len(maps)):
    map = maps[i]
    rank = test_ranks[i]
    results.append((map, np.dot(rank, target_rank)))
    # heapq.heappush(results, (np.dot(rank, target_rank), map))

results.sort(key=lambda el: el[1])

print('Target Map: https://www.beatleader.xyz/leaderboard/global/' + target)

print('\nMost Similar Maps:')
for i in range(2, 7):
    print(results[-i][1], ':', 'https://www.beatleader.xyz/leaderboard/global/' + results[-i][0])

print('\nLeast Similar Maps:')
for i in range(5):
    print(results[i][1], ':', 'https://www.beatleader.xyz/leaderboard/global/' + results[i][0])

print('\n')

Target Map: https://www.beatleader.xyz/leaderboard/global/184df71

Most Similar Maps:
0.99911207 : https://www.beatleader.xyz/leaderboard/global/1a5e471
0.99836046 : https://www.beatleader.xyz/leaderboard/global/15bd251
0.9983109 : https://www.beatleader.xyz/leaderboard/global/1ac0e51
0.99803126 : https://www.beatleader.xyz/leaderboard/global/1fc0891
0.9979923 : https://www.beatleader.xyz/leaderboard/global/12ab331

Least Similar Maps:
0.47196716 : https://www.beatleader.xyz/leaderboard/global/c32d91
0.48130992 : https://www.beatleader.xyz/leaderboard/global/2a25bx91
0.4830824 : https://www.beatleader.xyz/leaderboard/global/1385271
0.5226438 : https://www.beatleader.xyz/leaderboard/global/1e9991
0.5297132 : https://www.beatleader.xyz/leaderboard/global/1cc0f91




In [91]:
# Count most and least similar
# target = '11b4991'
# target_rank = test_ranks[list(maps).index(target)]

most_sim_counts = {}
least_sim_counts = {}

for id in maps:
    most_sim_counts[id] = 0
    least_sim_counts[id] = 0

for t_rank in test_ranks:
    results = []
    for i in range(len(maps)):
        map = maps[i]
        rank = test_ranks[i]
        results.append((map, np.dot(rank, t_rank)))

    results.sort(key=lambda el: el[1])

    for i in range(2, 7):
        most_sim_counts[results[-i][0]] += 1
    for i in range(5):
        least_sim_counts[results[i][0]] += 1

most_sim = sorted(most_sim_counts.items(), key=lambda el: el[1])
least_sim = sorted(least_sim_counts.items(), key=lambda el: el[1])

print('Most Similar Appearances:')
for i in range(1, 6):
    print(most_sim[-i][0] + ': ' + str(most_sim[-i][1]))

print()

print('Least Similar Appearances:')
for i in range(1, 6):
    print(least_sim[-i][0] + ': ' + str(least_sim[-i][1]))


Most Similar Appearances:
2185bxxxx71: 21
168de71: 18
1703f71: 18
1a32291: 17
20ad7x71: 16

Least Similar Appearances:
2a25bx91: 903
1385271: 900
c32d91: 786
1e9991: 572
d00c91: 521


In [121]:
# # Get mappers
# import requests
# import json

# def get_mapper(lid: str):
#     response = requests.get(f'https://api.beatleader.xyz/leaderboard/{lid}', {'accept': 'text/plain'})
#     return response.json()['song']['mapper']

# with_mappers = pd.DataFrame({'LeaderboardId': maps})
# with_mappers['mapper'] = with_mappers['LeaderboardId'].apply(get_mapper)

# with_mappers.to_csv('LID_and_mappers.csv', index=False)  # Save this so we don't have to spam requests
with_mappers = pd.read_csv('LID_and_mappers.csv')
with_mappers['test_ranks'] = test_ranks
with_mappers_no_collab = with_mappers[~with_mappers['mapper'].str.contains('(\&|\,|\svs\.)', regex=True)]   # Remove collab maps
with_mappers_no_collab


  with_mappers_no_collab = with_mappers[~with_mappers['mapper'].str.contains('(\&|\,|\svs\.)', regex=True)]   # Remove collab maps


Unnamed: 0,LeaderboardId,mapper,test_ranks
0,16e9791,altrewin,"[0.33214077, -0.3331743, -0.50072366, -0.22835..."
1,12bd251,cerret,"[0.23852026, -0.28436553, -0.22374626, -0.2110..."
2,d4a571,ETAN,"[0.20471643, -0.2148198, -0.27308458, -0.36335..."
3,ba4071,Rogdude,"[0.24801052, -0.286663, -0.2376006, -0.3112239..."
4,1821871,Timbo,"[0.38546827, -0.29871312, -0.26552692, -0.3438..."
...,...,...,...
1016,1fb19xxxxx91,Irish,"[0.4120143, -0.27789205, -0.38699707, -0.35438..."
1018,2e4bfx91,cerret,"[0.43736592, -0.36273715, -0.38078386, -0.1856..."
1019,28523xxxxxx91,Jabob,"[0.0322152, -0.34822726, -0.45311615, 0.018742..."
1021,1fd62xxx51,ani,"[0.20302303, -0.37476328, -0.19414923, -0.3580..."


In [122]:
# Significance test of random maps vs. same mapper
import random
from scipy.stats import ttest_ind

NUM_SAMPLES = 1000

pool_dist = []
mapper_dist = []

mappers = with_mappers_no_collab['mapper'].unique()

for i in range(NUM_SAMPLES):
    pool_sample = random.sample(test_ranks, 2)
    pool_dist.append(np.dot(pool_sample[0], pool_sample[1]))

    while True:
        # Ensure that the randomly chosen mapper has created >= 2 maps
        mapper_choice = random.choice(mappers)
        test_ranks_by_mapper = with_mappers_no_collab[with_mappers_no_collab['mapper'] == mapper_choice]['test_ranks'].tolist()
        if len(test_ranks_by_mapper) >= 2:
            break

    mapper_sample = random.sample(test_ranks_by_mapper, 2)
    mapper_dist.append(np.dot(mapper_sample[0], mapper_sample[1]))
    
ttest_result = ttest_ind(pool_dist, mapper_dist)

print(f'Mean dot product of 2 random maps: {np.mean(pool_dist)}')
print(f'SD of ^: {np.std(pool_dist)}')
print(f'Mean dot product of 2 maps from the same mapper: {np.mean(mapper_dist)}')
print(f'SD of ^: {np.std(mapper_dist)}')
print(f'Sample size: {NUM_SAMPLES}')
print(f't-value: {ttest_result.statistic}')
print(f'p-value: {ttest_result.pvalue}')

Mean dot product of 2 random maps: 0.8845562934875488
SD of ^: 0.09477957338094711
Mean dot product of 2 maps from the same mapper: 0.9043101072311401
SD of ^: 0.08980649709701538
Sample size: 1000
t-value: -4.781808801049192
p-value: 1.8641881363583812e-06


In [1]:
test_ranks

NameError: name 'test_ranks' is not defined