In [2]:
import requests
import json
import time
import os
import shutil
import zipfile

`MAX_NPS` and `MAX_BPM` filter out challenge maps that spam notes and therefore have an abnormally high NPS (notes per second) or BPM. `MIN_NPS` filters out slow accuracy maps, which may contain strange patterns that are only hittable at low speeds.
`MIN_DURATION_SECONDS` and `MAX_DURATION_SECONDS` filter out very short or low-effort maps, as well as the extremely long movie soundtracks that occasionally pop up.
`MIN_RATING` and `MIN_UPVOTES` make sure the maps are actually well liked by players.


In [3]:
# Search window, sent to the BeatSaver search as the `from` / initial `to` parameters.
START_DATE = '2021-01-01'
END_DATE = '2023-09-01'
# NPS bounds, sent as `maxNps` / `minNps`.
MAX_NPS = 13
MIN_NPS = 4
# BPM ceiling, sent as `maxBpm`.
MAX_BPM = 350
# Song length bounds in seconds, sent as `minDuration` / `maxDuration`.
MIN_DURATION_SECONDS = 60
MAX_DURATION_SECONDS = 360
# Rating floor, sent as `minRating`.
MIN_RATING = 0.7
# Upvote floor; not part of the search query, it is checked locally in pruneMaps.
MIN_UPVOTES = 40

Functions to search through and select high quality maps

In [18]:
# set max_requests to -1 to search until all maps are exhausted
def generateDataset(max_requests=1):
    """Page through the BeatSaver text search and collect the maps that pass pruneMaps.

    Each request asks for maps created between START_DATE and a moving `to`
    date (initially END_DATE), sorted latest first. After every page, `to` is
    moved back to the `createdAt` of the last map returned so that the next
    request continues from there.

    NOTE(review): if the API treats `to` as inclusive, the map on the page
    boundary may be returned (and pruned) twice - confirm against the API.

    Args:
        max_requests: Maximum number of search requests to send. Any value
            that the request counter never reaches (e.g. -1) means "keep
            going until the search is exhausted".

    Returns:
        A list of [map id, download URL] pairs, as produced by pruneMaps.

    Raises:
        requests.HTTPError: If the search endpoint answers with an error status.
    """
    search_url = 'https://api.beatsaver.com/search/text/0'
    page_size = 20  # Beatsaver sends maps in batches of 20

    end = END_DATE
    num_requests = 0
    pruned_maps = []
    while num_requests != max_requests:
        maps_result = requests.get(search_url, params={
            'from': START_DATE,
            'maxBpm': MAX_BPM,
            'maxDuration': MAX_DURATION_SECONDS,
            'maxNps': MAX_NPS,
            'minDuration': MIN_DURATION_SECONDS,
            'minNps': MIN_NPS,
            'minRating': MIN_RATING,
            'noodle': 'false',
            'sortOrder': 'Latest',
            'to': end,
        })
        num_requests += 1
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        maps_result.raise_for_status()
        docs = maps_result.json()['docs']

        # An empty page means the search is exhausted; indexing docs[-1]
        # below would otherwise raise IndexError.
        if not docs:
            break

        pruned_maps += pruneMaps(docs)
        end = docs[-1]['createdAt']

        # A short page is the last page.
        if len(docs) < page_size:
            break
    return pruned_maps

def pruneMaps(maps_json):
    """Keep only the maps that have enough upvotes and need no unsupported features.

    For every map with at least MIN_UPVOTES upvotes, the BeatLeader
    leaderboard entry for the hash of its first listed version is fetched.
    The map is rejected if any Standard difficulty requires Mapping
    Extensions, Noodle Extensions, or V3 notes.

    Args:
        maps_json: Iterable of map documents from the BeatSaver search ('docs').

    Returns:
        A list of [map id, download URL] pairs for the maps that were kept.
    """
    # Requirement bits for Mapping Extensions, Noodle Extensions and V3 notes.
    unsupported_mask = 0b101100
    standard_mode = 1

    kept = []
    for entry in maps_json:
        # Min Upvotes check
        if entry['stats']['upvotes'] < MIN_UPVOTES:
            continue

        first_version = entry['versions'][0]
        response = requests.get('https://api.beatleader.xyz/leaderboards/hash/' + first_version['hash'])
        beatleader_info = json.loads(response.text)
        time.sleep(0.1)  # Precaution to not exceed maximum requests/second

        needs_unsupported_feature = any(
            difficulty['mode'] == standard_mode
            and difficulty['requirements'] & unsupported_mask != 0
            for difficulty in beatleader_info['song']['difficulties']
        )
        if not needs_unsupported_feature:
            kept.append([entry['id'], first_version['downloadURL']])
    return kept

Generating dataset text file

In [None]:
# max_requests=-1 keeps paging until the search is exhausted; every candidate map
# with enough upvotes also triggers one BeatLeader request inside pruneMaps.
maps = generateDataset(max_requests=-1)
print(len(maps))

In [19]:
# Save the [id, downloadURL] pairs to disk so the download cells can reload them.
# The `with` block closes the file even if serialization fails.
with open('dataset_ids_urls.txt', 'w') as dataset_txt:
    json.dump(maps, dataset_txt, indent=4)

Functions to download all Standard mode .dat files into `directory`

In [5]:
# Output folder for the extracted .dat files; it also holds log.txt and the
# temporary dlfolder used while downloading.
directory = 'BeatSaberMapsDataset'

In [12]:
def downloadDataset(map_ids_urls):
    """Download every map and move its Standard difficulty files into `directory`.

    For each (map id, download URL) pair the previous map's temporary files
    are cleared, the archive is downloaded and extracted into the temporary
    `dlfolder`, and the Standard .dat files are moved into `directory`.
    Maps that fail to download are recorded in `log.txt` inside `directory`.
    The temporary folder is removed once all maps have been processed.

    Args:
        map_ids_urls: Iterable of (map id, download URL) pairs.
    """
    # Initializing directories and log
    dl_folder = os.path.join(directory, 'dlfolder')
    os.makedirs(dl_folder, exist_ok=True)  # also creates `directory` if needed

    # `with` guarantees the log is flushed and closed even if a map raises.
    with open(os.path.join(directory, 'log.txt'), 'w') as log:
        for map_id, url in map_ids_urls:
            deleteMap()
            downloaded = downloadMap(url)
            time.sleep(0.1)
            if not downloaded:
                log.write('Map ' + str(map_id) + ' failed to download\n')
                continue
            extractStandardDiffs(map_id)

    shutil.rmtree(dl_folder)

def deleteMap():
    """Remove the previously downloaded archive and its extracted folder, if present.

    The paths are built with os.path.join so they always point inside
    `directory`/dlfolder; plain string concatenation without a separator
    would produce a path that never exists and make this function a no-op.
    """
    dl_folder = os.path.join(directory, 'dlfolder')
    zipped = os.path.join(dl_folder, 'zipped')
    current_map = os.path.join(dl_folder, 'currentMap')

    if os.path.exists(zipped):
        os.remove(zipped)
    if os.path.exists(current_map):
        shutil.rmtree(current_map)

def downloadMap(url):
    """Download a map archive and extract it into the temporary download folder.

    The response body is written to `directory`/dlfolder/zipped and extracted
    into `directory`/dlfolder/currentMap.

    Args:
        url: Download URL of the map archive.

    Returns:
        True if the archive was downloaded and extracted, False if the
        request failed or the downloaded file could not be extracted.
    """
    dl_folder = os.path.join(directory, 'dlfolder')
    zipped = os.path.join(dl_folder, 'zipped')

    try:
        r = requests.get(url, allow_redirects=True)
    except requests.exceptions.RequestException:
        # A network failure on one map should not abort the whole run.
        return False

    with open(zipped, 'wb') as zip_file:
        zip_file.write(r.content)

    try:
        with zipfile.ZipFile(zipped, 'r') as zip_ref:
            zip_ref.extractall(os.path.join(dl_folder, 'currentMap'))
    except Exception:
        # Best effort: any extraction problem (e.g. the response was not a
        # zip file) counts as a failed download. Unlike a bare `except`,
        # this still lets KeyboardInterrupt through.
        return False
    return True

def extractStandardDiffs(map_id):
    """Move the extracted Standard difficulty files of the current map into `directory`.

    Each `<Difficulty>Standard.dat` found in `directory`/dlfolder/currentMap
    is moved to `directory`/<map_id><suffix>.dat, where the suffix comes from
    the table below. Files that cannot be moved are skipped.

    Args:
        map_id: Map id string used as the prefix of the output file names.
    """
    # Difficulty file name -> suffix appended to the map id.
    # NOTE(review): presumably these match BeatLeader leaderboard id suffixes - confirm.
    leaderboardID_dict = {
        'EasyStandard' : '11',
        'NormalStandard' : '31',
        'HardStandard' : '51',
        'ExpertStandard' : '71',
        'ExpertPlusStandard' : '91'
    }
    current_map = os.path.join(directory, 'dlfolder', 'currentMap')

    for diff_name, suffix in leaderboardID_dict.items():
        source = os.path.join(current_map, diff_name + '.dat')
        if not os.path.exists(source):
            continue
        try:
            shutil.move(source, os.path.join(directory, map_id + suffix + '.dat'))
        except Exception:
            # Best effort: skip a file that cannot be moved, but do not
            # swallow KeyboardInterrupt/SystemExit like a bare `except` would.
            continue


Downloading dataset

In [7]:
# Reload the [id, downloadURL] pairs saved earlier; the `with` block closes
# the file even if the JSON cannot be parsed.
with open('dataset_ids_urls.txt', 'r') as dataset_txt:
    maps = json.load(dataset_txt)

In [11]:
# Download every map listed in `maps` and move its Standard difficulty files into `directory`.
downloadDataset(maps)