In [1]:
import requests
import json
import time
import os
import shutil
import zipfile
import pandas as pd
import warnings
# Turn every warning into an exception for the rest of the notebook.
warnings.filterwarnings('error')

`MAX_NPS` and `MAX_BPM` filter out challenge maps that spam notes and therefore have an abnormally high NPS or BPM. `MIN_NPS` filters out slow accuracy maps that may contain strange patterns which are only hittable at low speeds.
`MIN_DURATION_SECONDS` and `MAX_DURATION_SECONDS` filter out short maps, low-effort maps, and the extremely long movie soundtracks that occasionally pop up.
`MIN_RATING` and `MIN_UPVOTES` make sure the maps are actually well liked by players.


In [2]:
# Date window of the search; sent as the `from` / `to` query parameters.
START_DATE = '2021-01-01'
END_DATE = '2023-10-01'
# NPS bounds: the upper bound drops note-spam challenge maps,
# the lower bound drops slow accuracy maps.
MAX_NPS = 13
MIN_NPS = 4
# Upper BPM bound, also aimed at note-spam challenge maps.
MAX_BPM = 350
# Song length bounds in seconds (too short / low effort vs. very long soundtracks).
MIN_DURATION_SECONDS = 60
MAX_DURATION_SECONDS = 360
# Popularity thresholds: the rating is sent as the `minRating` query parameter,
# the upvote count is compared against each map's `stats.upvotes`.
MIN_RATING = 0.7
MIN_UPVOTES = 40

Functions to search through and select high quality maps

In [125]:
# set max_requests to -1 to search until all maps are exhausted
def generateDataset(max_requests=1):
    """Crawl the BeatSaver search API and return the maps that survive pruning.

    Results are requested newest-first. After each page the `to` bound of the
    query is moved back to the `createdAt` value of the last map received, so
    the next request continues where the previous one stopped.

    Args:
        max_requests: Maximum number of search requests to send. Use -1 to
            keep going until the search runs out of maps.

    Returns:
        A list of `[map id, download URL]` pairs, as produced by `pruneMaps`.

    Raises:
        requests.HTTPError: If the search endpoint answers with an error status.
    """
    page_size = 20  # Beatsaver sends maps in batches of 20
    base_params = {
        'from': START_DATE,
        'maxBpm': MAX_BPM,
        'maxDuration': MAX_DURATION_SECONDS,
        'maxNps': MAX_NPS,
        'minDuration': MIN_DURATION_SECONDS,
        'minNps': MIN_NPS,
        'minRating': MIN_RATING,
        'noodle': 'false',
        'sortOrder': 'Latest',
    }
    end = END_DATE
    num_requests = 0
    pruned_maps = []
    seen_ids = set()
    while num_requests != max_requests:
        # Let requests build and escape the query string instead of gluing it by hand.
        maps_result = requests.get('https://api.beatsaver.com/search/text/0',
                                   params=dict(base_params, to=end))
        maps_result.raise_for_status()
        num_requests += 1
        docs = maps_result.json()['docs']

        # The map that defined the new `to` bound may be returned again on the
        # next page, so never process an id twice. An empty page, or a page
        # without any new map, means the cursor cannot advance any further.
        new_docs = [doc for doc in docs if doc['id'] not in seen_ids]
        if not new_docs:
            break
        seen_ids.update(doc['id'] for doc in new_docs)

        pruned_maps += pruneMaps(new_docs)
        end = docs[-1]['createdAt']

        # A short page is the last page.
        if len(docs) < page_size:
            break
        print(len(pruned_maps))
    return pruned_maps

def pruneMaps(maps_json):
    """Keep the well-liked maps whose Standard difficulties need no unsupported mods.

    For every map with at least `MIN_UPVOTES` upvotes, the BeatLeader
    leaderboard entry of its first listed version is fetched and the
    requirement flags of its difficulties are inspected.

    Args:
        maps_json: The `docs` list of a BeatSaver search response.

    Returns:
        A list of `[map id, download URL]` pairs for the maps that passed.
        Maps whose BeatLeader data cannot be fetched or parsed are skipped
        (with a printed notice), because their requirements cannot be checked.
    """
    # Requirement bits for Mapping Extensions, Noodle Extensions, or V3 notes.
    unsupported_requirements = 0b101100

    pruned_maps = []
    for map_doc in maps_json:
        # Min Upvotes check
        if map_doc['stats']['upvotes'] < MIN_UPVOTES:
            continue

        version = map_doc['versions'][0]
        try:
            response = requests.get('https://api.beatleader.xyz/leaderboards/hash/' + version['hash'])
            difficulties = json.loads(response.text)['song']['difficulties']
        except (requests.RequestException, ValueError, KeyError, TypeError) as error:
            # One bad answer must not abort a crawl that can take hours.
            print('Skipping map', map_doc['id'], '- no usable BeatLeader data:', error)
            continue
        finally:
            time.sleep(0.1) # Precaution to not exceed maximum requests/second

        # Check that any Standard difficulty does not require Mapping Extensions, Noodle Extensions, or V3 notes.
        needs_unsupported_mod = any(
            diff['mode'] == 1 and diff['requirements'] & unsupported_requirements != 0
            for diff in difficulties
        )
        if not needs_unsupported_mod:
            pruned_maps.append([map_doc['id'], version['downloadURL']])
    return pruned_maps

Generating dataset text file

In [None]:
# -1 never equals the request counter, so the crawl runs until the search is exhausted.
maps = generateDataset(max_requests=-1)
# Number of maps that passed every filter.
print(len(maps))

In [127]:
# Persist the [id, downloadURL] pairs so the download step can run without repeating the crawl.
# The with-block closes the file even if serialization fails.
with open('dataset_ids_urls.txt', 'w') as dataset_txt:
    json.dump(maps, dataset_txt, indent=4)

Functions to download every map and convert its Standard-mode difficulty `.dat` files into CSV files inside `directory`

In [12]:
# Output folder for the generated CSV files; also holds the temporary download folder.
directory = 'BeatSaberMapsDataset'

In [14]:
def downloadDataset(map_ids_urls):
    """Download every listed map and export its Standard difficulties as CSV files.

    WARNING: an existing `directory` is deleted first, so any previously
    generated dataset is lost.

    Each map is downloaded into a temporary `dlfolder` inside `directory`,
    its BPM is read from the map's info file, and the difficulties are handed
    to `extractStandardDiffstoCSV`. Maps that cannot be downloaded, or whose
    info file is missing or unreadable, are skipped.

    Args:
        map_ids_urls: Iterable of `(map id, download URL)` pairs.
    """
    download_dir = os.path.join(directory, 'dlfolder')
    current_map_dir = os.path.join(download_dir, 'currentMap')

    # Initializing directories and log
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.mkdir(directory)
    os.mkdir(download_dir)

    for map_id, url in map_ids_urls:
        deleteMap()
        downloaded = downloadMap(url)
        time.sleep(0.1)
        if not downloaded:
            continue
        try:
            songBPM = _readSongBPM(current_map_dir)
        except (OSError, ValueError, KeyError, TypeError) as error:
            # A single broken map must not abort the whole download run.
            print('Skipping map', map_id, '- unreadable info.dat:', error)
            continue
        print('Extracting map: ', map_id)
        extractStandardDiffstoCSV(map_id, songBPM)
        deleteMap()
    shutil.rmtree(download_dir)


def _readSongBPM(map_dir):
    """Return `_beatsPerMinute` from the info file found in `map_dir`."""
    # Match the file name case-insensitively so the lookup behaves the same
    # on case-sensitive and case-insensitive file systems.
    for file_name in os.listdir(map_dir):
        if file_name.lower() == 'info.dat':
            with open(os.path.join(map_dir, file_name), 'r', encoding='utf-8') as info:
                return json.load(info)['_beatsPerMinute']
    raise FileNotFoundError('no info.dat in ' + map_dir)

def deleteMap():
    """Remove the downloaded archive and the extracted map folder, if they exist.

    Clearing the extracted folder between maps matters: difficulty files left
    behind by one map would otherwise be picked up again and exported under
    the id of the next map.
    """
    # The path separator between `directory` and the download folder was
    # missing before, so nothing was ever found or deleted.
    download_dir = os.path.join(directory, 'dlfolder')
    archive_path = os.path.join(download_dir, 'zipped')
    extracted_dir = os.path.join(download_dir, 'currentMap')

    if os.path.exists(archive_path):
        os.remove(archive_path)
    if os.path.exists(extracted_dir):
        shutil.rmtree(extracted_dir)

def downloadMap(url):
    """Download a map archive and unpack it into the temporary map folder.

    The archive is stored as `dlfolder/zipped` inside `directory` and
    extracted to `dlfolder/currentMap`.

    Args:
        url: Download URL of the zipped map.

    Returns:
        True if the archive was downloaded and extracted, False otherwise.
    """
    download_dir = os.path.join(directory, 'dlfolder')
    archive_path = os.path.join(download_dir, 'zipped')

    try:
        r = requests.get(url, allow_redirects=True)
    except requests.RequestException as error:
        print('Download failed for', url, '-', error)
        return False

    with open(archive_path, 'wb') as f:
        f.write(r.content)

    try:
        # The with-block closes the archive; no explicit close() is needed.
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall(os.path.join(download_dir, 'currentMap'))
    except Exception as error:
        # Still best-effort for any broken archive, but unlike a bare `except:`
        # this lets KeyboardInterrupt / SystemExit stop a long download run.
        print('Could not extract', url, '-', error)
        return False
    return True

def extractStandardDiffstoCSV(map_id, songBPM):
    """Export every Standard difficulty of the extracted map as a CSV of notes.

    Looks for `<Difficulty>Standard.dat` files in `dlfolder/currentMap` inside
    `directory` and writes one CSV per difficulty to `directory`, named
    `<map_id><suffix>.csv`. V2 and V3 difficulty files end up with the same
    column labels (`_time`, `_lineIndex`, `_lineLayer`, `_type`,
    `_cutDirection`) plus a computed `_timeInSeconds` column.

    Difficulties without notes or without a recognizable version key are
    skipped. Any error while converting a difficulty is appended to `log.txt`
    together with the CSV name, and that difficulty is skipped.

    Args:
        map_id: Map id used as prefix of the CSV file names.
        songBPM: Beats per minute used to convert beat times to seconds.
    """
    # Suffix appended to the map id for each difficulty.
    leaderboardID_dict = {
        'EasyStandard' : '11',
        'NormalStandard' : '31',
        'HardStandard' : '51',
        'ExpertStandard' : '71',
        'ExpertPlusStandard' : '91'
    }
    map_dir = os.path.join(directory, 'dlfolder', 'currentMap')
    for key, value in leaderboardID_dict.items():
        difficulty_path = os.path.join(map_dir, key + '.dat')
        if not os.path.exists(difficulty_path):
            continue
        try:
            with open(difficulty_path, 'r', encoding='utf-8') as difficulty_dat:
                difficultyJSON = json.load(difficulty_dat)

            # V3 files carry a `version` key, V2 files a `_version` key.
            if 'version' in difficultyJSON:
                note_data = _v3NotesToFrame(difficultyJSON)
            elif '_version' in difficultyJSON:
                note_data = _v2NotesToFrame(difficultyJSON)
            else:
                note_data = None
            if note_data is None:
                continue

            note_data['_timeInSeconds'] = note_data['_time'].apply(lambda x: x*60/songBPM)
            note_data.to_csv(os.path.join(directory, map_id + value + '.csv'), mode='w')
        except Exception as e:
            # Best effort: record the real error and move on to the next difficulty.
            with open('log.txt', 'a') as log:
                log.write(str(e) + ' ')
                log.write(map_id + value + '\n')


def _v2NotesToFrame(difficultyJSON):
    """Return the notes of a V2 difficulty as a DataFrame, or None if it has no notes."""
    # Lightshow maps mislabeled as playable difficulties have no notes at all.
    if len(difficultyJSON['_notes']) <= 0:
        return None
    note_data = pd.DataFrame(difficultyJSON['_notes'])
    if '_customData' in note_data.columns:
        note_data = note_data.drop('_customData', axis=1)
    return note_data


def _v3NotesToFrame(difficultyJSON):
    """Return notes and bombs of a V3 difficulty with V2 column labels, or None if it has no notes."""
    # Lightshow maps mislabeled as playable difficulties have no notes at all.
    if len(difficultyJSON['colorNotes']) <= 0:
        return None

    notes = pd.DataFrame(difficultyJSON['colorNotes']).drop('a', axis=1)
    # V3 difficulty files store bombs and notes separately
    if len(difficultyJSON['bombNotes']) > 0:
        bombs = pd.DataFrame(difficultyJSON['bombNotes'])
        # Bombs have no color or direction of their own; tag them with fixed
        # values so they fit the note columns (presumably the V2 bomb
        # encoding - TODO confirm).
        bombs['c'] = 3
        bombs['d'] = 8
        note_data = pd.merge(notes, bombs, how="outer", on=['b', 'x', 'y', 'c', 'd'])
        note_data = note_data.sort_values(by=['b']).reset_index(drop=True)
    else:
        note_data = notes

    # Standardize the V3 labels to the V2 ones. `d` used to be renamed to
    # `cutDirection` (no underscore), which did not match the V2 files.
    return note_data.rename(columns={
        'b': '_time',
        'x': '_lineIndex',
        'y': '_lineLayer',
        'c': '_type',
        'd': '_cutDirection',
    })

Downloading dataset

In [9]:
# Reload the [id, downloadURL] pairs saved by the crawl step.
# The with-block closes the file even if parsing fails.
with open('dataset_ids_urls.txt', 'r') as dataset_txt:
    maps = json.load(dataset_txt)

In [None]:
# Number of maps that are about to be downloaded.
print(len(maps))

In [None]:
# NOTE: downloadDataset deletes an existing `directory` before it starts downloading.
downloadDataset(maps)